This source file includes the following definitions.
- orte_plm_base_set_slots
- orte_plm_base_daemons_reported
- orte_plm_base_allocation_complete
- orte_plm_base_daemons_launched
- files_ready
- orte_plm_base_vm_ready
- orte_plm_base_mapping_complete
- orte_plm_base_setup_job
- orte_plm_base_setup_job_complete
- orte_plm_base_complete_setup
- timer_cb
- orte_plm_base_launch_apps
- orte_plm_base_send_launch_msg
- orte_plm_base_post_launch
- orte_plm_base_registered
- orte_plm_base_daemon_topology
- orte_plm_base_daemon_callback
- orte_plm_base_daemon_failed
- orte_plm_base_setup_orted_cmd
- orte_plm_base_orted_append_basic_args
- orte_plm_base_setup_virtual_machine
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 
  28 #include "orte_config.h"
  29 #include "orte/constants.h"
  30 
  31 #ifdef HAVE_SYS_WAIT_H
  32 #include <sys/wait.h>
  33 #endif
  34 #ifdef HAVE_SYS_TIME_H
  35 #include <sys/time.h>
  36 #endif  
  37 #include <ctype.h>
  38 
  39 #include "opal/hash_string.h"
  40 #include "opal/util/argv.h"
  41 #include "opal/util/opal_environ.h"
  42 #include "opal/util/printf.h"
  43 #include "opal/class/opal_pointer_array.h"
  44 #include "opal/dss/dss.h"
  45 #include "opal/mca/hwloc/hwloc-internal.h"
  46 #include "opal/mca/pmix/pmix.h"
  47 #include "opal/mca/compress/compress.h"
  48 
  49 #include "orte/util/dash_host/dash_host.h"
  50 #include "orte/util/nidmap.h"
  51 #include "orte/util/session_dir.h"
  52 #include "orte/util/show_help.h"
  53 #include "orte/mca/errmgr/errmgr.h"
  54 #include "orte/mca/ess/ess.h"
  55 #include "orte/mca/iof/base/base.h"
  56 #include "orte/mca/odls/base/base.h"
  57 #include "orte/mca/ras/base/base.h"
  58 #include "orte/mca/rmaps/rmaps.h"
  59 #include "orte/mca/rmaps/base/base.h"
  60 #include "orte/mca/rml/rml.h"
  61 #include "orte/mca/rml/rml_types.h"
  62 #include "orte/mca/routed/routed.h"
  63 #include "orte/mca/grpcomm/base/base.h"
  64 #if OPAL_ENABLE_FT_CR == 1
  65 #include "orte/mca/snapc/base/base.h"
  66 #endif
  67 #include "orte/mca/filem/filem.h"
  68 #include "orte/mca/filem/base/base.h"
  69 #include "orte/mca/grpcomm/base/base.h"
  70 #include "orte/mca/rml/base/rml_contact.h"
  71 #include "orte/mca/rtc/rtc.h"
  72 #include "orte/runtime/orte_globals.h"
  73 #include "orte/runtime/runtime.h"
  74 #include "orte/runtime/orte_locks.h"
  75 #include "orte/runtime/orte_quit.h"
  76 #include "orte/util/name_fns.h"
  77 #include "orte/util/pre_condition_transports.h"
  78 #include "orte/util/proc_info.h"
  79 #include "orte/util/threads.h"
  80 #include "orte/mca/state/state.h"
  81 #include "orte/mca/state/base/base.h"
  82 #include "orte/util/hostfile/hostfile.h"
  83 #include "orte/mca/odls/odls_types.h"
  84 
  85 #include "orte/mca/plm/base/plm_private.h"
  86 #include "orte/mca/plm/base/base.h"
  87 
  88 void orte_plm_base_set_slots(orte_node_t *node)
  89 {
  90     if (0 == strncmp(orte_set_slots, "cores", strlen(orte_set_slots))) {
  91         if (NULL != node->topology && NULL != node->topology->topo) {
  92             node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
  93                                                              HWLOC_OBJ_CORE, 0,
  94                                                              OPAL_HWLOC_LOGICAL);
  95         }
  96     } else if (0 == strncmp(orte_set_slots, "sockets", strlen(orte_set_slots))) {
  97         if (NULL != node->topology && NULL != node->topology->topo) {
  98             if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
  99                                                                        HWLOC_OBJ_SOCKET, 0,
 100                                                                        OPAL_HWLOC_LOGICAL))) {
 101                 
 102 
 103                 node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
 104                                                                  HWLOC_OBJ_NODE, 0,
 105                                                                  OPAL_HWLOC_LOGICAL);
 106             }
 107         }
 108     } else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
 109         if (NULL != node->topology && NULL != node->topology->topo) {
 110             node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
 111                                                              HWLOC_OBJ_NODE, 0,
 112                                                              OPAL_HWLOC_LOGICAL);
 113         }
 114     } else if (0 == strncmp(orte_set_slots, "hwthreads", strlen(orte_set_slots))) {
 115         if (NULL != node->topology && NULL != node->topology->topo) {
 116             node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
 117                                                              HWLOC_OBJ_PU, 0,
 118                                                              OPAL_HWLOC_LOGICAL);
 119         }
 120     } else {
 121         
 122         node->slots = strtol(orte_set_slots, NULL, 10);
 123     }
 124     
 125     ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
 126 }
 127 
 128 void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
 129 {
 130     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 131     orte_topology_t *t;
 132     orte_node_t *node;
 133     int i, rc;
 134     uint8_t u8;
 135     opal_buffer_t buf;
 136     orte_grpcomm_signature_t *sig;
 137     orte_daemon_cmd_flag_t command = ORTE_DAEMON_PASS_NODE_INFO_CMD;
 138 
 139     ORTE_ACQUIRE_OBJECT(caddy);
 140 
 141     
 142 
 143     if (orte_do_not_launch) {
 144         node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
 145         t = node->topology;
 146         for (i=1; i < orte_node_pool->size; i++) {
 147             if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
 148                 continue;
 149             }
 150             if (NULL == node->topology) {
 151                 node->topology = t;
 152             }
 153         }
 154     }
 155 
 156     
 157 
 158 
 159     if (!orte_managed_allocation) {
 160         if (NULL != orte_set_slots &&
 161             0 != strncmp(orte_set_slots, "none", strlen(orte_set_slots))) {
 162             caddy->jdata->total_slots_alloc = 0;
 163             for (i=0; i < orte_node_pool->size; i++) {
 164                 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
 165                     continue;
 166                 }
 167                 if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
 168                     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 169                                          "%s plm:base:setting slots for node %s by %s",
 170                                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, orte_set_slots));
 171                     orte_plm_base_set_slots(node);
 172                 }
 173                 caddy->jdata->total_slots_alloc += node->slots;
 174             }
 175         }
 176     }
 177 
 178     if (orte_display_allocation) {
 179         orte_ras_base_display_alloc();
 180     }
 181     
 182     orte_routed.update_routing_plan();
 183 
 184     
 185     OBJ_CONSTRUCT(&buf, opal_buffer_t);
 186     
 187     if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_DAEMON_CMD))) {
 188         ORTE_ERROR_LOG(rc);
 189         OBJ_DESTRUCT(&buf);
 190         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 191         OBJ_RELEASE(caddy);
 192         return;
 193     }
 194 
 195 
 196     
 197 
 198 
 199     if (!orte_nidmap_communicated) {
 200         u8 = 1;
 201         if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &u8, 1, OPAL_UINT8))) {
 202             ORTE_ERROR_LOG(rc);
 203             OBJ_DESTRUCT(&buf);
 204             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 205             OBJ_RELEASE(caddy);
 206             return;
 207         }
 208         if (OPAL_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, &buf))) {
 209             ORTE_ERROR_LOG(rc);
 210             OBJ_DESTRUCT(&buf);
 211             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 212             OBJ_RELEASE(caddy);
 213             return;
 214         }
 215         orte_nidmap_communicated = true;
 216     } else {
 217         u8 = 0;
 218         if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &u8, 1, OPAL_UINT8))) {
 219             ORTE_ERROR_LOG(rc);
 220             OBJ_DESTRUCT(&buf);
 221             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 222             OBJ_RELEASE(caddy);
 223             return;
 224         }
 225     }
 226 
 227     
 228 
 229 
 230     if (OPAL_SUCCESS != (rc = orte_util_pass_node_info(&buf))) {
 231         ORTE_ERROR_LOG(rc);
 232         OBJ_DESTRUCT(&buf);
 233         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 234         OBJ_RELEASE(caddy);
 235         return;
 236     }
 237 
 238     
 239     sig = OBJ_NEW(orte_grpcomm_signature_t);
 240     sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
 241     sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
 242     sig->signature[0].vpid = ORTE_VPID_WILDCARD;
 243     sig->sz = 1;
 244     if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, &buf))) {
 245         ORTE_ERROR_LOG(rc);
 246         OBJ_RELEASE(sig);
 247         OBJ_DESTRUCT(&buf);
 248         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 249         OBJ_RELEASE(caddy);
 250         return;
 251     }
 252     OBJ_DESTRUCT(&buf);
 253     
 254     OBJ_RELEASE(sig);
 255 
 256     
 257     caddy->jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
 258     ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_VM_READY);
 259 
 260     
 261     OBJ_RELEASE(caddy);
 262 }
 263 
 264 void orte_plm_base_allocation_complete(int fd, short args, void *cbdata)
 265 {
 266     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 267 
 268     ORTE_ACQUIRE_OBJECT(caddy);
 269 
 270     
 271 
 272 
 273     if (orte_do_not_launch) {
 274         caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
 275         ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_MAP);
 276     } else {
 277         
 278         caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
 279         ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
 280     }
 281 
 282     
 283     OBJ_RELEASE(caddy);
 284 }
 285 
 286 void orte_plm_base_daemons_launched(int fd, short args, void *cbdata)
 287 {
 288     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 289 
 290     ORTE_ACQUIRE_OBJECT(caddy);
 291 
 292     
 293 
 294 
 295 
 296     
 297     OBJ_RELEASE(caddy);
 298 }
 299 
 300 static void files_ready(int status, void *cbdata)
 301 {
 302     orte_job_t *jdata = (orte_job_t*)cbdata;
 303 
 304     if (ORTE_SUCCESS != status) {
 305         ORTE_FORCED_TERMINATE(status);
 306     } else {
 307         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
 308     }
 309 }
 310 
 311 void orte_plm_base_vm_ready(int fd, short args, void *cbdata)
 312 {
 313     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 314 
 315     ORTE_ACQUIRE_OBJECT(caddy);
 316 
 317     
 318     caddy->jdata->state = ORTE_JOB_STATE_VM_READY;
 319 
 320     
 321     if (ORTE_SUCCESS != orte_filem.preposition_files(caddy->jdata, files_ready, caddy->jdata)) {
 322         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 323     }
 324 
 325     
 326     OBJ_RELEASE(caddy);
 327 }
 328 
 329 void orte_plm_base_mapping_complete(int fd, short args, void *cbdata)
 330 {
 331     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 332 
 333     ORTE_ACQUIRE_OBJECT(caddy);
 334 
 335     
 336     caddy->jdata->state = ORTE_JOB_STATE_MAP_COMPLETE;
 337     ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_SYSTEM_PREP);
 338 
 339     
 340     OBJ_RELEASE(caddy);
 341 }
 342 
 343 
 344 void orte_plm_base_setup_job(int fd, short args, void *cbdata)
 345 {
 346     int rc;
 347     int i;
 348     orte_app_context_t *app;
 349     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 350     char *key;
 351     orte_job_t *parent;
 352     orte_process_name_t name, *nptr;
 353 
 354     ORTE_ACQUIRE_OBJECT(caddy);
 355 
 356     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 357                          "%s plm:base:setup_job",
 358                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 359 
 360     if (ORTE_JOB_STATE_INIT != caddy->job_state) {
 361         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 362         OBJ_RELEASE(caddy);
 363         return;
 364     }
 365     
 366     caddy->jdata->state = caddy->job_state;
 367 
 368     
 369     if (ORTE_JOBID_INVALID == caddy->jdata->jobid) {
 370         if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(caddy->jdata))) {
 371             ORTE_ERROR_LOG(rc);
 372             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 373             OBJ_RELEASE(caddy);
 374             return;
 375         }
 376 
 377         
 378 
 379 
 380 
 381 
 382         opal_hash_table_set_value_uint32(orte_job_data, caddy->jdata->jobid, caddy->jdata);
 383     }
 384 
 385     
 386     if (!ORTE_FLAG_TEST(caddy->jdata, ORTE_JOB_FLAG_RECOVERABLE) &&
 387         orte_enable_recovery) {
 388         ORTE_FLAG_SET(caddy->jdata, ORTE_JOB_FLAG_RECOVERABLE);
 389     }
 390 
 391     
 392 
 393 
 394 
 395 
 396 
 397      nptr = &name;
 398      if (orte_get_attribute(&caddy->jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&nptr, OPAL_NAME)) {
 399         
 400         if (NULL == (parent = orte_get_job_data_object(name.jobid))) {
 401             ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 402             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 403             OBJ_RELEASE(caddy);
 404             return;
 405         }
 406         
 407 
 408         key = NULL;
 409         if (orte_get_attribute(&parent->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&key, OPAL_STRING) &&
 410             NULL != key) {
 411             
 412             orte_set_attribute(&caddy->jdata->attributes, ORTE_JOB_TRANSPORT_KEY, ORTE_ATTR_LOCAL, key, OPAL_STRING);
 413             
 414             for (i=0; i < caddy->jdata->apps->size; i++) {
 415                 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(caddy->jdata->apps, i))) {
 416                     continue;
 417                 }
 418                 opal_setenv(OPAL_MCA_PREFIX"orte_precondition_transports", key, true, &app->env);
 419             }
 420             free(key);
 421         } else {
 422             if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata, NULL))) {
 423                 ORTE_ERROR_LOG(rc);
 424                 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 425                 OBJ_RELEASE(caddy);
 426                 return;
 427             }
 428         }
 429     } else {
 430         
 431 
 432         if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata, NULL))) {
 433             ORTE_ERROR_LOG(rc);
 434             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 435             OBJ_RELEASE(caddy);
 436             return;
 437         }
 438     }
 439 
 440     
 441     for (i=0; i < caddy->jdata->apps->size; i++) {
 442         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(caddy->jdata->apps, i))) {
 443             continue;
 444         }
 445         if (!orte_get_attribute(&app->attributes, ORTE_APP_RECOV_DEF, NULL, OPAL_BOOL)) {
 446             orte_set_attribute(&app->attributes, ORTE_APP_MAX_RESTARTS, ORTE_ATTR_LOCAL, &orte_max_restarts, OPAL_INT32);
 447         }
 448     }
 449 
 450     
 451     ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_INIT_COMPLETE);
 452 
 453     
 454     OBJ_RELEASE(caddy);
 455 }
 456 
 457 void orte_plm_base_setup_job_complete(int fd, short args, void *cbdata)
 458 {
 459     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 460 
 461     ORTE_ACQUIRE_OBJECT(caddy);
 462 
 463     
 464     ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE);
 465     OBJ_RELEASE(caddy);
 466 }
 467 
 468 void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
 469 {
 470     orte_job_t *jdata, *jdatorted;
 471     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 472     orte_node_t *node;
 473     uint32_t h;
 474     orte_vpid_t *vptr;
 475     int i, rc;
 476     char *serial_number;
 477     orte_process_name_t requestor, *rptr;
 478 
 479     ORTE_ACQUIRE_OBJECT(caddy);
 480 
 481     opal_output_verbose(5, orte_plm_base_framework.framework_output,
 482                         "%s complete_setup on job %s",
 483                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 484                         ORTE_JOBID_PRINT(caddy->jdata->jobid));
 485 
 486     
 487     if (ORTE_JOB_STATE_SYSTEM_PREP != caddy->job_state) {
 488         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 489         OBJ_RELEASE(caddy);
 490         return;
 491     }
 492     
 493     caddy->jdata->state = caddy->job_state;
 494 
 495     
 496     if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
 497         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 498         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 499         OBJ_RELEASE(caddy);
 500         return;
 501     }
 502 
 503     
 504     jdata = caddy->jdata;
 505 
 506     
 507 
 508 
 509 
 510 
 511 
 512 
 513 
 514     if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FWDIO_TO_TOOL, NULL, OPAL_BOOL)) {
 515         
 516         rptr = &requestor;
 517         if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&rptr, OPAL_NAME)) {
 518             ORTE_IOF_PROXY_PULL(jdata, rptr);
 519         } else {
 520             ORTE_IOF_PROXY_PULL(jdata, &jdata->originator);
 521         }
 522         
 523 
 524     }
 525 
 526     
 527 
 528 
 529 
 530 
 531 
 532     if (orte_coprocessors_detected) {
 533         
 534         for (i=0; i < orte_node_pool->size; i++) {
 535             if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
 536                 continue;
 537             }
 538             
 539             serial_number = NULL;
 540             if (!orte_get_attribute(&node->attributes, ORTE_NODE_SERIAL_NUMBER, (void**)&serial_number, OPAL_STRING)) {
 541                 continue;
 542             }
 543             if (NULL != serial_number) {
 544                 
 545 
 546 
 547                 OPAL_HASH_STR(serial_number, h);
 548                 free(serial_number);
 549                 if (OPAL_SUCCESS != (rc = opal_hash_table_get_value_uint32(orte_coprocessors, h,
 550                                                                            (void**)&vptr))) {
 551                     ORTE_ERROR_LOG(rc);
 552                     break;
 553                 }
 554                 orte_set_attribute(&node->attributes, ORTE_NODE_HOSTID, ORTE_ATTR_LOCAL, vptr, ORTE_VPID);
 555             }
 556         }
 557     }
 558     
 559     if (NULL != orte_coprocessors) {
 560         OBJ_RELEASE(orte_coprocessors);
 561     }
 562 
 563     
 564     ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_APPS);
 565 
 566     
 567     OBJ_RELEASE(caddy);
 568 }
 569 
 570 
 571 static void timer_cb(int fd, short event, void *cbdata)
 572 {
 573     orte_job_t *jdata = (orte_job_t*)cbdata;
 574     orte_timer_t *timer=NULL;
 575 
 576     ORTE_ACQUIRE_OBJECT(jdata);
 577 
 578     
 579     ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
 580 
 581     
 582     if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT, (void**)&timer, OPAL_PTR)) {
 583         
 584         OBJ_RELEASE(timer);
 585         orte_remove_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT);
 586     }
 587 }
 588 
 589 void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
 590 {
 591     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 592     orte_job_t *jdata;
 593     orte_daemon_cmd_flag_t command;
 594     int rc;
 595 
 596     ORTE_ACQUIRE_OBJECT(caddy);
 597 
 598     
 599     jdata = caddy->jdata;
 600 
 601     if (ORTE_JOB_STATE_LAUNCH_APPS != caddy->job_state) {
 602         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 603         OBJ_RELEASE(caddy);
 604         return;
 605     }
 606     
 607     caddy->jdata->state = caddy->job_state;
 608 
 609     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 610                          "%s plm:base:launch_apps for job %s",
 611                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 612                          ORTE_JOBID_PRINT(jdata->jobid)));
 613 
 614     
 615     if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, NULL, OPAL_BOOL)) {
 616         command = ORTE_DAEMON_DVM_ADD_PROCS;
 617     } else {
 618         command = ORTE_DAEMON_ADD_LOCAL_PROCS;
 619     }
 620     if (ORTE_SUCCESS != (rc = opal_dss.pack(&jdata->launch_msg, &command, 1, ORTE_DAEMON_CMD))) {
 621         ORTE_ERROR_LOG(rc);
 622         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 623         OBJ_RELEASE(caddy);
 624         return;
 625     }
 626 
 627     
 628     if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(&jdata->launch_msg, jdata->jobid))) {
 629         ORTE_ERROR_LOG(rc);
 630         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 631     }
 632 
 633     OBJ_RELEASE(caddy);
 634     return;
 635 }
 636 
 637 void orte_plm_base_send_launch_msg(int fd, short args, void *cbdata)
 638 {
 639     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 640     orte_timer_t *timer;
 641     orte_grpcomm_signature_t *sig;
 642     orte_job_t *jdata;
 643     int rc;
 644 
 645     
 646     jdata = caddy->jdata;
 647 
 648     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 649                          "%s plm:base:send launch msg for job %s",
 650                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 651                          ORTE_JOBID_PRINT(jdata->jobid)));
 652 
 653     
 654     if (orte_do_not_launch) {
 655         bool compressed;
 656         uint8_t *cmpdata;
 657         size_t cmplen;
 658         
 659         compressed = opal_compress.compress_block((uint8_t*)jdata->launch_msg.base_ptr,
 660                                               jdata->launch_msg.bytes_used,
 661                                               &cmpdata, &cmplen);
 662         if (compressed) {
 663             opal_output(0, "LAUNCH MSG RAW SIZE: %d COMPRESSED SIZE: %d",
 664                         (int)jdata->launch_msg.bytes_used, (int)cmplen);
 665             free(cmpdata);
 666         } else {
 667             opal_output(0, "LAUNCH MSG RAW SIZE: %d", (int)jdata->launch_msg.bytes_used);
 668         }
 669         orte_never_launched = true;
 670         ORTE_FORCED_TERMINATE(0);
 671         OBJ_RELEASE(caddy);
 672         return;
 673     }
 674 
 675     
 676     sig = OBJ_NEW(orte_grpcomm_signature_t);
 677     sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
 678     sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
 679     sig->signature[0].vpid = ORTE_VPID_WILDCARD;
 680     sig->sz = 1;
 681     if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, &jdata->launch_msg))) {
 682         ORTE_ERROR_LOG(rc);
 683         OBJ_RELEASE(sig);
 684         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 685         OBJ_RELEASE(caddy);
 686         return;
 687     }
 688     OBJ_DESTRUCT(&jdata->launch_msg);
 689     OBJ_CONSTRUCT(&jdata->launch_msg, opal_buffer_t);
 690     
 691     OBJ_RELEASE(sig);
 692 
 693     
 694 
 695 
 696     caddy->jdata->num_daemons_reported++;
 697 
 698     
 699 
 700 
 701     if (0 < orte_startup_timeout) {
 702         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 703                              "%s plm:base:launch defining timeout for job %s",
 704                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 705                              ORTE_JOBID_PRINT(jdata->jobid)));
 706         timer = OBJ_NEW(orte_timer_t);
 707         timer->payload = jdata;
 708         opal_event_evtimer_set(orte_event_base,
 709                                timer->ev, timer_cb, jdata);
 710         opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
 711         timer->tv.tv_sec = orte_startup_timeout;
 712         timer->tv.tv_usec = 0;
 713         orte_set_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT, ORTE_ATTR_LOCAL, timer, OPAL_PTR);
 714         ORTE_POST_OBJECT(timer);
 715         opal_event_evtimer_add(timer->ev, &timer->tv);
 716     }
 717 
 718     
 719     OBJ_RELEASE(caddy);
 720 }
 721 
 722 void orte_plm_base_post_launch(int fd, short args, void *cbdata)
 723 {
 724     int32_t rc;
 725     orte_job_t *jdata;
 726     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 727     orte_process_name_t name;
 728     orte_timer_t *timer=NULL;
 729     int ret;
 730     opal_buffer_t *answer;
 731     int room, *rmptr;
 732 
 733     ORTE_ACQUIRE_OBJECT(caddy);
 734 
 735     
 736     jdata = caddy->jdata;
 737 
 738     
 739     if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT, (void**)&timer, OPAL_PTR)) {
 740         opal_event_evtimer_del(timer->ev);
 741         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 742                              "%s plm:base:launch deleting timeout for job %s",
 743                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 744                              ORTE_JOBID_PRINT(jdata->jobid)));
 745         OBJ_RELEASE(timer);
 746         orte_remove_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT);
 747     }
 748 
 749     if (ORTE_JOB_STATE_RUNNING != caddy->job_state) {
 750         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 751         OBJ_RELEASE(caddy);
 752         return;
 753     }
 754     
 755     caddy->jdata->state = caddy->job_state;
 756 
 757     
 758     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 759                          "%s plm:base:launch wiring up iof for job %s",
 760                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 761                          ORTE_JOBID_PRINT(jdata->jobid)));
 762 
 763     
 764     name.jobid = jdata->jobid;
 765     name.vpid = jdata->stdin_target;
 766 
 767     if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) {
 768         ORTE_ERROR_LOG(rc);
 769         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 770         OBJ_RELEASE(caddy);
 771         return;
 772     }
 773 
 774     
 775     if (ORTE_JOBID_INVALID == jdata->originator.jobid) {
 776         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 777                              "%s plm:base:launch job %s is not a dynamic spawn",
 778                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 779                              ORTE_JOBID_PRINT(jdata->jobid)));
 780         goto cleanup;
 781     }
 782 
 783     
 784     rc = ORTE_SUCCESS;
 785     answer = OBJ_NEW(opal_buffer_t);
 786     
 787     if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
 788         ORTE_ERROR_LOG(ret);
 789         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 790         OBJ_RELEASE(caddy);
 791         return;
 792     }
 793     
 794     if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
 795         ORTE_ERROR_LOG(ret);
 796         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 797         OBJ_RELEASE(caddy);
 798         return;
 799     }
 800     
 801     rmptr = &room;
 802     if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
 803         if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
 804             ORTE_ERROR_LOG(ret);
 805             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 806             OBJ_RELEASE(caddy);
 807             return;
 808         }
 809     }
 810     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 811                          "%s plm:base:launch sending dyn release of job %s to %s",
 812                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 813                          ORTE_JOBID_PRINT(jdata->jobid),
 814                          ORTE_NAME_PRINT(&jdata->originator)));
 815     if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
 816                                            ORTE_RML_TAG_LAUNCH_RESP,
 817                                            orte_rml_send_callback, NULL))) {
 818         ORTE_ERROR_LOG(ret);
 819         OBJ_RELEASE(answer);
 820         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 821         OBJ_RELEASE(caddy);
 822         return;
 823     }
 824 
 825   cleanup:
 826     
 827     OBJ_RELEASE(caddy);
 828 }
 829 
 830 void orte_plm_base_registered(int fd, short args, void *cbdata)
 831 {
 832     orte_job_t *jdata;
 833     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 834 
 835     ORTE_ACQUIRE_OBJECT(caddy);
 836 
 837     
 838     jdata = caddy->jdata;
 839 
 840     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 841                          "%s plm:base:launch %s registered",
 842                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 843                          ORTE_JOBID_PRINT(jdata->jobid)));
 844 
 845     if (ORTE_JOB_STATE_REGISTERED != caddy->job_state) {
 846         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 847                              "%s plm:base:launch job %s not registered - state %s",
 848                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 849                              ORTE_JOBID_PRINT(jdata->jobid),
 850                              orte_job_state_to_str(caddy->job_state)));
 851         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 852         OBJ_RELEASE(caddy);
 853         return;
 854     }
 855     
 856     jdata->state = caddy->job_state;
 857 
 858    
 859     if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
 860         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
 861     }
 862 
 863     OBJ_RELEASE(caddy);
 864 }
 865 
 866 
 867 static bool orted_failed_launch;
 868 static orte_job_t *jdatorted=NULL;
 869 
 870 
 871 void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender,
 872                                    opal_buffer_t *buffer,
 873                                    orte_rml_tag_t tag, void *cbdata)
 874 {
 875     hwloc_topology_t topo;
 876     int rc, idx;
 877     char *sig, *coprocessors, **sns;
 878     orte_proc_t *daemon=NULL;
 879     orte_topology_t *t, *t2;
 880     int i;
 881     uint32_t h;
 882     orte_job_t *jdata;
 883     uint8_t flag;
 884     size_t inlen, cmplen;
 885     uint8_t *packed_data, *cmpdata;
 886     opal_buffer_t datbuf, *data;
 887 
 888     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 889                          "%s plm:base:daemon_topology recvd for daemon %s",
 890                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 891                          ORTE_NAME_PRINT(sender)));
 892 
 893     
 894     if (NULL == jdatorted) {
 895         jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
 896     }
 897     if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, sender->vpid))) {
 898         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 899         orted_failed_launch = true;
 900         goto CLEANUP;
 901     }
 902     OBJ_CONSTRUCT(&datbuf, opal_buffer_t);
 903     
 904     idx=1;
 905     if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &idx, OPAL_INT8))) {
 906         ORTE_ERROR_LOG(rc);
 907         orted_failed_launch = true;
 908         goto CLEANUP;
 909     }
 910     if (flag) {
 911         
 912         idx=1;
 913         if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &inlen, &idx, OPAL_SIZE))) {
 914             ORTE_ERROR_LOG(rc);
 915             orted_failed_launch = true;
 916             goto CLEANUP;
 917         }
 918         
 919         idx=1;
 920         if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &cmplen, &idx, OPAL_SIZE))) {
 921             ORTE_ERROR_LOG(rc);
 922             orted_failed_launch = true;
 923             goto CLEANUP;
 924         }
 925         
 926         packed_data = (uint8_t*)malloc(inlen);
 927         
 928         idx = inlen;
 929         if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, packed_data, &idx, OPAL_UINT8))) {
 930             ORTE_ERROR_LOG(rc);
 931             orted_failed_launch = true;
 932             goto CLEANUP;
 933         }
 934         
 935         if (opal_compress.decompress_block(&cmpdata, cmplen,
 936                                        packed_data, inlen)) {
 937             
 938             opal_dss.load(&datbuf, cmpdata, cmplen);
 939             data = &datbuf;
 940         } else {
 941             data = buffer;
 942         }
 943         free(packed_data);
 944     } else {
 945         data = buffer;
 946     }
 947 
 948     
 949     idx=1;
 950     if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &sig, &idx, OPAL_STRING))) {
 951         ORTE_ERROR_LOG(rc);
 952         orted_failed_launch = true;
 953         goto CLEANUP;
 954     }
 955     
 956     t = NULL;
 957     for (i=0; i < orte_node_topologies->size; i++) {
 958         if (NULL == (t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, i))) {
 959             continue;
 960         }
 961         
 962         if (0 == strcmp(sig, t2->sig)) {
 963             t = t2;
 964             break;
 965         }
 966     }
 967     if (NULL == t) {
 968         
 969         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 970         orted_failed_launch = true;
 971         goto CLEANUP;
 972     }
 973 
 974     
 975     idx=1;
 976     if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &topo, &idx, OPAL_HWLOC_TOPO))) {
 977         ORTE_ERROR_LOG(rc);
 978         orted_failed_launch = true;
 979         goto CLEANUP;
 980     }
 981     
 982     t->topo = topo;
 983 
 984     
 985     idx=1;
 986     if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &coprocessors, &idx, OPAL_STRING))) {
 987         ORTE_ERROR_LOG(rc);
 988         orted_failed_launch = true;
 989         goto CLEANUP;
 990     }
 991     if (NULL != coprocessors) {
 992         
 993         if (NULL == orte_coprocessors) {
 994             orte_coprocessors = OBJ_NEW(opal_hash_table_t);
 995             opal_hash_table_init(orte_coprocessors, orte_process_info.num_procs);
 996         }
 997         
 998 
 999 
1000         sns = opal_argv_split(coprocessors, ',');
1001         for (idx=0; NULL != sns[idx]; idx++) {
1002             
1003             OPAL_HASH_STR(sns[idx], h);
1004             
1005             opal_hash_table_set_value_uint32(orte_coprocessors, h, (void*)&daemon->name.vpid);
1006         }
1007         opal_argv_free(sns);
1008         free(coprocessors);
1009         orte_coprocessors_detected = true;
1010     }
1011     
1012     idx=1;
1013     if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &coprocessors, &idx, OPAL_STRING))) {
1014         ORTE_ERROR_LOG(rc);
1015         orted_failed_launch = true;
1016         goto CLEANUP;
1017     }
1018     if (NULL != coprocessors) {
1019         if (orte_get_attribute(&daemon->node->attributes, ORTE_NODE_SERIAL_NUMBER, NULL, OPAL_STRING)) {
1020             
1021 
1022 
1023             ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
1024             orted_failed_launch = true;
1025             free(coprocessors);
1026             goto CLEANUP;
1027         }
1028         orte_set_attribute(&daemon->node->attributes, ORTE_NODE_SERIAL_NUMBER, ORTE_ATTR_LOCAL, coprocessors, OPAL_STRING);
1029         free(coprocessors);
1030         orte_coprocessors_detected = true;
1031     }
1032 
1033   CLEANUP:
1034     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1035                          "%s plm:base:orted:report_topo launch %s for daemon %s",
1036                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1037                          orted_failed_launch ? "failed" : "completed",
1038                          ORTE_NAME_PRINT(sender)));
1039 
1040     if (orted_failed_launch) {
1041         ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_FAILED_TO_START);
1042         return;
1043     } else {
1044         jdatorted->num_reported++;
1045         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1046                              "%s plm:base:orted_report_launch recvd %d of %d reported daemons",
1047                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1048                              jdatorted->num_reported, jdatorted->num_procs));
1049         if (jdatorted->num_procs == jdatorted->num_reported) {
1050             bool dvm = true;
1051             uint32_t key;
1052             void *nptr;
1053             jdatorted->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
1054             
1055 
1056 
1057             rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jdata, &nptr);
1058             while (OPAL_SUCCESS == rc) {
1059                 if (ORTE_PROC_MY_NAME->jobid != jdata->jobid) {
1060                     dvm = false;
1061                     if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) {
1062                         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
1063                     }
1064                 }
1065                 rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jdata, nptr, &nptr);
1066             }
1067             if (dvm) {
1068                 
1069                 ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_DAEMONS_REPORTED);
1070             }
1071         }
1072     }
1073 }
1074 
1075 void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
1076                                    opal_buffer_t *buffer,
1077                                    orte_rml_tag_t tag, void *cbdata)
1078 {
1079     char *ptr;
1080     int rc, idx;
1081     orte_proc_t *daemon=NULL;
1082     orte_job_t *jdata;
1083     orte_process_name_t dname;
1084     opal_buffer_t *relay;
1085     char *sig;
1086     orte_topology_t *t;
1087     hwloc_topology_t topo;
1088     int i;
1089     bool found;
1090     orte_daemon_cmd_flag_t cmd;
1091     int32_t flag;
1092     opal_value_t *kv;
1093     char *myendian;
1094 
1095     
1096     if (NULL == jdatorted) {
1097         jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
1098     }
1099 
1100     
1101     t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
1102     if (NULL == t) {
1103         
1104         myendian = "unknown";
1105     } else {
1106         myendian = strrchr(t->sig, ':');
1107         ++myendian;
1108     }
1109 
1110     
1111     idx = 1;
1112     while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) {
1113         char *nodename = NULL;
1114 
1115         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1116                              "%s plm:base:orted_report_launch from daemon %s",
1117                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1118                              ORTE_NAME_PRINT(&dname)));
1119 
1120         
1121         if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, dname.vpid))) {
1122             ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1123             orted_failed_launch = true;
1124             goto CLEANUP;
1125         }
1126         daemon->state = ORTE_PROC_STATE_RUNNING;
1127         
1128         ORTE_FLAG_SET(daemon, ORTE_PROC_FLAG_ALIVE);
1129 
1130         
1131 
1132         idx = 1;
1133         if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &idx, OPAL_INT32))) {
1134             ORTE_ERROR_LOG(rc);
1135             orted_failed_launch = true;
1136             goto CLEANUP;
1137         }
1138         for (i=0; i < flag; i++) {
1139             idx = 1;
1140             if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &kv, &idx, OPAL_VALUE))) {
1141                 ORTE_ERROR_LOG(rc);
1142                 orted_failed_launch = true;
1143                 goto CLEANUP;
1144             }
1145             
1146             opal_pmix.store_local(&dname, kv);
1147             OBJ_RELEASE(kv);
1148         }
1149 
1150         
1151         idx = 1;
1152         if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &nodename, &idx, OPAL_STRING))) {
1153             ORTE_ERROR_LOG(rc);
1154             orted_failed_launch = true;
1155             goto CLEANUP;
1156         }
1157         if (!orte_have_fqdn_allocation) {
1158             
1159             if (NULL != (ptr = strchr(nodename, '.'))) {
1160                 *ptr = '\0';
1161                 ptr = strdup(nodename);
1162                 free(nodename);
1163                 nodename = ptr;
1164             }
1165         }
1166 
1167         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1168                              "%s plm:base:orted_report_launch from daemon %s on node %s",
1169                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1170                              ORTE_NAME_PRINT(&daemon->name), nodename));
1171 
1172         
1173         ORTE_FLAG_SET(daemon->node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
1174 
1175         if (orte_retain_aliases) {
1176             char *alias, **atmp=NULL;
1177             uint8_t naliases, ni;
1178             
1179 
1180 
1181 
1182 
1183 
1184 
1185             opal_argv_append_nosize(&atmp, nodename);
1186             
1187             idx = 1;
1188             if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &naliases, &idx, OPAL_UINT8))) {
1189                 ORTE_ERROR_LOG(rc);
1190                 orted_failed_launch = true;
1191                 goto CLEANUP;
1192             }
1193             for (ni=0; ni < naliases; ni++) {
1194                 idx = 1;
1195                 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &alias, &idx, OPAL_STRING))) {
1196                     ORTE_ERROR_LOG(rc);
1197                     orted_failed_launch = true;
1198                     goto CLEANUP;
1199                 }
1200                 opal_argv_append_nosize(&atmp, alias);
1201                 free(alias);
1202             }
1203             if (0 < naliases) {
1204                 alias = opal_argv_join(atmp, ',');
1205                 orte_set_attribute(&daemon->node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, alias, OPAL_STRING);
1206                 free(alias);
1207             }
1208             opal_argv_free(atmp);
1209         }
1210 
1211         
1212         idx=1;
1213         if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &sig, &idx, OPAL_STRING))) {
1214             ORTE_ERROR_LOG(rc);
1215             orted_failed_launch = true;
1216             goto CLEANUP;
1217         }
1218         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1219                              "%s RECEIVED TOPOLOGY SIG %s FROM NODE %s",
1220                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), sig, nodename));
1221 
1222         
1223         topo = NULL;
1224         if (1 == dname.vpid) {
1225             uint8_t flag;
1226             size_t inlen, cmplen;
1227             uint8_t *packed_data, *cmpdata;
1228             opal_buffer_t datbuf, *data;
1229             OBJ_CONSTRUCT(&datbuf, opal_buffer_t);
1230             
1231             idx=1;
1232             if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &idx, OPAL_INT8))) {
1233                 ORTE_ERROR_LOG(rc);
1234                 orted_failed_launch = true;
1235                 goto CLEANUP;
1236             }
1237             if (flag) {
1238                 
1239                 idx=1;
1240                 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &inlen, &idx, OPAL_SIZE))) {
1241                     ORTE_ERROR_LOG(rc);
1242                     orted_failed_launch = true;
1243                     goto CLEANUP;
1244                 }
1245                 
1246                 idx=1;
1247                 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &cmplen, &idx, OPAL_SIZE))) {
1248                     ORTE_ERROR_LOG(rc);
1249                     orted_failed_launch = true;
1250                     goto CLEANUP;
1251                 }
1252                 
1253                 packed_data = (uint8_t*)malloc(inlen);
1254                 
1255                 idx = inlen;
1256                 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, packed_data, &idx, OPAL_UINT8))) {
1257                     ORTE_ERROR_LOG(rc);
1258                     orted_failed_launch = true;
1259                     goto CLEANUP;
1260                 }
1261                 
1262                 if (opal_compress.decompress_block(&cmpdata, cmplen,
1263                                                packed_data, inlen)) {
1264                     
1265                     opal_dss.load(&datbuf, cmpdata, cmplen);
1266                     data = &datbuf;
1267                 } else {
1268                     data = buffer;
1269                 }
1270                 free(packed_data);
1271             } else {
1272                 data = buffer;
1273             }
1274             idx=1;
1275             if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &topo, &idx, OPAL_HWLOC_TOPO))) {
1276                 ORTE_ERROR_LOG(rc);
1277                 orted_failed_launch = true;
1278                 goto CLEANUP;
1279             }
1280         }
1281 
1282         
1283         found = false;
1284         for (i=0; i < orte_node_topologies->size; i++) {
1285             if (NULL == (t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, i))) {
1286                 continue;
1287             }
1288             
1289             if (0 == strcmp(sig, t->sig)) {
1290                 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1291                                      "%s TOPOLOGY ALREADY RECORDED",
1292                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1293                 found = true;
1294                 daemon->node->topology = t;
1295                 if (NULL != topo) {
1296                     hwloc_topology_destroy(topo);
1297                 }
1298                 free(sig);
1299                 break;
1300             }
1301 #if !OPAL_ENABLE_HETEROGENEOUS_SUPPORT
1302               else {
1303                 
1304                 ptr = strrchr(sig, ':');
1305                 ++ptr;
1306                 if (0 != strcmp(ptr, myendian)) {
1307                     
1308 
1309                     orte_show_help("help-plm-base", "multi-endian", true,
1310                                    nodename, ptr, myendian);
1311                     orted_failed_launch = true;
1312                     if (NULL != topo) {
1313                         hwloc_topology_destroy(topo);
1314                     }
1315                     goto CLEANUP;
1316                 }
1317             }
1318 #endif
1319         }
1320 
1321         if (!found) {
1322             
1323             OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1324                                  "%s NEW TOPOLOGY - ADDING",
1325                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1326             t = OBJ_NEW(orte_topology_t);
1327             t->sig = sig;
1328             t->index = opal_pointer_array_add(orte_node_topologies, t);
1329             daemon->node->topology = t;
1330             if (NULL != topo) {
1331                 t->topo = topo;
1332             } else {
1333                 
1334                 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1335                                      "%s REQUESTING TOPOLOGY FROM %s",
1336                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1337                                      ORTE_NAME_PRINT(&dname)));
1338                 
1339                 relay = OBJ_NEW(opal_buffer_t);
1340                 cmd = ORTE_DAEMON_REPORT_TOPOLOGY_CMD;
1341                 if (OPAL_SUCCESS != (rc = opal_dss.pack(relay, &cmd, 1, ORTE_DAEMON_CMD))) {
1342                     ORTE_ERROR_LOG(rc);
1343                     OBJ_RELEASE(relay);
1344                     orted_failed_launch = true;
1345                     goto CLEANUP;
1346                 }
1347                 
1348                 orte_rml.send_buffer_nb(&dname, relay,
1349                                         ORTE_RML_TAG_DAEMON,
1350                                         orte_rml_send_callback, NULL);
1351                 
1352 
1353                 if (NULL != nodename) {
1354                     free(nodename);
1355                     nodename = NULL;
1356                 }
1357                 idx = 1;
1358                 continue;
1359             }
1360         }
1361 
1362       CLEANUP:
1363         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1364                              "%s plm:base:orted_report_launch %s for daemon %s at contact %s",
1365                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1366                              orted_failed_launch ? "failed" : "completed",
1367                              ORTE_NAME_PRINT(&dname),
1368                              (NULL == daemon) ? "UNKNOWN" : daemon->rml_uri));
1369 
1370         if (NULL != nodename) {
1371             free(nodename);
1372             nodename = NULL;
1373         }
1374 
1375         if (orted_failed_launch) {
1376             ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_FAILED_TO_START);
1377             return;
1378         } else {
1379             jdatorted->num_reported++;
1380             OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1381                                  "%s plm:base:orted_report_launch job %s recvd %d of %d reported daemons",
1382                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1383                                  ORTE_JOBID_PRINT(jdatorted->jobid),
1384                                  jdatorted->num_reported, jdatorted->num_procs));
1385             if (jdatorted->num_procs == jdatorted->num_reported) {
1386                 bool dvm = true;
1387                 uint32_t key;
1388                 void *nptr;
1389                 jdatorted->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
1390                 
1391 
1392 
1393                 rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jdata, &nptr);
1394                 while (OPAL_SUCCESS == rc) {
1395                     if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
1396                         goto next;
1397                     }
1398                     dvm = false;
1399                     if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) {
1400                         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
1401                     }
1402                   next:
1403                     rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jdata, nptr, &nptr);
1404                 }
1405                 if (dvm) {
1406                     
1407                     ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_DAEMONS_REPORTED);
1408                 }
1409             }
1410         }
1411         idx = 1;
1412     }
1413     if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
1414         ORTE_ERROR_LOG(rc);
1415         ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_FAILED_TO_START);
1416     }
1417 }
1418 
1419 void orte_plm_base_daemon_failed(int st, orte_process_name_t* sender,
1420                                  opal_buffer_t *buffer,
1421                                  orte_rml_tag_t tag, void *cbdata)
1422 {
1423     int status, rc;
1424     int32_t n;
1425     orte_vpid_t vpid;
1426     orte_proc_t *daemon=NULL;
1427 
1428     
1429     if (NULL == jdatorted) {
1430         jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
1431     }
1432 
1433     
1434     n=1;
1435     if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &vpid, &n, ORTE_VPID))) {
1436         ORTE_ERROR_LOG(rc);
1437         ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
1438         goto finish;
1439     }
1440 
1441     
1442     n=1;
1443     if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &status, &n, OPAL_INT))) {
1444         ORTE_ERROR_LOG(rc);
1445         status = ORTE_ERROR_DEFAULT_EXIT_CODE;
1446         ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
1447     } else {
1448         ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
1449     }
1450 
1451     
1452     if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, vpid))) {
1453         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1454         goto finish;
1455     }
1456     daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
1457     daemon->exit_code = status;
1458 
1459   finish:
1460     if (NULL == daemon) {
1461         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
1462         return;
1463     }
1464     ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START);
1465 }
1466 
1467 int orte_plm_base_setup_orted_cmd(int *argc, char ***argv)
1468 {
1469     int i, loc;
1470     char **tmpv;
1471 
1472     
1473 
1474 
1475     loc = 0;
1476     
1477     tmpv = opal_argv_split(orte_launch_agent, ' ');
1478     for (i = 0; NULL != tmpv && NULL != tmpv[i]; ++i) {
1479         if (0 == strcmp(tmpv[i], "orted")) {
1480             loc = i;
1481         }
1482         opal_argv_append(argc, argv, tmpv[i]);
1483     }
1484     opal_argv_free(tmpv);
1485 
1486     return loc;
1487 }
1488 
1489 
1490 
1491 
1492 
1493 int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
1494                                           char *ess,
1495                                           int *proc_vpid_index)
1496 {
1497     char *param = NULL;
1498     const char **tmp_value, **tmp_value2;
1499     int loc_id;
1500     char *tmp_force = NULL;
1501     int i, j, cnt, rc;
1502     orte_job_t *jdata;
1503     unsigned long num_procs;
1504     bool ignore;
1505 
1506     
1507     if (orte_debug_flag) {
1508         opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1509         opal_argv_append(argc, argv, "orte_debug");
1510         opal_argv_append(argc, argv, "1");
1511     }
1512     if (orte_debug_daemons_flag) {
1513         opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1514         opal_argv_append(argc, argv, "orte_debug_daemons");
1515         opal_argv_append(argc, argv, "1");
1516     }
1517     if (orte_debug_daemons_file_flag) {
1518         opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1519         opal_argv_append(argc, argv, "orte_debug_daemons_file");
1520         opal_argv_append(argc, argv, "1");
1521     }
1522     if (orte_leave_session_attached) {
1523         opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1524         opal_argv_append(argc, argv, "orte_leave_session_attached");
1525         opal_argv_append(argc, argv, "1");
1526     }
1527 
1528     if (orted_spin_flag) {
1529         opal_argv_append(argc, argv, "--spin");
1530     }
1531 
1532     if (opal_hwloc_report_bindings) {
1533         opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1534         opal_argv_append(argc, argv, "orte_report_bindings");
1535         opal_argv_append(argc, argv, "1");
1536     }
1537 
1538     if (orte_map_stddiag_to_stderr) {
1539         opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1540         opal_argv_append(argc, argv, "orte_map_stddiag_to_stderr");
1541         opal_argv_append(argc, argv, "1");
1542     }
1543     else if (orte_map_stddiag_to_stdout) {
1544         opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1545         opal_argv_append(argc, argv, "orte_map_stddiag_to_stdout");
1546         opal_argv_append(argc, argv, "1");
1547     }
1548 
1549     
1550     if (NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
1551         opal_argv_append(argc, argv, "--test-suicide");
1552     }
1553 
1554     
1555     if (NULL != ess) {
1556         opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1557         opal_argv_append(argc, argv, "ess");
1558         opal_argv_append(argc, argv, ess);
1559     }
1560 
1561     
1562     opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1563     opal_argv_append(argc, argv, "ess_base_jobid");
1564     if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(¶m, ORTE_PROC_MY_NAME->jobid))) {
1565         ORTE_ERROR_LOG(rc);
1566         return rc;
1567     }
1568     opal_argv_append(argc, argv, param);
1569     free(param);
1570 
1571     
1572     if (NULL != proc_vpid_index) {
1573         opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1574         opal_argv_append(argc, argv, "ess_base_vpid");
1575         *proc_vpid_index = *argc;
1576         opal_argv_append(argc, argv, "<template>");
1577     }
1578 
1579     
1580     if (ORTE_PROC_IS_HNP) {
1581         jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
1582         num_procs = jdata->num_procs;
1583     } else {
1584         num_procs = orte_process_info.num_procs;
1585     }
1586     opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1587     opal_argv_append(argc, argv, "ess_base_num_procs");
1588     opal_asprintf(¶m, "%lu", num_procs);
1589     opal_argv_append(argc, argv, param);
1590     free(param);
1591 
1592     
1593     opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1594     opal_argv_append(argc, argv, "orte_hnp_uri");
1595     opal_argv_append(argc, argv, orte_process_info.my_hnp_uri);
1596 
1597     
1598     if (NULL != orte_xterm) {
1599         opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1600         opal_argv_append(argc, argv, "orte_xterm");
1601         opal_argv_append(argc, argv, orte_xterm);
1602     }
1603 
1604     loc_id = mca_base_var_find("opal", "mca", "base", "param_files");
1605     if (loc_id < 0) {
1606         rc = OPAL_ERR_NOT_FOUND;
1607         ORTE_ERROR_LOG(rc);
1608         return rc;
1609     }
1610     tmp_value = NULL;
1611     rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL);
1612     if (ORTE_SUCCESS != rc) {
1613         ORTE_ERROR_LOG(rc);
1614         return rc;
1615     }
1616     if (NULL != tmp_value && NULL != tmp_value[0]) {
1617         rc = strcmp(tmp_value[0], "none");
1618     } else {
1619         rc = 1;
1620     }
1621 
1622     if (0 != rc) {
1623         
1624 
1625 
1626         
1627         tmp_value = NULL;
1628 
1629         loc_id = mca_base_var_find("opal", "mca", "base", "envar_file_prefix");
1630         if (loc_id < 0) {
1631             rc = OPAL_ERR_NOT_FOUND;
1632             ORTE_ERROR_LOG(rc);
1633             return rc;
1634         }
1635         rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL);
1636         if (ORTE_SUCCESS != rc) {
1637             ORTE_ERROR_LOG(rc);
1638             return rc;
1639         }
1640         if( NULL != tmp_value && NULL != tmp_value[0] ) {
1641             
1642 
1643 
1644             opal_argv_append(argc, argv, "-mca");
1645             opal_argv_append(argc, argv, "mca_base_envar_file_prefix");
1646             opal_argv_append(argc, argv, tmp_value[0]);
1647         }
1648 
1649         tmp_value2 = NULL;
1650         loc_id = mca_base_var_find("opal", "mca", "base", "param_file_prefix");
1651         mca_base_var_get_value(loc_id, &tmp_value2, NULL, NULL);
1652         if( NULL != tmp_value2 && NULL != tmp_value2[0] ) {
1653             
1654 
1655 
1656             opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1657             opal_argv_append(argc, argv, "mca_base_param_file_prefix");
1658             opal_argv_append(argc, argv, tmp_value2[0]);
1659             orte_show_help("help-plm-base.txt", "deprecated-amca", true);
1660         }
1661 
1662         if ((NULL != tmp_value && NULL != tmp_value[0])
1663             || (NULL != tmp_value2 && NULL != tmp_value2[0])) {
1664             
1665             tmp_value = NULL;
1666             loc_id = mca_base_var_find("opal", "mca", "base", "param_file_path");
1667             if (loc_id < 0) {
1668                 ORTE_ERROR_LOG(rc);
1669                 return rc;
1670             }
1671             rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL);
1672             if (ORTE_SUCCESS != rc) {
1673                 ORTE_ERROR_LOG(rc);
1674                 return rc;
1675             }
1676             if( NULL != tmp_value && NULL != tmp_value[0] ) {
1677                 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1678                 opal_argv_append(argc, argv, "mca_base_param_file_path");
1679                 opal_argv_append(argc, argv, tmp_value[0]);
1680             }
1681 
1682             
1683             opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1684             opal_argv_append(argc, argv, "mca_base_param_file_path_force");
1685 
1686             tmp_value = NULL;
1687             loc_id = mca_base_var_find("opal", "mca", "base", "param_file_path_force");
1688             if (loc_id < 0) {
1689                 rc = OPAL_ERR_NOT_FOUND;
1690                 ORTE_ERROR_LOG(rc);
1691                 return rc;
1692             }
1693             rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL);
1694             if (OPAL_SUCCESS != rc) {
1695                 ORTE_ERROR_LOG(rc);
1696                 return rc;
1697             }
1698             if( NULL == tmp_value || NULL == tmp_value[0] ) {
1699                 
1700                 tmp_force = (char *) malloc(sizeof(char) * OPAL_PATH_MAX);
1701                 if (NULL == getcwd(tmp_force, OPAL_PATH_MAX)) {
1702                     free(tmp_force);
1703                     tmp_force = strdup("");
1704                 }
1705 
1706                 opal_argv_append(argc, argv, tmp_force);
1707                 free(tmp_force);
1708             } else {
1709                 opal_argv_append(argc, argv, tmp_value[0]);
1710             }
1711         }
1712     }
1713 
1714     
1715 
1716 
1717 
1718     if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
1719         cnt = opal_argv_count(orted_cmd_line);
1720         for (i=0; i < cnt; i+=3) {
1721             
1722 
1723 
1724 
1725 
1726 
1727 
1728 
1729             if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
1730                 continue;
1731             }
1732             
1733 
1734 
1735 
1736 
1737 
1738             if (0 == strcmp(orted_cmd_line[i+1], "plm")) {
1739                 continue;
1740             }
1741             
1742             ignore = false;
1743             for (j=0; j < *argc; j++) {
1744                 if (0 == strcmp((*argv)[j], orted_cmd_line[i+1])) {
1745                     ignore = true;
1746                     break;
1747                 }
1748             }
1749             if (!ignore) {
1750                 
1751                 opal_argv_append(argc, argv, orted_cmd_line[i]);
1752                 opal_argv_append(argc, argv, orted_cmd_line[i+1]);
1753                 opal_argv_append(argc, argv, orted_cmd_line[i+2]);
1754             }
1755         }
1756     }
1757 
1758     return ORTE_SUCCESS;
1759 }
1760 
1761 int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
1762 {
1763     orte_node_t *node, *nptr;
1764     orte_proc_t *proc, *pptr;
1765     orte_job_map_t *map=NULL;
1766     int rc, i;
1767     orte_job_t *daemons;
1768     opal_list_t nodes, tnodes;
1769     opal_list_item_t *item, *next;
1770     orte_app_context_t *app;
1771     bool one_filter = false;
1772     int num_nodes;
1773     bool default_hostfile_used;
1774     char *hosts = NULL;
1775     bool singleton=false;
1776     bool multi_sim = false;
1777 
1778     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1779                          "%s plm:base:setup_vm",
1780                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1781 
1782     if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
1783         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1784         return ORTE_ERR_NOT_FOUND;
1785     }
1786     if (NULL == daemons->map) {
1787         daemons->map = OBJ_NEW(orte_job_map_t);
1788     }
1789     map = daemons->map;
1790 
1791     
1792 
1793     if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, NULL, OPAL_BOOL)) {
1794         
1795         daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
1796         map->num_new_daemons = 0;
1797         return ORTE_SUCCESS;
1798     }
1799 
1800     
1801 
1802 
1803     if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
1804         if (0 == map->num_nodes) {
1805             OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1806                                  "%s plm:base:setup_vm creating map",
1807                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1808             
1809 
1810 
1811 
1812 
1813             node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
1814             opal_pointer_array_add(map->nodes, (void*)node);
1815             ++(map->num_nodes);
1816             
1817             OBJ_RETAIN(node);
1818             
1819             singleton = true;
1820         }
1821         OBJ_CONSTRUCT(&nodes, opal_list_t);
1822         for (i=1; i < orte_node_pool->size; i++) {
1823             if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
1824                 continue;
1825             }
1826             
1827             if (!singleton && ORTE_NODE_STATE_ADDED != node->state) {
1828                 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
1829                                      "%s plm_base:setup_vm NODE %s WAS NOT ADDED",
1830                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
1831                 continue;
1832             }
1833             OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
1834                                  "%s plm_base:setup_vm ADDING NODE %s",
1835                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
1836             
1837 
1838 
1839             OBJ_RETAIN(node);
1840             opal_list_append(&nodes, &node->super);
1841             
1842             node->state = ORTE_NODE_STATE_UP;
1843         }
1844         map->num_new_daemons = 0;
1845         
1846 
1847 
1848         if (0 == opal_list_get_size(&nodes)) {
1849             OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1850                                  "%s plm:base:setup_vm no new daemons required",
1851                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1852             OBJ_DESTRUCT(&nodes);
1853             
1854             daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
1855             ORTE_FLAG_UNSET(daemons, ORTE_JOB_FLAG_UPDATED);
1856             return ORTE_SUCCESS;
1857         }
1858         
1859         goto process;
1860     }
1861 
1862     
1863 
1864 
1865 
1866     multi_sim = orte_get_attribute(&jdata->attributes, ORTE_JOB_MULTI_DAEMON_SIM, NULL, OPAL_BOOL);
1867     if (orte_get_attribute(&daemons->attributes, ORTE_JOB_NO_VM, NULL, OPAL_BOOL) || multi_sim) {
1868         OBJ_CONSTRUCT(&nodes, opal_list_t);
1869         
1870 
1871 
1872         for (i=1; i < orte_node_pool->size; i++) {
1873             if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
1874                 continue;
1875             }
1876             
1877             if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
1878                 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
1879                                      "NODE %s IS MARKED NO_USE", node->name));
1880                 
1881                 node->state = ORTE_NODE_STATE_UP;
1882                 continue;
1883             }
1884             if (ORTE_NODE_STATE_DOWN == node->state) {
1885                 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
1886                                      "NODE %s IS MARKED DOWN", node->name));
1887                 continue;
1888             }
1889             if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
1890                 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
1891                                      "NODE %s IS MARKED NO_INCLUDE", node->name));
1892                 
1893                 continue;
1894             }
1895             if (0 < node->num_procs || multi_sim) {
1896                 
1897 
1898 
1899                 OBJ_RETAIN(node);
1900                 opal_list_append(&nodes, &node->super);
1901             }
1902         }
1903         if (multi_sim) {
1904             goto process;
1905         }
1906         
1907         if (0 == opal_list_get_size(&nodes)) {
1908             
1909             node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
1910             if (0 < node->num_procs) {
1911                 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1912                                      "%s plm:base:setup_vm only HNP in use",
1913                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1914                 OBJ_DESTRUCT(&nodes);
1915                 map->num_nodes = 1;
1916                 
1917                 daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
1918                 return ORTE_SUCCESS;
1919             }
1920             
1921 
1922 
1923             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
1924             return ORTE_ERR_FATAL;
1925         }
1926         goto process;
1927     }
1928 
1929     if (0 == map->num_nodes) {
1930         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1931                              "%s plm:base:setup_vm creating map",
1932                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1933         
1934 
1935 
1936 
1937 
1938         node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
1939         opal_pointer_array_add(map->nodes, (void*)node);
1940         ++(map->num_nodes);
1941         
1942         OBJ_RETAIN(node);
1943     }
1944 
1945     
1946 
1947 
1948     map->num_new_daemons = 0;
1949 
1950     
1951     OBJ_CONSTRUCT(&nodes, opal_list_t);
1952 
1953     
1954 
1955 
1956 
1957 
1958     if (!orte_managed_allocation) {
1959         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1960                              "%s setup:vm: working unmanaged allocation",
1961                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1962         default_hostfile_used = false;
1963         OBJ_CONSTRUCT(&tnodes, opal_list_t);
1964         for (i=0; i < jdata->apps->size; i++) {
1965             if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
1966                 continue;
1967             }
1968             
1969 
1970 
1971             hosts = NULL;
1972             if (!orte_soft_locations &&
1973                 orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
1974                 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1975                                      "%s using dash_host",
1976                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1977                 if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&tnodes, hosts, false))) {
1978                     ORTE_ERROR_LOG(rc);
1979                     free(hosts);
1980                     return rc;
1981                 }
1982                 free(hosts);
1983             } else if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
1984                 
1985                 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1986                                      "%s using hostfile %s",
1987                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts));
1988                 if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&tnodes, hosts))) {
1989                     ORTE_ERROR_LOG(rc);
1990                     free(hosts);
1991                     return rc;
1992                 }
1993                 free(hosts);
1994             } else if (NULL != orte_rankfile) {
1995                 
1996                 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1997                                      "%s using rankfile %s",
1998                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1999                                      orte_rankfile));
2000                 if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&tnodes,
2001                                                                        orte_rankfile))) {
2002                     ORTE_ERROR_LOG(rc);
2003                     return rc;
2004                 }
2005             } else if (NULL != orte_default_hostfile) {
2006                 if (!default_hostfile_used) {
2007                     
2008                     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2009                                          "%s using default hostfile %s",
2010                                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2011                                          orte_default_hostfile));
2012                     if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&tnodes,
2013                                                                            orte_default_hostfile))) {
2014                         ORTE_ERROR_LOG(rc);
2015                         return rc;
2016                     }
2017                     
2018                     default_hostfile_used = true;
2019                 }
2020             }
2021         }
2022         
2023 
2024 
2025 
2026         while (NULL != (item = opal_list_remove_first(&tnodes))) {
2027             nptr = (orte_node_t*)item;
2028             OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2029                                  "%s checking node %s",
2030                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2031                                  nptr->name));
2032             for (i=0; i < orte_node_pool->size; i++) {
2033                 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
2034                     continue;
2035                 }
2036                 if (0 != strcmp(node->name, nptr->name)) {
2037                     continue;
2038                 }
2039                 
2040                 
2041                 if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
2042                     OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
2043                                          "NODE %s IS MARKED NO_USE", node->name));
2044                     
2045                     node->state = ORTE_NODE_STATE_UP;
2046                     break;
2047                 }
2048                 if (ORTE_NODE_STATE_DOWN == node->state) {
2049                     OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
2050                                          "NODE %s IS MARKED DOWN", node->name));
2051                     break;
2052                 }
2053                 if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
2054                     OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
2055                                          "NODE %s IS MARKED NO_INCLUDE", node->name));
2056                     break;
2057                 }
2058                 
2059                 if (0 == node->index) {
2060                     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2061                                          "%s ignoring myself",
2062                                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
2063                     break;
2064                 }
2065                 
2066                 OBJ_RETAIN(node);
2067                 opal_list_append(&nodes, &node->super);
2068             }
2069             OBJ_RELEASE(nptr);
2070         }
2071         OPAL_LIST_DESTRUCT(&tnodes);
2072         
2073 
2074 
2075 
2076         if (0 == opal_list_get_size(&nodes)) {
2077             OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2078                                  "%s plm:base:setup_vm only HNP in allocation",
2079                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
2080             OBJ_DESTRUCT(&nodes);
2081             
2082             daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
2083             ORTE_FLAG_UNSET(daemons, ORTE_JOB_FLAG_UPDATED);
2084             return ORTE_SUCCESS;
2085         }
2086         
2087         goto process;
2088     }
2089 
2090     
2091     for (i=1; i < orte_node_pool->size; i++) {
2092         if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
2093             
2094             if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
2095                 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
2096                                      "NODE %s IS MARKED NO_USE", node->name));
2097                 
2098                 node->state = ORTE_NODE_STATE_UP;
2099                 continue;
2100             }
2101             if (ORTE_NODE_STATE_DOWN == node->state) {
2102                 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
2103                                      "NODE %s IS MARKED DOWN", node->name));
2104                 continue;
2105             }
2106             if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
2107                 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
2108                                      "NODE %s IS MARKED NO_INCLUDE", node->name));
2109                 
2110                 continue;
2111             }
2112             
2113 
2114 
2115             OBJ_RETAIN(node);
2116             opal_list_append(&nodes, &node->super);
2117             
2118 
2119 
2120             ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
2121         }
2122     }
2123 
2124     
2125 
2126 
2127 
2128     if (0 == opal_list_get_size(&nodes)) {
2129         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2130                              "%s plm:base:setup_vm only HNP in allocation",
2131                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
2132         
2133         OBJ_DESTRUCT(&nodes);
2134         
2135         daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
2136         ORTE_FLAG_UNSET(daemons, ORTE_JOB_FLAG_UPDATED);
2137         return ORTE_SUCCESS;
2138     }
2139 
2140     
2141 
2142 
2143 
2144 
2145 
2146     if (orte_hnp_is_allocated) {
2147         node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
2148         OBJ_RETAIN(node);
2149         opal_list_prepend(&nodes, &node->super);
2150     }
2151     for (i=0; i < jdata->apps->size; i++) {
2152         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
2153             continue;
2154         }
2155         if (ORTE_SUCCESS != (rc = orte_rmaps_base_filter_nodes(app, &nodes, false)) &&
2156             rc != ORTE_ERR_TAKE_NEXT_OPTION) {
2157             ORTE_ERROR_LOG(rc);
2158             return rc;
2159         }
2160         if (ORTE_SUCCESS == rc) {
2161             
2162             one_filter = true;
2163         }
2164     }
2165 
2166     if (one_filter) {
2167         
2168 
2169 
2170         item = opal_list_get_first(&nodes);
2171         while (item != opal_list_get_end(&nodes)) {
2172             next = opal_list_get_next(item);
2173             node = (orte_node_t*)item;
2174             if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
2175                 opal_list_remove_item(&nodes, item);
2176                 OBJ_RELEASE(item);
2177             } else {
2178                 
2179 
2180 
2181                 ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
2182             }
2183             item = next;
2184         }
2185     }
2186 
2187     
2188     if (0 < opal_list_get_size(&nodes)) {
2189         item = opal_list_get_first(&nodes);
2190         node = (orte_node_t*)item;
2191         if (0 == node->index) {
2192             opal_list_remove_item(&nodes, item);
2193             OBJ_RELEASE(item);
2194         }
2195     }
2196 
2197     
2198 
2199 
2200 
2201     if (0 == opal_list_get_size(&nodes)) {
2202         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2203                              "%s plm:base:setup_vm only HNP left",
2204                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
2205         OBJ_DESTRUCT(&nodes);
2206         
2207         daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
2208         ORTE_FLAG_UNSET(daemons, ORTE_JOB_FLAG_UPDATED);
2209         return ORTE_SUCCESS;
2210     }
2211 
2212  process:
2213     
2214 
2215 
2216 
2217 
2218 
2219     if (orte_hnp_is_allocated) {
2220         num_nodes = 1;
2221     } else {
2222         num_nodes = 0;
2223     }
2224     while (NULL != (item = opal_list_remove_first(&nodes))) {
2225         
2226         if (0 < orte_max_vm_size && num_nodes == orte_max_vm_size) {
2227             
2228             OBJ_RELEASE(item);
2229             break;
2230         }
2231         node = (orte_node_t*)item;
2232         
2233         if (NULL != node->daemon) {
2234             num_nodes++;
2235             
2236             OBJ_RELEASE(item);
2237             continue;
2238         }
2239         
2240 
2241 
2242 
2243         opal_pointer_array_add(map->nodes, (void*)node);
2244         ++(map->num_nodes);
2245         num_nodes++;
2246         
2247         proc = OBJ_NEW(orte_proc_t);
2248         if (NULL == proc) {
2249             ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
2250             return ORTE_ERR_OUT_OF_RESOURCE;
2251         }
2252         proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
2253         if (ORTE_VPID_MAX-1 <= daemons->num_procs) {
2254             
2255             orte_show_help("help-orte-rmaps-base.txt", "out-of-vpids", true);
2256             OBJ_RELEASE(proc);
2257             return ORTE_ERR_OUT_OF_RESOURCE;
2258         }
2259         proc->name.vpid = daemons->num_procs;  
2260         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2261                              "%s plm:base:setup_vm add new daemon %s",
2262                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2263                              ORTE_NAME_PRINT(&proc->name)));
2264         
2265         if (0 > (rc = opal_pointer_array_set_item(daemons->procs, proc->name.vpid, (void*)proc))) {
2266             ORTE_ERROR_LOG(rc);
2267             return rc;
2268         }
2269         ++daemons->num_procs;
2270         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2271                              "%s plm:base:setup_vm assigning new daemon %s to node %s",
2272                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2273                              ORTE_NAME_PRINT(&proc->name),
2274                              node->name));
2275         
2276         node->daemon = proc;
2277         OBJ_RETAIN(proc);  
2278         
2279         proc->node = node;
2280         OBJ_RETAIN(node);
2281         if (orte_plm_globals.daemon_nodes_assigned_at_launch) {
2282             ORTE_FLAG_SET(node, ORTE_NODE_FLAG_LOC_VERIFIED);
2283         } else {
2284             ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_LOC_VERIFIED);
2285         }
2286         
2287         ++map->num_new_daemons;
2288         
2289         if (ORTE_VPID_INVALID == map->daemon_vpid_start) {
2290             map->daemon_vpid_start = proc->name.vpid;
2291         }
2292         
2293         for (i=0; i < node->procs->size; i++) {
2294             if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
2295                 pptr->parent = proc->name.vpid;
2296             }
2297         }
2298     }
2299 
2300     if (orte_process_info.num_procs != daemons->num_procs) {
2301         
2302 
2303 
2304 
2305 
2306 
2307 
2308         orte_process_info.num_procs = daemons->num_procs;
2309 
2310         if (orte_process_info.max_procs < orte_process_info.num_procs) {
2311             orte_process_info.max_procs = orte_process_info.num_procs;
2312         }
2313 
2314         
2315 
2316         orte_routed.update_routing_plan();
2317     }
2318 
2319     
2320     ORTE_FLAG_SET(daemons, ORTE_JOB_FLAG_UPDATED);
2321 
2322     
2323 
2324     if (0 < map->num_new_daemons) {
2325         if (ORTE_SUCCESS != (rc = orte_set_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS,
2326                                                      true, NULL, OPAL_BOOL))) {
2327             ORTE_ERROR_LOG(rc);
2328             return rc;
2329         }
2330     }
2331 
2332     return ORTE_SUCCESS;
2333 }