This source file includes the following definitions.
- orte_daemon
- pipe_closed
- shutdown_callback
- rollup
- report_orted
- node_regex_report
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 
  28 
  29 #include "orte_config.h"
  30 #include "orte/constants.h"
  31 
  32 #include <string.h>
  33 
  34 #include <stdio.h>
  35 #include <ctype.h>
  36 #ifdef HAVE_UNISTD_H
  37 #include <unistd.h>
  38 #endif
  39 #ifdef HAVE_NETDB_H
  40 #include <netdb.h>
  41 #endif
  42 #ifdef HAVE_SYS_PARAM_H
  43 #include <sys/param.h>
  44 #endif
  45 #include <fcntl.h>
  46 #include <errno.h>
  47 #include <signal.h>
  48 #ifdef HAVE_SYS_TIME_H
  49 #include <sys/time.h>
  50 #endif  
  51 
  52 
  53 #include "opal/mca/event/event.h"
  54 #include "opal/mca/base/base.h"
  55 #include "opal/util/output.h"
  56 #include "opal/util/cmd_line.h"
  57 #include "opal/util/if.h"
  58 #include "opal/util/net.h"
  59 #include "opal/util/opal_environ.h"
  60 #include "opal/util/os_path.h"
  61 #include "opal/util/printf.h"
  62 #include "opal/util/argv.h"
  63 #include "opal/util/fd.h"
  64 #include "opal/runtime/opal.h"
  65 #include "opal/mca/base/mca_base_var.h"
  66 #include "opal/util/daemon_init.h"
  67 #include "opal/dss/dss.h"
  68 #include "opal/mca/hwloc/hwloc-internal.h"
  69 #include "opal/mca/pmix/pmix.h"
  70 #include "opal/mca/compress/compress.h"
  71 
  72 #include "orte/util/show_help.h"
  73 #include "orte/util/proc_info.h"
  74 #include "orte/util/session_dir.h"
  75 #include "orte/util/name_fns.h"
  76 #include "orte/util/nidmap.h"
  77 #include "orte/util/parse_options.h"
  78 #include "orte/mca/rml/base/rml_contact.h"
  79 #include "orte/util/pre_condition_transports.h"
  80 #include "orte/util/threads.h"
  81 
  82 #include "orte/mca/errmgr/errmgr.h"
  83 #include "orte/mca/ess/ess.h"
  84 #include "orte/mca/grpcomm/grpcomm.h"
  85 #include "orte/mca/grpcomm/base/base.h"
  86 #include "orte/mca/rml/rml.h"
  87 #include "orte/mca/rml/rml_types.h"
  88 #include "orte/mca/odls/odls.h"
  89 #include "orte/mca/odls/base/odls_private.h"
  90 #include "orte/mca/oob/base/base.h"
  91 #include "orte/mca/plm/plm.h"
  92 #include "orte/mca/ras/ras.h"
  93 #include "orte/mca/routed/routed.h"
  94 #include "orte/mca/rmaps/rmaps_types.h"
  95 #include "orte/mca/state/state.h"
  96 
  97 
  98 
  99 
 100 #include "orte/mca/plm/base/plm_private.h"
 101 
 102 #include "orte/runtime/runtime.h"
 103 #include "orte/runtime/orte_globals.h"
 104 #include "orte/runtime/orte_locks.h"
 105 #include "orte/runtime/orte_quit.h"
 106 #include "orte/runtime/orte_wait.h"
 107 
 108 #include "orte/orted/orted.h"
 109 #include "orte/orted/pmix/pmix_server.h"
 110 
 111 
 112 
 113 
 114 static opal_event_t *pipe_handler;
 115 static void shutdown_callback(int fd, short flags, void *arg);
 116 static void pipe_closed(int fd, short flags, void *arg);
 117 static void rollup(int status, orte_process_name_t* sender,
 118                    opal_buffer_t *buffer,
 119                    orte_rml_tag_t tag, void *cbdata);
 120 static void node_regex_report(int status, orte_process_name_t* sender,
 121                               opal_buffer_t *buffer,
 122                               orte_rml_tag_t tag, void *cbdata);
 123 static void report_orted(void);
 124 
 125 static opal_buffer_t *bucket, *mybucket = NULL;
 126 static int ncollected = 0;
 127 static bool node_regex_waiting = false;
 128 
 129 static char *orte_parent_uri = NULL;
 130 
 131 static struct {
 132     bool debug;
 133     bool help;
 134     bool set_sid;
 135     bool hnp;
 136     bool daemonize;
 137     char* name;
 138     char* vpid_start;
 139     char* num_procs;
 140     int uri_pipe;
 141     int singleton_died_pipe;
 142     bool abort;
 143     bool tree_spawn;
 144     bool test_suicide;
 145 } orted_globals;
 146 
 147 
 148 
 149 
 150 opal_cmd_line_init_t orte_cmd_line_opts[] = {
 151     
 152     { NULL, 'h', NULL, "help", 0,
 153       &orted_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
 154       "This help message" },
 155 
 156     { "orte_daemon_spin", 's', NULL, "spin", 0,
 157       &orted_spin_flag, OPAL_CMD_LINE_TYPE_BOOL,
 158       "Have the orted spin until we can connect a debugger to it" },
 159 
 160     { NULL, '\0', NULL, "test-suicide", 1,
 161       &orted_globals.test_suicide, OPAL_CMD_LINE_TYPE_BOOL,
 162       "Suicide instead of clean abort after delay" },
 163 
 164     { "orte_debug", 'd', NULL, "debug", 0,
 165       NULL, OPAL_CMD_LINE_TYPE_BOOL,
 166       "Debug the OpenRTE" },
 167 
 168     { "orte_daemonize", '\0', NULL, "daemonize", 0,
 169       &orted_globals.daemonize, OPAL_CMD_LINE_TYPE_BOOL,
 170       "Daemonize the orted into the background" },
 171 
 172     { "orte_debug_daemons", '\0', NULL, "debug-daemons", 0,
 173       &orted_globals.debug, OPAL_CMD_LINE_TYPE_BOOL,
 174       "Enable debugging of OpenRTE daemons" },
 175 
 176     { "orte_debug_daemons_file", '\0', NULL, "debug-daemons-file", 0,
 177       NULL, OPAL_CMD_LINE_TYPE_BOOL,
 178       "Enable debugging of OpenRTE daemons, storing output in files" },
 179 
 180     { NULL, '\0', NULL, "hnp", 0,
 181       &orted_globals.hnp, OPAL_CMD_LINE_TYPE_BOOL,
 182       "Direct the orted to act as the HNP"},
 183 
 184     { "orte_hnp_uri", '\0', NULL, "hnp-uri", 1,
 185       NULL, OPAL_CMD_LINE_TYPE_STRING,
 186       "URI for the HNP"},
 187 
 188     { "orte_parent_uri", '\0', NULL, "parent-uri", 1,
 189       NULL, OPAL_CMD_LINE_TYPE_STRING,
 190       "URI for the parent if tree launch is enabled."},
 191 
 192     { NULL, '\0', NULL, "set-sid", 0,
 193       &orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
 194       "Direct the orted to separate from the current session"},
 195 
 196     { NULL, '\0', "tree-spawn", "tree-spawn", 0,
 197       &orted_globals.tree_spawn, OPAL_CMD_LINE_TYPE_BOOL,
 198       "Tree-based spawn in progress" },
 199 
 200     { "tmpdir_base", '\0', NULL, "tmpdir", 1,
 201       NULL, OPAL_CMD_LINE_TYPE_STRING,
 202       "Set the root for the session directory tree" },
 203 
 204     { NULL, '\0', NULL, "report-uri", 1,
 205       &orted_globals.uri_pipe, OPAL_CMD_LINE_TYPE_INT,
 206       "Report this process' uri on indicated pipe"},
 207 
 208     { NULL, '\0', NULL, "singleton-died-pipe", 1,
 209       &orted_globals.singleton_died_pipe, OPAL_CMD_LINE_TYPE_INT,
 210       "Watch on indicated pipe for singleton termination"},
 211 
 212     { "orte_output_filename", '\0', "output-filename", "output-filename", 1,
 213       NULL, OPAL_CMD_LINE_TYPE_STRING,
 214       "Redirect output from application processes into filename.rank" },
 215 
 216     { "orte_xterm", '\0', "xterm", "xterm", 1,
 217       NULL, OPAL_CMD_LINE_TYPE_STRING,
 218       "Create a new xterm window and display output from the specified ranks there" },
 219 
 220     { "orte_report_bindings", '\0', "report-bindings", "report-bindings", 0,
 221       NULL, OPAL_CMD_LINE_TYPE_BOOL,
 222       "Whether to report process bindings to stderr" },
 223 
 224     
 225     { NULL, '\0', NULL, NULL, 0,
 226       NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
 227 };
 228 
 229 int orte_daemon(int argc, char *argv[])
 230 {
 231     int ret = 0;
 232     opal_cmd_line_t *cmd_line = NULL;
 233     int i;
 234     opal_buffer_t *buffer;
 235     char hostname[OPAL_MAXHOSTNAMELEN];
 236 #if OPAL_ENABLE_FT_CR == 1
 237     char *tmp_env_var = NULL;
 238 #endif
 239     opal_value_t val;
 240 
 241     
 242     memset(&orted_globals, 0, sizeof(orted_globals));
 243     
 244     orted_globals.singleton_died_pipe = -1;
 245     bucket = OBJ_NEW(opal_buffer_t);
 246 
 247     
 248     cmd_line = OBJ_NEW(opal_cmd_line_t);
 249     if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
 250         OBJ_RELEASE(cmd_line);
 251         exit(1);
 252     }
 253     mca_base_cmd_line_setup(cmd_line);
 254     if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, false,
 255                                                    argc, argv))) {
 256         char *args = NULL;
 257         args = opal_cmd_line_get_usage_msg(cmd_line);
 258         fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
 259         free(args);
 260         OBJ_RELEASE(cmd_line);
 261         return ret;
 262     }
 263 
 264     
 265 
 266 
 267 
 268     mca_base_cmd_line_process_args(cmd_line, &environ, &environ);
 269 
 270     
 271     
 272 
 273 
 274 
 275 
 276 
 277 
 278 
 279 
 280 
 281 
 282     if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
 283         fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
 284         exit(1);
 285     }
 286 
 287     
 288 
 289 
 290 
 291 
 292 
 293     orte_launch_environ = opal_argv_copy(environ);
 294 
 295     
 296     opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ);
 297     opal_unsetenv(OPAL_MCA_PREFIX"pmix", &orte_launch_environ);
 298 
 299     
 300 
 301 
 302     if (orted_globals.debug) {
 303         gethostname(hostname, sizeof(hostname));
 304         fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
 305     }
 306 
 307     
 308     if (orted_globals.help) {
 309         char *args = NULL;
 310         args = opal_cmd_line_get_usage_msg(cmd_line);
 311         orte_show_help("help-orted.txt", "orted:usage", false,
 312                        argv[0], args);
 313         free(args);
 314         return 1;
 315     }
 316 #if defined(HAVE_SETSID)
 317     
 318     if (orted_globals.set_sid) {
 319         setsid();
 320     }
 321 #endif
 322     
 323     i=0;
 324     while (orted_spin_flag) {
 325         i++;
 326         if (1000 < i) i=0;
 327     }
 328 
 329 #if OPAL_ENABLE_FT_CR == 1
 330     
 331     (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var);
 332     opal_setenv(tmp_env_var,
 333                 "1",
 334                 true, &environ);
 335     free(tmp_env_var);
 336 #endif
 337 
 338     
 339 
 340 
 341     if(!orte_debug_flag &&
 342        !orte_debug_daemons_flag &&
 343        orted_globals.daemonize) {
 344         opal_daemon_init(NULL);
 345     }
 346 
 347     
 348 
 349 
 350 
 351 
 352     if (orted_globals.hnp) {
 353         if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
 354             ORTE_ERROR_LOG(ret);
 355             return ret;
 356         }
 357     } else {
 358         if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
 359             ORTE_ERROR_LOG(ret);
 360             return ret;
 361         }
 362     }
 363 
 364     
 365 
 366 
 367     opal_finalize_util();
 368 
 369     
 370     if (NULL != orte_daemon_cores) {
 371         char **cores=NULL, tmp[128];
 372         hwloc_obj_t pu;
 373         hwloc_cpuset_t ours, res;
 374         int core;
 375 
 376         
 377 
 378 
 379         orte_util_parse_range_options(orte_daemon_cores, &cores);
 380         if (NULL != cores) {
 381             ours = hwloc_bitmap_alloc();
 382             hwloc_bitmap_zero(ours);
 383             res = hwloc_bitmap_alloc();
 384             for (i=0; NULL != cores[i]; i++) {
 385                 core = strtoul(cores[i], NULL, 10);
 386                 if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
 387                     
 388 
 389 
 390                     orte_show_help_finalize();
 391                     
 392                     orte_show_help("help-orted.txt", "orted:cannot-bind",
 393                                    true, orte_process_info.nodename,
 394                                    orte_daemon_cores);
 395                     ret = ORTE_ERR_NOT_SUPPORTED;
 396                     hwloc_bitmap_free(ours);
 397                     hwloc_bitmap_free(res);
 398                     goto DONE;
 399                 }
 400                 hwloc_bitmap_or(res, ours, pu->cpuset);
 401                 hwloc_bitmap_copy(ours, res);
 402             }
 403             
 404             if (!hwloc_bitmap_iszero(ours)) {
 405                 (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
 406                 if (opal_hwloc_report_bindings) {
 407                     opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
 408                     opal_output(0, "Daemon %s is bound to cores %s",
 409                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
 410                 }
 411             }
 412             
 413             hwloc_bitmap_free(ours);
 414             hwloc_bitmap_free(res);
 415             opal_argv_free(cores);
 416         }
 417     }
 418 
 419     if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
 420         orted_globals.abort=false;
 421         
 422 
 423 
 424 
 425         if (0 > orted_debug_failure) {
 426             orted_debug_failure = -1*orted_debug_failure;
 427             orted_globals.abort = true;
 428         }
 429         
 430         if ((int)ORTE_PROC_MY_NAME->vpid == orted_debug_failure) {
 431             
 432 
 433 
 434             if (0 < orted_debug_failure_delay) {
 435                 ORTE_TIMER_EVENT(orted_debug_failure_delay, 0, shutdown_callback, ORTE_SYS_PRI);
 436 
 437             } else {
 438                 opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 439                             orted_globals.abort ? "abort" : "abnormal termination");
 440 
 441                 
 442 
 443 
 444 
 445                 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
 446 
 447                 
 448                 if (orted_globals.abort) {
 449                     abort();
 450                 }
 451 
 452                 
 453                 ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
 454                 goto DONE;
 455             }
 456         }
 457     }
 458 
 459     
 460 
 461 
 462     orte_oob_base_get_addr(&orte_process_info.my_daemon_uri);
 463     if (NULL == orte_process_info.my_daemon_uri) {
 464         
 465         ret = ORTE_ERROR;
 466         goto DONE;
 467     }
 468     ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
 469     ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;
 470     OBJ_CONSTRUCT(&val, opal_value_t);
 471     val.key = OPAL_PMIX_PROC_URI;
 472     val.type = OPAL_STRING;
 473     val.data.string = orte_process_info.my_daemon_uri;
 474     if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &val))) {
 475         ORTE_ERROR_LOG(ret);
 476         val.key = NULL;
 477         val.data.string = NULL;
 478         OBJ_DESTRUCT(&val);
 479         goto DONE;
 480     }
 481     val.key = NULL;
 482     val.data.string = NULL;
 483     OBJ_DESTRUCT(&val);
 484 
 485     
 486     if (ORTE_PROC_IS_HNP) {
 487         orte_process_info.my_hnp_uri = strdup(orte_process_info.my_daemon_uri);
 488         ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
 489         ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
 490     }
 491 
 492     
 493     orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
 494                             ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);
 495 
 496     
 497 
 498 
 499     if (orte_debug_daemons_flag) {
 500         fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
 501                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
 502                 orte_process_info.nodename);
 503     }
 504 
 505     
 506 
 507 
 508 
 509 
 510 
 511 
 512 
 513 
 514 
 515 
 516 
 517 
 518 
 519 
 520 
 521 
 522 
 523     opal_progress_set_yield_when_idle(false);
 524 
 525     
 526 
 527 
 528 
 529 
 530 
 531 
 532 
 533     opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);
 534 
 535     
 536     if (orted_globals.uri_pipe > 0) {
 537         orte_job_t *jdata;
 538         orte_proc_t *proc;
 539         orte_node_t *node;
 540         orte_app_context_t *app;
 541         char *tmp, *nptr, *sysinfo;
 542         char **singenv=NULL, *string_key, *env_str;
 543 
 544         
 545         jdata = OBJ_NEW(orte_job_t);
 546         
 547         opal_argv_append_nosize(&jdata->personality, "ompi");
 548         orte_plm_base_create_jobid(jdata);
 549         opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
 550 
 551         
 552 
 553 
 554 
 555 
 556         jdata->map = OBJ_NEW(orte_job_map_t);
 557 
 558         
 559         app = OBJ_NEW(orte_app_context_t);
 560         app->app = strdup("singleton");
 561         app->num_procs = 1;
 562         opal_pointer_array_add(jdata->apps, app);
 563         jdata->num_apps = 1;
 564 
 565         
 566 
 567 
 568 
 569 
 570         proc = OBJ_NEW(orte_proc_t);
 571         proc->name.jobid = jdata->jobid;
 572         proc->name.vpid = 0;
 573         proc->parent = 0;
 574         ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
 575         proc->state = ORTE_PROC_STATE_RUNNING;
 576         proc->app_idx = 0;
 577         
 578         node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
 579         proc->node = node;
 580         OBJ_RETAIN(node);  
 581         opal_pointer_array_add(jdata->procs, proc);
 582         jdata->num_procs = 1;
 583         
 584         OBJ_RETAIN(node);
 585         opal_pointer_array_add(jdata->map->nodes, node);
 586         jdata->map->num_nodes++;
 587         
 588         OBJ_RETAIN(proc);
 589         opal_pointer_array_add(node->procs, proc);
 590         node->num_procs++;
 591         
 592         OBJ_RETAIN(proc);
 593         opal_pointer_array_add(orte_local_children, proc);
 594         jdata->num_local_procs = 1;
 595         
 596         proc->local_rank = 0;
 597         proc->node_rank = 0;
 598         proc->app_rank = 0;
 599         proc->state = ORTE_PROC_STATE_RUNNING;
 600         proc->app_idx = 0;
 601         ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);
 602 
 603         
 604         orte_pre_condition_transports(jdata, NULL);
 605 
 606         
 607         if (ORTE_SUCCESS != (ret = orte_pmix_server_register_nspace(jdata, false))) {
 608           ORTE_ERROR_LOG(ret);
 609           goto DONE;
 610         }
 611         
 612         if (OPAL_SUCCESS != (ret = opal_pmix.server_setup_fork(&proc->name, &singenv))) {
 613             ORTE_ERROR_LOG(ret);
 614             goto DONE;
 615         }
 616 
 617         
 618         if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&string_key, OPAL_STRING) || NULL == string_key) {
 619             ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 620             goto DONE;
 621         }
 622         opal_asprintf(&env_str, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
 623         opal_argv_append_nosize(&singenv, env_str);
 624         free(env_str);
 625 
 626         nptr = opal_argv_join(singenv, '*');
 627         opal_argv_free(singenv);
 628 
 629         
 630         orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
 631         opal_asprintf(&tmp, "%s[%s]%s", orte_process_info.my_daemon_uri, sysinfo, nptr);
 632         free(sysinfo);
 633         free(nptr);
 634 
 635         
 636         if (OPAL_SUCCESS != (ret = opal_fd_write(orted_globals.uri_pipe, strlen(tmp)+1, tmp))) { ; 
 637             ORTE_ERROR_LOG(ret);
 638             goto DONE;
 639         }
 640 
 641         
 642         free(tmp);
 643         close(orted_globals.uri_pipe);
 644 
 645         
 646 
 647 
 648 
 649 
 650         for (i=0; NULL != environ[i]; i++) {
 651             if (0 == strncmp(environ[i], OPAL_MCA_PREFIX, 9)) {
 652                 
 653                 tmp = strdup(environ[i]);
 654                 
 655                 nptr = strchr(tmp, '=');
 656                 *nptr = '\0';
 657                 nptr++;
 658                 
 659                 opal_argv_append_nosize(&orted_cmd_line, "-"OPAL_MCA_CMD_LINE_ID);
 660                 opal_argv_append_nosize(&orted_cmd_line, &tmp[9]);
 661                 opal_argv_append_nosize(&orted_cmd_line, nptr);
 662                 free(tmp);
 663             }
 664         }
 665     }
 666 
 667     
 668     if (orted_globals.singleton_died_pipe > 0) {
 669         
 670         pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
 671         opal_event_set(orte_event_base, pipe_handler,
 672                        orted_globals.singleton_died_pipe,
 673                        OPAL_EV_READ,
 674                        pipe_closed,
 675                        pipe_handler);
 676         opal_event_add(pipe_handler, NULL);
 677     }
 678 
 679     
 680 
 681 
 682     orte_parent_uri = NULL;
 683     (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
 684                                   "URI for the parent if tree launch is enabled.",
 685                                   MCA_BASE_VAR_TYPE_STRING, NULL, 0,
 686                                   MCA_BASE_VAR_FLAG_INTERNAL,
 687                                   OPAL_INFO_LVL_9,
 688                                   MCA_BASE_VAR_SCOPE_CONSTANT,
 689                                   &orte_parent_uri);
 690     if (NULL != orte_parent_uri) {
 691         
 692         ret = orte_rml_base_parse_uris(orte_parent_uri, ORTE_PROC_MY_PARENT, NULL);
 693         if (ORTE_SUCCESS != ret) {
 694             ORTE_ERROR_LOG(ret);
 695             goto DONE;
 696         }
 697         OBJ_CONSTRUCT(&val, opal_value_t);
 698         val.key = OPAL_PMIX_PROC_URI;
 699         val.type = OPAL_STRING;
 700         val.data.string = orte_parent_uri;
 701         if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_PARENT, &val))) {
 702             ORTE_ERROR_LOG(ret);
 703             val.key = NULL;
 704             val.data.string = NULL;
 705             OBJ_DESTRUCT(&val);
 706             goto DONE;
 707         }
 708         val.key = NULL;
 709         val.data.string = NULL;
 710         OBJ_DESTRUCT(&val);
 711 
 712         
 713 
 714 
 715         if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_HNP, ORTE_PROC_MY_PARENT))) {
 716             ORTE_ERROR_LOG(ret);
 717             goto DONE;
 718         }
 719         
 720         if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_PARENT, ORTE_PROC_MY_PARENT))) {
 721             ORTE_ERROR_LOG(ret);
 722             goto DONE;
 723         }
 724         
 725 
 726 
 727         if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(ORTE_PROC_MY_PARENT))) {
 728             ORTE_ERROR_LOG(ret);
 729             goto DONE;
 730         }
 731     }
 732 
 733     
 734 
 735 
 736 
 737     if (!ORTE_PROC_IS_HNP) {
 738         orte_process_name_t target;
 739 
 740         
 741         orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK,
 742                                 ORTE_RML_PERSISTENT, rollup, NULL);
 743 
 744         
 745         target.jobid = ORTE_PROC_MY_NAME->jobid;
 746         if (NULL != orte_parent_uri) {
 747             
 748             target.vpid = ORTE_PROC_MY_NAME->vpid;
 749             
 750 
 751 
 752 
 753             buffer = OBJ_NEW(opal_buffer_t);  
 754             node_regex_waiting = true;
 755             orte_rml.recv_buffer_nb(ORTE_PROC_MY_PARENT, ORTE_RML_TAG_NODE_REGEX_REPORT,
 756                                     ORTE_RML_PERSISTENT, node_regex_report, &node_regex_waiting);
 757             if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_PARENT, buffer,
 758                                                    ORTE_RML_TAG_WARMUP_CONNECTION,
 759                                                    orte_rml_send_callback, NULL))) {
 760                 ORTE_ERROR_LOG(ret);
 761                 OBJ_RELEASE(buffer);
 762                 goto DONE;
 763             }
 764         } else {
 765             target.vpid = 0;
 766         }
 767 
 768         
 769 
 770 
 771 
 772 
 773 
 774 
 775         buffer = OBJ_NEW(opal_buffer_t);
 776         
 777         if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
 778             ORTE_ERROR_LOG(ret);
 779             OBJ_RELEASE(buffer);
 780             goto DONE;
 781         }
 782 
 783         
 784         {
 785             opal_value_t *vptr = NULL, *kv;
 786             opal_list_t *modex;
 787             int32_t flag;
 788 
 789             if (opal_pmix.legacy_get()) {
 790                 if (OPAL_SUCCESS != (ret = opal_pmix.get(ORTE_PROC_MY_NAME, OPAL_PMIX_PROC_URI, NULL, &vptr)) || NULL == vptr) {
 791                     
 792                     flag = 0;
 793                     if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
 794                         ORTE_ERROR_LOG(ret);
 795                         OBJ_RELEASE(buffer);
 796                         goto DONE;
 797                     }
 798                 } else {
 799                     flag = 1;
 800                     if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
 801                         ORTE_ERROR_LOG(ret);
 802                         OBJ_RELEASE(buffer);
 803                         goto DONE;
 804                     }
 805                     if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &vptr, 1, OPAL_VALUE))) {
 806                         ORTE_ERROR_LOG(ret);
 807                         OBJ_RELEASE(buffer);
 808                         goto DONE;
 809                     }
 810                     OBJ_RELEASE(vptr);
 811                 }
 812             } else {
 813                 if (OPAL_SUCCESS != (ret = opal_pmix.get(ORTE_PROC_MY_NAME, NULL, NULL, &vptr)) || NULL == vptr) {
 814                     
 815                     flag = 0;
 816                     if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
 817                         ORTE_ERROR_LOG(ret);
 818                         OBJ_RELEASE(buffer);
 819                         goto DONE;
 820                     }
 821                 } else {
 822                     
 823                     if (OPAL_PTR == vptr->type) {
 824                         modex = (opal_list_t*)vptr->data.ptr;
 825                         flag = (int32_t)opal_list_get_size(modex);
 826                         if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
 827                             ORTE_ERROR_LOG(ret);
 828                             OBJ_RELEASE(buffer);
 829                             goto DONE;
 830                         }
 831                         OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
 832                             if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &kv, 1, OPAL_VALUE))) {
 833                                 ORTE_ERROR_LOG(ret);
 834                                 OBJ_RELEASE(buffer);
 835                                 goto DONE;
 836                             }
 837                         }
 838                         OPAL_LIST_RELEASE(modex);
 839                     } else {
 840                         
 841                         flag = 1;
 842                         if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
 843                             ORTE_ERROR_LOG(ret);
 844                             OBJ_RELEASE(buffer);
 845                             goto DONE;
 846                         }
 847                         if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &vptr, 1, OPAL_VALUE))) {
 848                             ORTE_ERROR_LOG(ret);
 849                             OBJ_RELEASE(buffer);
 850                             goto DONE;
 851                         }
 852                         OBJ_RELEASE(vptr);
 853                     }
 854                 }
 855             }
 856         }
 857 
 858         
 859         opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING);
 860 
 861         
 862         if (orte_retain_aliases) {
 863             char **aliases=NULL;
 864             uint8_t naliases, ni;
 865             char hostname[OPAL_MAXHOSTNAMELEN];
 866 
 867             
 868 
 869 
 870             gethostname(hostname, sizeof(hostname));
 871             if (strlen(orte_process_info.nodename) < strlen(hostname)) {
 872                 opal_argv_append_nosize(&aliases, hostname);
 873             }
 874             opal_ifgetaliases(&aliases);
 875             naliases = opal_argv_count(aliases);
 876             if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &naliases, 1, OPAL_UINT8))) {
 877                 ORTE_ERROR_LOG(ret);
 878                 OBJ_RELEASE(buffer);
 879                 opal_argv_free(aliases);
 880                 goto DONE;
 881             }
 882             for (ni=0; ni < naliases; ni++) {
 883                 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &aliases[ni], 1, OPAL_STRING))) {
 884                     ORTE_ERROR_LOG(ret);
 885                     OBJ_RELEASE(buffer);
 886                     opal_argv_free(aliases);
 887                     goto DONE;
 888                 }
 889             }
 890             opal_argv_free(aliases);
 891         }
 892 
 893         
 894 
 895         if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &orte_topo_signature, 1, OPAL_STRING))) {
 896             ORTE_ERROR_LOG(ret);
 897         }
 898 
 899         
 900 
 901         if (1 == ORTE_PROC_MY_NAME->vpid) {
 902             opal_buffer_t data;
 903             int8_t flag;
 904             uint8_t *cmpdata;
 905             size_t cmplen;
 906 
 907             
 908             OBJ_CONSTRUCT(&data, opal_buffer_t);
 909 
 910             if (ORTE_SUCCESS != (ret = opal_dss.pack(&data, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
 911                 ORTE_ERROR_LOG(ret);
 912             }
 913             if (opal_compress.compress_block((uint8_t*)data.base_ptr, data.bytes_used,
 914                                  &cmpdata, &cmplen)) {
 915                 
 916                 flag = 1;
 917                 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT8))) {
 918                     ORTE_ERROR_LOG(ret);
 919                     free(cmpdata);
 920                     OBJ_DESTRUCT(&data);
 921                 }
 922                 
 923                 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &cmplen, 1, OPAL_SIZE))) {
 924                     ORTE_ERROR_LOG(ret);
 925                     free(cmpdata);
 926                     OBJ_DESTRUCT(&data);
 927                 }
 928                 
 929                 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &data.bytes_used, 1, OPAL_SIZE))) {
 930                     ORTE_ERROR_LOG(ret);
 931                     free(cmpdata);
 932                     OBJ_DESTRUCT(&data);
 933                 }
 934                 
 935                 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, cmpdata, cmplen, OPAL_UINT8))) {
 936                     ORTE_ERROR_LOG(ret);
 937                     free(cmpdata);
 938                     OBJ_DESTRUCT(&data);
 939                 }
 940                 OBJ_DESTRUCT(&data);
 941                 free(cmpdata);
 942             } else {
 943                 
 944                 flag = 0;
 945                 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT8))) {
 946                     ORTE_ERROR_LOG(ret);
 947                     OBJ_DESTRUCT(&data);
 948                     free(cmpdata);
 949                 }
 950                 
 951                 opal_dss.copy_payload(buffer, &data);
 952                 OBJ_DESTRUCT(&data);
 953             }
 954         }
 955 
 956         
 957         if (0 > (ret = orte_rml.send_buffer_nb(&target, buffer,
 958                                                ORTE_RML_TAG_ORTED_CALLBACK,
 959                                                orte_rml_send_callback, NULL))) {
 960             ORTE_ERROR_LOG(ret);
 961             OBJ_RELEASE(buffer);
 962             goto DONE;
 963         }
 964     }
 965 
 966     
 967 
 968 
 969 
 970     if (orted_globals.tree_spawn) {
 971         int j, k;
 972         bool ignore;
 973         char *no_keep[] = {
 974             "orte_hnp_uri",
 975             "orte_ess_jobid",
 976             "orte_ess_vpid",
 977             "orte_ess_num_procs",
 978             "orte_parent_uri",
 979             "mca_base_env_list",
 980             NULL
 981         };
 982         for (i=0; i < argc; i++) {
 983             if (0 == strcmp("-"OPAL_MCA_CMD_LINE_ID,  argv[i]) ||
 984                 0 == strcmp("--"OPAL_MCA_CMD_LINE_ID, argv[i]) ) {
 985                 ignore = false;
 986                 
 987                 for (k=0; NULL != no_keep[k]; k++) {
 988                     if (0 == strcmp(no_keep[k], argv[i+1])) {
 989                         ignore = true;
 990                         break;
 991                     }
 992                 }
 993                 if (!ignore) {
 994                     
 995 
 996 
 997                     if (NULL != orted_cmd_line) {
 998                         for (j=0; NULL != orted_cmd_line[j]; j++) {
 999                             if (0 == strcmp(argv[i+1], orted_cmd_line[j])) {
1000                                 
1001                                 ignore = true;
1002                                 break;
1003                             }
1004                         }
1005                     }
1006                     if (!ignore) {
1007                         opal_argv_append_nosize(&orted_cmd_line, argv[i]);
1008                         opal_argv_append_nosize(&orted_cmd_line, argv[i+1]);
1009                         opal_argv_append_nosize(&orted_cmd_line, argv[i+2]);
1010                     }
1011                 }
1012                 i += 2;
1013             }
1014         }
1015     }
1016 
1017     if (orte_debug_daemons_flag) {
1018         opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
1019     }
1020     ret = ORTE_SUCCESS;
1021 
1022     
1023     while (orte_event_base_active) {
1024         opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
1025     }
1026     ORTE_ACQUIRE_OBJECT(orte_event_base_active);
1027 
1028     
1029     orte_odls.kill_local_procs(NULL);
1030 
1031  DONE:
1032     
1033     ORTE_UPDATE_EXIT_STATUS(ret);
1034 
1035     
1036     orte_finalize();
1037     opal_finalize_util();
1038 
1039     orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
1040     
1041     orte_proc_info_finalize();
1042 
1043     if (orte_debug_flag) {
1044         fprintf(stderr, "exiting with status %d\n", orte_exit_status);
1045     }
1046     exit(orte_exit_status);
1047 }
1048 
1049 static void pipe_closed(int fd, short flags, void *arg)
1050 {
1051     opal_event_t *ev = (opal_event_t*)arg;
1052 
1053     
1054     opal_event_free(ev);
1055     ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
1056 }
1057 
1058 static void shutdown_callback(int fd, short flags, void *arg)
1059 {
1060     orte_timer_t *tm = (orte_timer_t*)arg;
1061 
1062     if (NULL != tm) {
1063         
1064         OBJ_RELEASE(tm);
1065     }
1066 
1067     
1068     if (orted_globals.abort) {
1069         opal_output(0, "%s is executing %s abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1070                     (orted_globals.test_suicide) ? "suicide" : "clean");
1071         
1072 
1073 
1074 
1075         if (orted_globals.test_suicide) {
1076             exit(1);
1077         }
1078         orte_odls.kill_local_procs(NULL);
1079         orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
1080         abort();
1081     }
1082     opal_output(0, "%s is executing clean abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
1083     
1084 
1085 
1086 
1087     orte_odls.kill_local_procs(NULL);
1088     orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
1089     exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
1090 }
1091 
1092 static void rollup(int status, orte_process_name_t* sender,
1093                    opal_buffer_t *buffer,
1094                    orte_rml_tag_t tag, void *cbdata)
1095 {
1096     int ret;
1097     orte_process_name_t child;
1098     int32_t i, flag, cnt;
1099     opal_value_t *kv;
1100 
1101     ncollected++;
1102 
1103     
1104 
1105     if (sender->jobid == ORTE_PROC_MY_NAME->jobid &&
1106         sender->vpid == ORTE_PROC_MY_NAME->vpid) {
1107         mybucket = OBJ_NEW(opal_buffer_t);
1108         opal_dss.copy_payload(mybucket, buffer);
1109     } else {
1110         
1111         opal_dss.copy_payload(bucket, buffer);
1112         
1113 
1114         cnt = 1;
1115         if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &child, &cnt, ORTE_NAME))) {
1116             ORTE_ERROR_LOG(ret);
1117             goto report;
1118         }
1119         cnt = 1;
1120         if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT32))) {
1121             ORTE_ERROR_LOG(ret);
1122             goto report;
1123         }
1124         for (i=0; i < flag; i++) {
1125             cnt = 1;
1126             if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &kv, &cnt, OPAL_VALUE))) {
1127                 ORTE_ERROR_LOG(ret);
1128                 goto report;
1129             }
1130             
1131             opal_pmix.store_local(&child, kv);
1132             OBJ_RELEASE(kv);
1133         }
1134     }
1135 
1136   report:
1137     report_orted();
1138 }
1139 
1140 static void report_orted() {
1141     int nreqd, ret;
1142 
1143     
1144     nreqd = orte_routed.num_routes() + 1;
1145     if (nreqd == ncollected && NULL != mybucket && !node_regex_waiting) {
1146         
1147         opal_dss.copy_payload(mybucket, bucket);
1148         OBJ_RELEASE(bucket);
1149         
1150         if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_PARENT, mybucket,
1151                                                ORTE_RML_TAG_ORTED_CALLBACK,
1152                                                orte_rml_send_callback, NULL))) {
1153             ORTE_ERROR_LOG(ret);
1154             OBJ_RELEASE(mybucket);
1155         }
1156     }
1157 }
1158 
1159 static void node_regex_report(int status, orte_process_name_t* sender,
1160                               opal_buffer_t *buffer,
1161                               orte_rml_tag_t tag, void *cbdata) {
1162     int rc;
1163     bool * active = (bool *)cbdata;
1164 
1165     
1166     if (ORTE_SUCCESS != (rc = orte_util_decode_nidmap(buffer))) {
1167         ORTE_ERROR_LOG(rc);
1168         return;
1169     }
1170 
1171     
1172 
1173     orte_routed.update_routing_plan();
1174 
1175     *active = false;
1176 
1177     
1178     orte_plm.remote_spawn();
1179 
1180     report_orted();
1181 }