This source file includes the following definitions.
- orte_daemon
- pipe_closed
- shutdown_callback
- rollup
- report_orted
- node_regex_report
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 #include "orte_config.h"
30 #include "orte/constants.h"
31
32 #include <string.h>
33
34 #include <stdio.h>
35 #include <ctype.h>
36 #ifdef HAVE_UNISTD_H
37 #include <unistd.h>
38 #endif
39 #ifdef HAVE_NETDB_H
40 #include <netdb.h>
41 #endif
42 #ifdef HAVE_SYS_PARAM_H
43 #include <sys/param.h>
44 #endif
45 #include <fcntl.h>
46 #include <errno.h>
47 #include <signal.h>
48 #ifdef HAVE_SYS_TIME_H
49 #include <sys/time.h>
50 #endif
51
52
53 #include "opal/mca/event/event.h"
54 #include "opal/mca/base/base.h"
55 #include "opal/util/output.h"
56 #include "opal/util/cmd_line.h"
57 #include "opal/util/if.h"
58 #include "opal/util/net.h"
59 #include "opal/util/opal_environ.h"
60 #include "opal/util/os_path.h"
61 #include "opal/util/printf.h"
62 #include "opal/util/argv.h"
63 #include "opal/util/fd.h"
64 #include "opal/runtime/opal.h"
65 #include "opal/mca/base/mca_base_var.h"
66 #include "opal/util/daemon_init.h"
67 #include "opal/dss/dss.h"
68 #include "opal/mca/hwloc/hwloc-internal.h"
69 #include "opal/mca/pmix/pmix.h"
70 #include "opal/mca/compress/compress.h"
71
72 #include "orte/util/show_help.h"
73 #include "orte/util/proc_info.h"
74 #include "orte/util/session_dir.h"
75 #include "orte/util/name_fns.h"
76 #include "orte/util/nidmap.h"
77 #include "orte/util/parse_options.h"
78 #include "orte/mca/rml/base/rml_contact.h"
79 #include "orte/util/pre_condition_transports.h"
80 #include "orte/util/threads.h"
81
82 #include "orte/mca/errmgr/errmgr.h"
83 #include "orte/mca/ess/ess.h"
84 #include "orte/mca/grpcomm/grpcomm.h"
85 #include "orte/mca/grpcomm/base/base.h"
86 #include "orte/mca/rml/rml.h"
87 #include "orte/mca/rml/rml_types.h"
88 #include "orte/mca/odls/odls.h"
89 #include "orte/mca/odls/base/odls_private.h"
90 #include "orte/mca/oob/base/base.h"
91 #include "orte/mca/plm/plm.h"
92 #include "orte/mca/ras/ras.h"
93 #include "orte/mca/routed/routed.h"
94 #include "orte/mca/rmaps/rmaps_types.h"
95 #include "orte/mca/state/state.h"
96
97
98
99
100 #include "orte/mca/plm/base/plm_private.h"
101
102 #include "orte/runtime/runtime.h"
103 #include "orte/runtime/orte_globals.h"
104 #include "orte/runtime/orte_locks.h"
105 #include "orte/runtime/orte_quit.h"
106 #include "orte/runtime/orte_wait.h"
107
108 #include "orte/orted/orted.h"
109 #include "orte/orted/pmix/pmix_server.h"
110
111
112
113
114 static opal_event_t *pipe_handler;
115 static void shutdown_callback(int fd, short flags, void *arg);
116 static void pipe_closed(int fd, short flags, void *arg);
117 static void rollup(int status, orte_process_name_t* sender,
118 opal_buffer_t *buffer,
119 orte_rml_tag_t tag, void *cbdata);
120 static void node_regex_report(int status, orte_process_name_t* sender,
121 opal_buffer_t *buffer,
122 orte_rml_tag_t tag, void *cbdata);
123 static void report_orted(void);
124
125 static opal_buffer_t *bucket, *mybucket = NULL;
126 static int ncollected = 0;
127 static bool node_regex_waiting = false;
128
129 static char *orte_parent_uri = NULL;
130
/*
 * Options private to the orted.  The structure is zero-filled at the top of
 * orte_daemon() and then populated by the command-line parser through the
 * destination pointers listed in orte_cmd_line_opts.
 */
static struct {
    /* boolean switches */
    bool help;          /* --help */
    bool debug;         /* --debug-daemons */
    bool daemonize;     /* --daemonize */
    bool set_sid;       /* --set-sid */
    bool hnp;           /* --hnp */
    bool tree_spawn;    /* --tree-spawn */
    bool test_suicide;  /* --test-suicide */
    bool abort;         /* set in orte_daemon() when orted_debug_failure is negative */

    /* string values; not referenced in the visible portion of this file */
    char *name;
    char *vpid_start;
    char *num_procs;

    /* file descriptors handed to us on the command line */
    int uri_pipe;             /* --report-uri */
    int singleton_died_pipe;  /* --singleton-died-pipe; reset to -1 in orte_daemon() */
} orted_globals;
146
147
148
149
150 opal_cmd_line_init_t orte_cmd_line_opts[] = {
151
152 { NULL, 'h', NULL, "help", 0,
153 &orted_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
154 "This help message" },
155
156 { "orte_daemon_spin", 's', NULL, "spin", 0,
157 &orted_spin_flag, OPAL_CMD_LINE_TYPE_BOOL,
158 "Have the orted spin until we can connect a debugger to it" },
159
160 { NULL, '\0', NULL, "test-suicide", 1,
161 &orted_globals.test_suicide, OPAL_CMD_LINE_TYPE_BOOL,
162 "Suicide instead of clean abort after delay" },
163
164 { "orte_debug", 'd', NULL, "debug", 0,
165 NULL, OPAL_CMD_LINE_TYPE_BOOL,
166 "Debug the OpenRTE" },
167
168 { "orte_daemonize", '\0', NULL, "daemonize", 0,
169 &orted_globals.daemonize, OPAL_CMD_LINE_TYPE_BOOL,
170 "Daemonize the orted into the background" },
171
172 { "orte_debug_daemons", '\0', NULL, "debug-daemons", 0,
173 &orted_globals.debug, OPAL_CMD_LINE_TYPE_BOOL,
174 "Enable debugging of OpenRTE daemons" },
175
176 { "orte_debug_daemons_file", '\0', NULL, "debug-daemons-file", 0,
177 NULL, OPAL_CMD_LINE_TYPE_BOOL,
178 "Enable debugging of OpenRTE daemons, storing output in files" },
179
180 { NULL, '\0', NULL, "hnp", 0,
181 &orted_globals.hnp, OPAL_CMD_LINE_TYPE_BOOL,
182 "Direct the orted to act as the HNP"},
183
184 { "orte_hnp_uri", '\0', NULL, "hnp-uri", 1,
185 NULL, OPAL_CMD_LINE_TYPE_STRING,
186 "URI for the HNP"},
187
188 { "orte_parent_uri", '\0', NULL, "parent-uri", 1,
189 NULL, OPAL_CMD_LINE_TYPE_STRING,
190 "URI for the parent if tree launch is enabled."},
191
192 { NULL, '\0', NULL, "set-sid", 0,
193 &orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
194 "Direct the orted to separate from the current session"},
195
196 { NULL, '\0', "tree-spawn", "tree-spawn", 0,
197 &orted_globals.tree_spawn, OPAL_CMD_LINE_TYPE_BOOL,
198 "Tree-based spawn in progress" },
199
200 { "tmpdir_base", '\0', NULL, "tmpdir", 1,
201 NULL, OPAL_CMD_LINE_TYPE_STRING,
202 "Set the root for the session directory tree" },
203
204 { NULL, '\0', NULL, "report-uri", 1,
205 &orted_globals.uri_pipe, OPAL_CMD_LINE_TYPE_INT,
206 "Report this process' uri on indicated pipe"},
207
208 { NULL, '\0', NULL, "singleton-died-pipe", 1,
209 &orted_globals.singleton_died_pipe, OPAL_CMD_LINE_TYPE_INT,
210 "Watch on indicated pipe for singleton termination"},
211
212 { "orte_output_filename", '\0', "output-filename", "output-filename", 1,
213 NULL, OPAL_CMD_LINE_TYPE_STRING,
214 "Redirect output from application processes into filename.rank" },
215
216 { "orte_xterm", '\0', "xterm", "xterm", 1,
217 NULL, OPAL_CMD_LINE_TYPE_STRING,
218 "Create a new xterm window and display output from the specified ranks there" },
219
220 { "orte_report_bindings", '\0', "report-bindings", "report-bindings", 0,
221 NULL, OPAL_CMD_LINE_TYPE_BOOL,
222 "Whether to report process bindings to stderr" },
223
224
225 { NULL, '\0', NULL, NULL, 0,
226 NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
227 };
228
229 int orte_daemon(int argc, char *argv[])
230 {
231 int ret = 0;
232 opal_cmd_line_t *cmd_line = NULL;
233 int i;
234 opal_buffer_t *buffer;
235 char hostname[OPAL_MAXHOSTNAMELEN];
236 #if OPAL_ENABLE_FT_CR == 1
237 char *tmp_env_var = NULL;
238 #endif
239 opal_value_t val;
240
241
242 memset(&orted_globals, 0, sizeof(orted_globals));
243
244 orted_globals.singleton_died_pipe = -1;
245 bucket = OBJ_NEW(opal_buffer_t);
246
247
248 cmd_line = OBJ_NEW(opal_cmd_line_t);
249 if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
250 OBJ_RELEASE(cmd_line);
251 exit(1);
252 }
253 mca_base_cmd_line_setup(cmd_line);
254 if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, false,
255 argc, argv))) {
256 char *args = NULL;
257 args = opal_cmd_line_get_usage_msg(cmd_line);
258 fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
259 free(args);
260 OBJ_RELEASE(cmd_line);
261 return ret;
262 }
263
264
265
266
267
268 mca_base_cmd_line_process_args(cmd_line, &environ, &environ);
269
270
271
272
273
274
275
276
277
278
279
280
281
282 if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
283 fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
284 exit(1);
285 }
286
287
288
289
290
291
292
293 orte_launch_environ = opal_argv_copy(environ);
294
295
296 opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ);
297 opal_unsetenv(OPAL_MCA_PREFIX"pmix", &orte_launch_environ);
298
299
300
301
302 if (orted_globals.debug) {
303 gethostname(hostname, sizeof(hostname));
304 fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
305 }
306
307
308 if (orted_globals.help) {
309 char *args = NULL;
310 args = opal_cmd_line_get_usage_msg(cmd_line);
311 orte_show_help("help-orted.txt", "orted:usage", false,
312 argv[0], args);
313 free(args);
314 return 1;
315 }
316 #if defined(HAVE_SETSID)
317
318 if (orted_globals.set_sid) {
319 setsid();
320 }
321 #endif
322
323 i=0;
324 while (orted_spin_flag) {
325 i++;
326 if (1000 < i) i=0;
327 }
328
329 #if OPAL_ENABLE_FT_CR == 1
330
331 (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var);
332 opal_setenv(tmp_env_var,
333 "1",
334 true, &environ);
335 free(tmp_env_var);
336 #endif
337
338
339
340
341 if(!orte_debug_flag &&
342 !orte_debug_daemons_flag &&
343 orted_globals.daemonize) {
344 opal_daemon_init(NULL);
345 }
346
347
348
349
350
351
352 if (orted_globals.hnp) {
353 if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
354 ORTE_ERROR_LOG(ret);
355 return ret;
356 }
357 } else {
358 if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
359 ORTE_ERROR_LOG(ret);
360 return ret;
361 }
362 }
363
364
365
366
367 opal_finalize_util();
368
369
370 if (NULL != orte_daemon_cores) {
371 char **cores=NULL, tmp[128];
372 hwloc_obj_t pu;
373 hwloc_cpuset_t ours, res;
374 int core;
375
376
377
378
379 orte_util_parse_range_options(orte_daemon_cores, &cores);
380 if (NULL != cores) {
381 ours = hwloc_bitmap_alloc();
382 hwloc_bitmap_zero(ours);
383 res = hwloc_bitmap_alloc();
384 for (i=0; NULL != cores[i]; i++) {
385 core = strtoul(cores[i], NULL, 10);
386 if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
387
388
389
390 orte_show_help_finalize();
391
392 orte_show_help("help-orted.txt", "orted:cannot-bind",
393 true, orte_process_info.nodename,
394 orte_daemon_cores);
395 ret = ORTE_ERR_NOT_SUPPORTED;
396 hwloc_bitmap_free(ours);
397 hwloc_bitmap_free(res);
398 goto DONE;
399 }
400 hwloc_bitmap_or(res, ours, pu->cpuset);
401 hwloc_bitmap_copy(ours, res);
402 }
403
404 if (!hwloc_bitmap_iszero(ours)) {
405 (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
406 if (opal_hwloc_report_bindings) {
407 opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
408 opal_output(0, "Daemon %s is bound to cores %s",
409 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
410 }
411 }
412
413 hwloc_bitmap_free(ours);
414 hwloc_bitmap_free(res);
415 opal_argv_free(cores);
416 }
417 }
418
419 if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
420 orted_globals.abort=false;
421
422
423
424
425 if (0 > orted_debug_failure) {
426 orted_debug_failure = -1*orted_debug_failure;
427 orted_globals.abort = true;
428 }
429
430 if ((int)ORTE_PROC_MY_NAME->vpid == orted_debug_failure) {
431
432
433
434 if (0 < orted_debug_failure_delay) {
435 ORTE_TIMER_EVENT(orted_debug_failure_delay, 0, shutdown_callback, ORTE_SYS_PRI);
436
437 } else {
438 opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
439 orted_globals.abort ? "abort" : "abnormal termination");
440
441
442
443
444
445 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
446
447
448 if (orted_globals.abort) {
449 abort();
450 }
451
452
453 ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
454 goto DONE;
455 }
456 }
457 }
458
459
460
461
462 orte_oob_base_get_addr(&orte_process_info.my_daemon_uri);
463 if (NULL == orte_process_info.my_daemon_uri) {
464
465 ret = ORTE_ERROR;
466 goto DONE;
467 }
468 ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
469 ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;
470 OBJ_CONSTRUCT(&val, opal_value_t);
471 val.key = OPAL_PMIX_PROC_URI;
472 val.type = OPAL_STRING;
473 val.data.string = orte_process_info.my_daemon_uri;
474 if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &val))) {
475 ORTE_ERROR_LOG(ret);
476 val.key = NULL;
477 val.data.string = NULL;
478 OBJ_DESTRUCT(&val);
479 goto DONE;
480 }
481 val.key = NULL;
482 val.data.string = NULL;
483 OBJ_DESTRUCT(&val);
484
485
486 if (ORTE_PROC_IS_HNP) {
487 orte_process_info.my_hnp_uri = strdup(orte_process_info.my_daemon_uri);
488 ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
489 ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
490 }
491
492
493 orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
494 ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);
495
496
497
498
499 if (orte_debug_daemons_flag) {
500 fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
501 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
502 orte_process_info.nodename);
503 }
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523 opal_progress_set_yield_when_idle(false);
524
525
526
527
528
529
530
531
532
533 opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);
534
535
536 if (orted_globals.uri_pipe > 0) {
537 orte_job_t *jdata;
538 orte_proc_t *proc;
539 orte_node_t *node;
540 orte_app_context_t *app;
541 char *tmp, *nptr, *sysinfo;
542 char **singenv=NULL, *string_key, *env_str;
543
544
545 jdata = OBJ_NEW(orte_job_t);
546
547 opal_argv_append_nosize(&jdata->personality, "ompi");
548 orte_plm_base_create_jobid(jdata);
549 opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
550
551
552
553
554
555
556 jdata->map = OBJ_NEW(orte_job_map_t);
557
558
559 app = OBJ_NEW(orte_app_context_t);
560 app->app = strdup("singleton");
561 app->num_procs = 1;
562 opal_pointer_array_add(jdata->apps, app);
563 jdata->num_apps = 1;
564
565
566
567
568
569
570 proc = OBJ_NEW(orte_proc_t);
571 proc->name.jobid = jdata->jobid;
572 proc->name.vpid = 0;
573 proc->parent = 0;
574 ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
575 proc->state = ORTE_PROC_STATE_RUNNING;
576 proc->app_idx = 0;
577
578 node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
579 proc->node = node;
580 OBJ_RETAIN(node);
581 opal_pointer_array_add(jdata->procs, proc);
582 jdata->num_procs = 1;
583
584 OBJ_RETAIN(node);
585 opal_pointer_array_add(jdata->map->nodes, node);
586 jdata->map->num_nodes++;
587
588 OBJ_RETAIN(proc);
589 opal_pointer_array_add(node->procs, proc);
590 node->num_procs++;
591
592 OBJ_RETAIN(proc);
593 opal_pointer_array_add(orte_local_children, proc);
594 jdata->num_local_procs = 1;
595
596 proc->local_rank = 0;
597 proc->node_rank = 0;
598 proc->app_rank = 0;
599 proc->state = ORTE_PROC_STATE_RUNNING;
600 proc->app_idx = 0;
601 ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);
602
603
604 orte_pre_condition_transports(jdata, NULL);
605
606
607 if (ORTE_SUCCESS != (ret = orte_pmix_server_register_nspace(jdata, false))) {
608 ORTE_ERROR_LOG(ret);
609 goto DONE;
610 }
611
612 if (OPAL_SUCCESS != (ret = opal_pmix.server_setup_fork(&proc->name, &singenv))) {
613 ORTE_ERROR_LOG(ret);
614 goto DONE;
615 }
616
617
618 if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&string_key, OPAL_STRING) || NULL == string_key) {
619 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
620 goto DONE;
621 }
622 opal_asprintf(&env_str, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
623 opal_argv_append_nosize(&singenv, env_str);
624 free(env_str);
625
626 nptr = opal_argv_join(singenv, '*');
627 opal_argv_free(singenv);
628
629
630 orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
631 opal_asprintf(&tmp, "%s[%s]%s", orte_process_info.my_daemon_uri, sysinfo, nptr);
632 free(sysinfo);
633 free(nptr);
634
635
636 if (OPAL_SUCCESS != (ret = opal_fd_write(orted_globals.uri_pipe, strlen(tmp)+1, tmp))) { ;
637 ORTE_ERROR_LOG(ret);
638 goto DONE;
639 }
640
641
642 free(tmp);
643 close(orted_globals.uri_pipe);
644
645
646
647
648
649
650 for (i=0; NULL != environ[i]; i++) {
651 if (0 == strncmp(environ[i], OPAL_MCA_PREFIX, 9)) {
652
653 tmp = strdup(environ[i]);
654
655 nptr = strchr(tmp, '=');
656 *nptr = '\0';
657 nptr++;
658
659 opal_argv_append_nosize(&orted_cmd_line, "-"OPAL_MCA_CMD_LINE_ID);
660 opal_argv_append_nosize(&orted_cmd_line, &tmp[9]);
661 opal_argv_append_nosize(&orted_cmd_line, nptr);
662 free(tmp);
663 }
664 }
665 }
666
667
668 if (orted_globals.singleton_died_pipe > 0) {
669
670 pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
671 opal_event_set(orte_event_base, pipe_handler,
672 orted_globals.singleton_died_pipe,
673 OPAL_EV_READ,
674 pipe_closed,
675 pipe_handler);
676 opal_event_add(pipe_handler, NULL);
677 }
678
679
680
681
682 orte_parent_uri = NULL;
683 (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
684 "URI for the parent if tree launch is enabled.",
685 MCA_BASE_VAR_TYPE_STRING, NULL, 0,
686 MCA_BASE_VAR_FLAG_INTERNAL,
687 OPAL_INFO_LVL_9,
688 MCA_BASE_VAR_SCOPE_CONSTANT,
689 &orte_parent_uri);
690 if (NULL != orte_parent_uri) {
691
692 ret = orte_rml_base_parse_uris(orte_parent_uri, ORTE_PROC_MY_PARENT, NULL);
693 if (ORTE_SUCCESS != ret) {
694 ORTE_ERROR_LOG(ret);
695 goto DONE;
696 }
697 OBJ_CONSTRUCT(&val, opal_value_t);
698 val.key = OPAL_PMIX_PROC_URI;
699 val.type = OPAL_STRING;
700 val.data.string = orte_parent_uri;
701 if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_PARENT, &val))) {
702 ORTE_ERROR_LOG(ret);
703 val.key = NULL;
704 val.data.string = NULL;
705 OBJ_DESTRUCT(&val);
706 goto DONE;
707 }
708 val.key = NULL;
709 val.data.string = NULL;
710 OBJ_DESTRUCT(&val);
711
712
713
714
715 if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_HNP, ORTE_PROC_MY_PARENT))) {
716 ORTE_ERROR_LOG(ret);
717 goto DONE;
718 }
719
720 if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_PARENT, ORTE_PROC_MY_PARENT))) {
721 ORTE_ERROR_LOG(ret);
722 goto DONE;
723 }
724
725
726
727 if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(ORTE_PROC_MY_PARENT))) {
728 ORTE_ERROR_LOG(ret);
729 goto DONE;
730 }
731 }
732
733
734
735
736
737 if (!ORTE_PROC_IS_HNP) {
738 orte_process_name_t target;
739
740
741 orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK,
742 ORTE_RML_PERSISTENT, rollup, NULL);
743
744
745 target.jobid = ORTE_PROC_MY_NAME->jobid;
746 if (NULL != orte_parent_uri) {
747
748 target.vpid = ORTE_PROC_MY_NAME->vpid;
749
750
751
752
753 buffer = OBJ_NEW(opal_buffer_t);
754 node_regex_waiting = true;
755 orte_rml.recv_buffer_nb(ORTE_PROC_MY_PARENT, ORTE_RML_TAG_NODE_REGEX_REPORT,
756 ORTE_RML_PERSISTENT, node_regex_report, &node_regex_waiting);
757 if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_PARENT, buffer,
758 ORTE_RML_TAG_WARMUP_CONNECTION,
759 orte_rml_send_callback, NULL))) {
760 ORTE_ERROR_LOG(ret);
761 OBJ_RELEASE(buffer);
762 goto DONE;
763 }
764 } else {
765 target.vpid = 0;
766 }
767
768
769
770
771
772
773
774
775 buffer = OBJ_NEW(opal_buffer_t);
776
777 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
778 ORTE_ERROR_LOG(ret);
779 OBJ_RELEASE(buffer);
780 goto DONE;
781 }
782
783
784 {
785 opal_value_t *vptr = NULL, *kv;
786 opal_list_t *modex;
787 int32_t flag;
788
789 if (opal_pmix.legacy_get()) {
790 if (OPAL_SUCCESS != (ret = opal_pmix.get(ORTE_PROC_MY_NAME, OPAL_PMIX_PROC_URI, NULL, &vptr)) || NULL == vptr) {
791
792 flag = 0;
793 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
794 ORTE_ERROR_LOG(ret);
795 OBJ_RELEASE(buffer);
796 goto DONE;
797 }
798 } else {
799 flag = 1;
800 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
801 ORTE_ERROR_LOG(ret);
802 OBJ_RELEASE(buffer);
803 goto DONE;
804 }
805 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &vptr, 1, OPAL_VALUE))) {
806 ORTE_ERROR_LOG(ret);
807 OBJ_RELEASE(buffer);
808 goto DONE;
809 }
810 OBJ_RELEASE(vptr);
811 }
812 } else {
813 if (OPAL_SUCCESS != (ret = opal_pmix.get(ORTE_PROC_MY_NAME, NULL, NULL, &vptr)) || NULL == vptr) {
814
815 flag = 0;
816 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
817 ORTE_ERROR_LOG(ret);
818 OBJ_RELEASE(buffer);
819 goto DONE;
820 }
821 } else {
822
823 if (OPAL_PTR == vptr->type) {
824 modex = (opal_list_t*)vptr->data.ptr;
825 flag = (int32_t)opal_list_get_size(modex);
826 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
827 ORTE_ERROR_LOG(ret);
828 OBJ_RELEASE(buffer);
829 goto DONE;
830 }
831 OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
832 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &kv, 1, OPAL_VALUE))) {
833 ORTE_ERROR_LOG(ret);
834 OBJ_RELEASE(buffer);
835 goto DONE;
836 }
837 }
838 OPAL_LIST_RELEASE(modex);
839 } else {
840
841 flag = 1;
842 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
843 ORTE_ERROR_LOG(ret);
844 OBJ_RELEASE(buffer);
845 goto DONE;
846 }
847 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &vptr, 1, OPAL_VALUE))) {
848 ORTE_ERROR_LOG(ret);
849 OBJ_RELEASE(buffer);
850 goto DONE;
851 }
852 OBJ_RELEASE(vptr);
853 }
854 }
855 }
856 }
857
858
859 opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING);
860
861
862 if (orte_retain_aliases) {
863 char **aliases=NULL;
864 uint8_t naliases, ni;
865 char hostname[OPAL_MAXHOSTNAMELEN];
866
867
868
869
870 gethostname(hostname, sizeof(hostname));
871 if (strlen(orte_process_info.nodename) < strlen(hostname)) {
872 opal_argv_append_nosize(&aliases, hostname);
873 }
874 opal_ifgetaliases(&aliases);
875 naliases = opal_argv_count(aliases);
876 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &naliases, 1, OPAL_UINT8))) {
877 ORTE_ERROR_LOG(ret);
878 OBJ_RELEASE(buffer);
879 opal_argv_free(aliases);
880 goto DONE;
881 }
882 for (ni=0; ni < naliases; ni++) {
883 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &aliases[ni], 1, OPAL_STRING))) {
884 ORTE_ERROR_LOG(ret);
885 OBJ_RELEASE(buffer);
886 opal_argv_free(aliases);
887 goto DONE;
888 }
889 }
890 opal_argv_free(aliases);
891 }
892
893
894
895 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &orte_topo_signature, 1, OPAL_STRING))) {
896 ORTE_ERROR_LOG(ret);
897 }
898
899
900
901 if (1 == ORTE_PROC_MY_NAME->vpid) {
902 opal_buffer_t data;
903 int8_t flag;
904 uint8_t *cmpdata;
905 size_t cmplen;
906
907
908 OBJ_CONSTRUCT(&data, opal_buffer_t);
909
910 if (ORTE_SUCCESS != (ret = opal_dss.pack(&data, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
911 ORTE_ERROR_LOG(ret);
912 }
913 if (opal_compress.compress_block((uint8_t*)data.base_ptr, data.bytes_used,
914 &cmpdata, &cmplen)) {
915
916 flag = 1;
917 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT8))) {
918 ORTE_ERROR_LOG(ret);
919 free(cmpdata);
920 OBJ_DESTRUCT(&data);
921 }
922
923 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &cmplen, 1, OPAL_SIZE))) {
924 ORTE_ERROR_LOG(ret);
925 free(cmpdata);
926 OBJ_DESTRUCT(&data);
927 }
928
929 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &data.bytes_used, 1, OPAL_SIZE))) {
930 ORTE_ERROR_LOG(ret);
931 free(cmpdata);
932 OBJ_DESTRUCT(&data);
933 }
934
935 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, cmpdata, cmplen, OPAL_UINT8))) {
936 ORTE_ERROR_LOG(ret);
937 free(cmpdata);
938 OBJ_DESTRUCT(&data);
939 }
940 OBJ_DESTRUCT(&data);
941 free(cmpdata);
942 } else {
943
944 flag = 0;
945 if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT8))) {
946 ORTE_ERROR_LOG(ret);
947 OBJ_DESTRUCT(&data);
948 free(cmpdata);
949 }
950
951 opal_dss.copy_payload(buffer, &data);
952 OBJ_DESTRUCT(&data);
953 }
954 }
955
956
957 if (0 > (ret = orte_rml.send_buffer_nb(&target, buffer,
958 ORTE_RML_TAG_ORTED_CALLBACK,
959 orte_rml_send_callback, NULL))) {
960 ORTE_ERROR_LOG(ret);
961 OBJ_RELEASE(buffer);
962 goto DONE;
963 }
964 }
965
966
967
968
969
970 if (orted_globals.tree_spawn) {
971 int j, k;
972 bool ignore;
973 char *no_keep[] = {
974 "orte_hnp_uri",
975 "orte_ess_jobid",
976 "orte_ess_vpid",
977 "orte_ess_num_procs",
978 "orte_parent_uri",
979 "mca_base_env_list",
980 NULL
981 };
982 for (i=0; i < argc; i++) {
983 if (0 == strcmp("-"OPAL_MCA_CMD_LINE_ID, argv[i]) ||
984 0 == strcmp("--"OPAL_MCA_CMD_LINE_ID, argv[i]) ) {
985 ignore = false;
986
987 for (k=0; NULL != no_keep[k]; k++) {
988 if (0 == strcmp(no_keep[k], argv[i+1])) {
989 ignore = true;
990 break;
991 }
992 }
993 if (!ignore) {
994
995
996
997 if (NULL != orted_cmd_line) {
998 for (j=0; NULL != orted_cmd_line[j]; j++) {
999 if (0 == strcmp(argv[i+1], orted_cmd_line[j])) {
1000
1001 ignore = true;
1002 break;
1003 }
1004 }
1005 }
1006 if (!ignore) {
1007 opal_argv_append_nosize(&orted_cmd_line, argv[i]);
1008 opal_argv_append_nosize(&orted_cmd_line, argv[i+1]);
1009 opal_argv_append_nosize(&orted_cmd_line, argv[i+2]);
1010 }
1011 }
1012 i += 2;
1013 }
1014 }
1015 }
1016
1017 if (orte_debug_daemons_flag) {
1018 opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
1019 }
1020 ret = ORTE_SUCCESS;
1021
1022
1023 while (orte_event_base_active) {
1024 opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
1025 }
1026 ORTE_ACQUIRE_OBJECT(orte_event_base_active);
1027
1028
1029 orte_odls.kill_local_procs(NULL);
1030
1031 DONE:
1032
1033 ORTE_UPDATE_EXIT_STATUS(ret);
1034
1035
1036 orte_finalize();
1037 opal_finalize_util();
1038
1039 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
1040
1041 orte_proc_info_finalize();
1042
1043 if (orte_debug_flag) {
1044 fprintf(stderr, "exiting with status %d\n", orte_exit_status);
1045 }
1046 exit(orte_exit_status);
1047 }
1048
1049 static void pipe_closed(int fd, short flags, void *arg)
1050 {
1051 opal_event_t *ev = (opal_event_t*)arg;
1052
1053
1054 opal_event_free(ev);
1055 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
1056 }
1057
1058 static void shutdown_callback(int fd, short flags, void *arg)
1059 {
1060 orte_timer_t *tm = (orte_timer_t*)arg;
1061
1062 if (NULL != tm) {
1063
1064 OBJ_RELEASE(tm);
1065 }
1066
1067
1068 if (orted_globals.abort) {
1069 opal_output(0, "%s is executing %s abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1070 (orted_globals.test_suicide) ? "suicide" : "clean");
1071
1072
1073
1074
1075 if (orted_globals.test_suicide) {
1076 exit(1);
1077 }
1078 orte_odls.kill_local_procs(NULL);
1079 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
1080 abort();
1081 }
1082 opal_output(0, "%s is executing clean abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
1083
1084
1085
1086
1087 orte_odls.kill_local_procs(NULL);
1088 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
1089 exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
1090 }
1091
1092 static void rollup(int status, orte_process_name_t* sender,
1093 opal_buffer_t *buffer,
1094 orte_rml_tag_t tag, void *cbdata)
1095 {
1096 int ret;
1097 orte_process_name_t child;
1098 int32_t i, flag, cnt;
1099 opal_value_t *kv;
1100
1101 ncollected++;
1102
1103
1104
1105 if (sender->jobid == ORTE_PROC_MY_NAME->jobid &&
1106 sender->vpid == ORTE_PROC_MY_NAME->vpid) {
1107 mybucket = OBJ_NEW(opal_buffer_t);
1108 opal_dss.copy_payload(mybucket, buffer);
1109 } else {
1110
1111 opal_dss.copy_payload(bucket, buffer);
1112
1113
1114 cnt = 1;
1115 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &child, &cnt, ORTE_NAME))) {
1116 ORTE_ERROR_LOG(ret);
1117 goto report;
1118 }
1119 cnt = 1;
1120 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT32))) {
1121 ORTE_ERROR_LOG(ret);
1122 goto report;
1123 }
1124 for (i=0; i < flag; i++) {
1125 cnt = 1;
1126 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &kv, &cnt, OPAL_VALUE))) {
1127 ORTE_ERROR_LOG(ret);
1128 goto report;
1129 }
1130
1131 opal_pmix.store_local(&child, kv);
1132 OBJ_RELEASE(kv);
1133 }
1134 }
1135
1136 report:
1137 report_orted();
1138 }
1139
1140 static void report_orted() {
1141 int nreqd, ret;
1142
1143
1144 nreqd = orte_routed.num_routes() + 1;
1145 if (nreqd == ncollected && NULL != mybucket && !node_regex_waiting) {
1146
1147 opal_dss.copy_payload(mybucket, bucket);
1148 OBJ_RELEASE(bucket);
1149
1150 if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_PARENT, mybucket,
1151 ORTE_RML_TAG_ORTED_CALLBACK,
1152 orte_rml_send_callback, NULL))) {
1153 ORTE_ERROR_LOG(ret);
1154 OBJ_RELEASE(mybucket);
1155 }
1156 }
1157 }
1158
1159 static void node_regex_report(int status, orte_process_name_t* sender,
1160 opal_buffer_t *buffer,
1161 orte_rml_tag_t tag, void *cbdata) {
1162 int rc;
1163 bool * active = (bool *)cbdata;
1164
1165
1166 if (ORTE_SUCCESS != (rc = orte_util_decode_nidmap(buffer))) {
1167 ORTE_ERROR_LOG(rc);
1168 return;
1169 }
1170
1171
1172
1173 orte_routed.update_routing_plan();
1174
1175 *active = false;
1176
1177
1178 orte_plm.remote_spawn();
1179
1180 report_orted();
1181 }