This source file includes the following definitions.
- setup_sighandler
- rte_init
- rte_finalize
- rte_abort
- clean_abort
- abort_signal_callback
- epipe_signal_callback
- signal_forward_callback
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28 #include "orte_config.h"
29 #include "orte/constants.h"
30
31 #include <sys/types.h>
32 #include <stdio.h>
33 #ifdef HAVE_FCNTL_H
34 #include <fcntl.h>
35 #endif
36 #ifdef HAVE_UNISTD_H
37 #include <unistd.h>
38 #endif
39
40 #include "opal/hash_string.h"
41 #include "opal/class/opal_hash_table.h"
42 #include "opal/class/opal_list.h"
43 #include "opal/mca/event/event.h"
44 #include "opal/runtime/opal.h"
45
46 #include "opal/util/arch.h"
47 #include "opal/util/argv.h"
48 #include "opal/util/if.h"
49 #include "opal/util/os_path.h"
50 #include "opal/util/output.h"
51 #include "opal/util/opal_environ.h"
52 #include "opal/util/malloc.h"
53 #include "opal/util/basename.h"
54 #include "opal/util/fd.h"
55 #include "opal/mca/pmix/base/base.h"
56 #include "opal/mca/pstat/base/base.h"
57 #include "opal/mca/hwloc/base/base.h"
58
59 #include "orte/mca/oob/base/base.h"
60 #include "orte/mca/rml/base/base.h"
61 #include "orte/mca/rml/rml_types.h"
62 #include "orte/mca/routed/base/base.h"
63 #include "orte/mca/routed/routed.h"
64 #include "orte/mca/rtc/base/base.h"
65 #include "orte/mca/errmgr/base/base.h"
66 #include "orte/mca/grpcomm/base/base.h"
67 #include "orte/mca/iof/base/base.h"
68 #include "orte/mca/ras/base/base.h"
69 #include "orte/mca/plm/base/base.h"
70 #include "orte/mca/plm/plm.h"
71 #include "orte/mca/odls/base/base.h"
72 #include "orte/mca/rmaps/base/base.h"
73 #include "orte/mca/filem/base/base.h"
74 #include "orte/mca/state/base/base.h"
75 #include "orte/mca/state/state.h"
76
77 #include "orte/orted/orted_submit.h"
78 #include "orte/orted/pmix/pmix_server.h"
79
80 #include "orte/util/show_help.h"
81 #include "orte/util/proc_info.h"
82 #include "orte/util/session_dir.h"
83 #include "orte/util/hnp_contact.h"
84 #include "orte/util/name_fns.h"
85 #include "orte/util/show_help.h"
86 #include "orte/util/comm/comm.h"
87
88 #include "orte/runtime/runtime.h"
89 #include "orte/runtime/orte_wait.h"
90 #include "orte/runtime/orte_globals.h"
91 #include "orte/runtime/orte_quit.h"
92 #include "orte/runtime/orte_locks.h"
93
94 #include "orte/mca/ess/ess.h"
95 #include "orte/mca/ess/base/base.h"
96 #include "orte/mca/ess/hnp/ess_hnp.h"
97
98 static int rte_init(void);
99 static int rte_finalize(void);
100 static void rte_abort(int status, bool report) __opal_attribute_noreturn__;
101
102 orte_ess_base_module_t orte_ess_hnp_module = {
103 rte_init,
104 rte_finalize,
105 rte_abort,
106 NULL
107 };
108
109
110 static bool signals_set=false;
111 static bool forcibly_die=false;
112 static opal_event_t term_handler;
113 static opal_event_t epipe_handler;
114 static int term_pipe[2];
115 static opal_event_t *forward_signals_events = NULL;
116
117 static void abort_signal_callback(int signal);
118 static void clean_abort(int fd, short flags, void *arg);
119 static void epipe_signal_callback(int fd, short flags, void *arg);
120 static void signal_forward_callback(int fd, short event, void *arg);
121
122 static void setup_sighandler(int signal, opal_event_t *ev,
123 opal_event_cbfunc_t cbfunc)
124 {
125 opal_event_signal_set(orte_event_base, ev, signal, cbfunc, ev);
126 opal_event_set_priority(ev, ORTE_ERROR_PRI);
127 opal_event_signal_add(ev, NULL);
128 }
129
130 static int rte_init(void)
131 {
132 int ret;
133 char *error = NULL;
134 char *contact_path;
135 orte_job_t *jdata;
136 orte_node_t *node;
137 orte_proc_t *proc;
138 orte_app_context_t *app;
139 char **aliases, *aptr;
140 char *coprocessors, **sns;
141 uint32_t h;
142 int idx;
143 orte_topology_t *t;
144 orte_ess_base_signal_t *sig;
145 opal_value_t val;
146
147
148 if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
149 error = "orte_ess_base_std_prolog";
150 goto error;
151 }
152
153
154 setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169 pipe(term_pipe);
170
171 opal_event_set(orte_event_base, &term_handler, term_pipe[0], OPAL_EV_READ, clean_abort, NULL);
172 opal_event_set_priority(&term_handler, ORTE_ERROR_PRI);
173 opal_event_add(&term_handler, NULL);
174
175
176
177 if (opal_fd_set_cloexec(term_pipe[0]) != OPAL_SUCCESS ||
178 opal_fd_set_cloexec(term_pipe[1]) != OPAL_SUCCESS) {
179 error = "unable to set the pipe to CLOEXEC";
180 goto error;
181 }
182
183
184 signal(SIGTERM, abort_signal_callback);
185 signal(SIGINT, abort_signal_callback);
186 signal(SIGHUP, abort_signal_callback);
187
188
189 if (0 < (idx = opal_list_get_size(&orte_ess_base_signals))) {
190 forward_signals_events = (opal_event_t*)malloc(sizeof(opal_event_t) * idx);
191 if (NULL == forward_signals_events) {
192 ret = ORTE_ERR_OUT_OF_RESOURCE;
193 error = "unable to malloc";
194 goto error;
195 }
196 idx = 0;
197 OPAL_LIST_FOREACH(sig, &orte_ess_base_signals, orte_ess_base_signal_t) {
198 setup_sighandler(sig->signal, forward_signals_events + idx, signal_forward_callback);
199 ++idx;
200 }
201 }
202 signals_set = true;
203
204
205 if (NULL == opal_hwloc_topology) {
206 if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
207 error = "topology discovery";
208 goto error;
209 }
210 }
211
212
213 if (orte_xml_output) {
214 fprintf(orte_xml_fp, "<mpirun>\n");
215 fflush(orte_xml_fp);
216 }
217
218
219
220
221 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
222 error = "opal_pstat_base_open";
223 goto error;
224 }
225 if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
226 error = "opal_pstat_base_select";
227 goto error;
228 }
229
230
231 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
232 error = "orte_state_base_open";
233 goto error;
234 }
235 if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
236 error = "orte_state_base_select";
237 goto error;
238 }
239
240
241 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
242 error = "orte_errmgr_base_open";
243 goto error;
244 }
245
246
247
248
249
250
251 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
252 error = "orte_plm_base_open";
253 goto error;
254 }
255 if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
256 error = "orte_plm_base_select";
257 if (ORTE_ERR_FATAL == ret) {
258
259 ret = ORTE_ERR_SILENT;
260 }
261 goto error;
262 }
263
264 if (NULL != orte_ess_base_jobid) {
265 if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&ORTE_PROC_MY_NAME->jobid, orte_ess_base_jobid))) {
266 error = "convert_string_to_jobid";
267 goto error;
268 }
269 ORTE_PROC_MY_NAME->vpid = 0;
270 } else {
271 if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) {
272 error = "orte_plm_set_hnp_name";
273 goto error;
274 }
275 }
276
277 orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME;
278 orte_process_info.super.proc_hostname = strdup(orte_process_info.nodename);
279 orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL;
280 orte_process_info.super.proc_arch = opal_local_arch;
281 opal_proc_local_set(&orte_process_info.super);
282
283
284 if (orte_create_session_dirs) {
285 OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
286 "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
287 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
288 (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
289 orte_process_info.nodename));
290
291
292
293 if (ORTE_SUCCESS != (ret = orte_session_dir(false, ORTE_PROC_MY_NAME))) {
294 error = "orte_session_dir define";
295 goto error;
296 }
297
298
299
300 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
301
302
303 if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
304 error = "orte_session_dir";
305 goto error;
306 }
307 }
308
309
310
311 opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray,isolated", false, &environ);
312 if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
313 ORTE_ERROR_LOG(ret);
314 error = "orte_pmix_base_open";
315 goto error;
316 }
317 if (ORTE_SUCCESS != (ret = opal_pmix_base_select())) {
318 ORTE_ERROR_LOG(ret);
319 error = "opal_pmix_base_select";
320 goto error;
321 }
322
323 opal_pmix_base_set_evbase(orte_event_base);
324
325
326
327 if (ORTE_SUCCESS != (ret = pmix_server_init())) {
328
329 ret = ORTE_ERR_SILENT;
330 error = "pmix_server_init";
331 goto error;
332 }
333
334
335
336
337
338 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
339 ORTE_ERROR_LOG(ret);
340 error = "orte_rml_base_open";
341 goto error;
342 }
343 if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
344 ORTE_ERROR_LOG(ret);
345 error = "orte_routed_base_select";
346 goto error;
347 }
348
349
350
351 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
352 error = "orte_oob_base_open";
353 goto error;
354 }
355 if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
356 error = "orte_oob_base_select";
357 goto error;
358 }
359
360
361
362
363 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
364 error = "orte_rml_base_open";
365 goto error;
366 }
367 if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
368 error = "orte_rml_base_select";
369 goto error;
370 }
371
372
373 pmix_server_start();
374
375
376
377
378 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
379 ORTE_ERROR_LOG(ret);
380 error = "orte_grpcomm_base_open";
381 goto error;
382 }
383 if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
384 ORTE_ERROR_LOG(ret);
385 error = "orte_grpcomm_base_select";
386 goto error;
387 }
388
389
390
391 if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
392 error = "orte_errmgr_base_select";
393 goto error;
394 }
395
396 orte_job_data = OBJ_NEW(opal_hash_table_t);
397 if (ORTE_SUCCESS != (ret = opal_hash_table_init(orte_job_data, 128))) {
398 ORTE_ERROR_LOG(ret);
399 error = "setup job array";
400 goto error;
401 }
402 orte_node_pool = OBJ_NEW(opal_pointer_array_t);
403 if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
404 ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
405 ORTE_GLOBAL_ARRAY_MAX_SIZE,
406 ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
407 ORTE_ERROR_LOG(ret);
408 error = "setup node array";
409 goto error;
410 }
411 orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
412 if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
413 ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
414 ORTE_GLOBAL_ARRAY_MAX_SIZE,
415 ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
416 ORTE_ERROR_LOG(ret);
417 error = "setup node topologies array";
418 goto error;
419 }
420
421
422 jdata = OBJ_NEW(orte_job_t);
423 jdata->jobid = ORTE_PROC_MY_NAME->jobid;
424 opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
425
426
427
428
429 jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
430
431
432 app = OBJ_NEW(orte_app_context_t);
433 opal_pointer_array_set_item(jdata->apps, 0, app);
434 jdata->num_apps++;
435
436 node = OBJ_NEW(orte_node_t);
437 node->name = strdup(orte_process_info.nodename);
438 node->index = ORTE_PROC_MY_NAME->vpid;
439 opal_pointer_array_set_item(orte_node_pool, 0, node);
440
441
442 proc = OBJ_NEW(orte_proc_t);
443 proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
444 proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
445 proc->pid = orte_process_info.pid;
446 orte_oob_base_get_addr(&proc->rml_uri);
447 orte_process_info.my_hnp_uri = strdup(proc->rml_uri);
448
449 OBJ_CONSTRUCT(&val, opal_value_t);
450 val.key = OPAL_PMIX_PROC_URI;
451 val.type = OPAL_STRING;
452 val.data.string = proc->rml_uri;
453 if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &val))) {
454 ORTE_ERROR_LOG(ret);
455 val.key = NULL;
456 val.data.string = NULL;
457 OBJ_DESTRUCT(&val);
458 error = "store uri";
459 goto error;
460 }
461 val.key = NULL;
462 val.data.string = NULL;
463 OBJ_DESTRUCT(&val);
464
465 orte_process_info.my_daemon_uri = strdup(proc->rml_uri);
466 proc->state = ORTE_PROC_STATE_RUNNING;
467 OBJ_RETAIN(node);
468 proc->node = node;
469 opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
470
471
472
473
474
475
476 OBJ_RETAIN(proc);
477 node->daemon = proc;
478 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
479 node->state = ORTE_NODE_STATE_UP;
480
481 if (orte_retain_aliases) {
482 aliases = NULL;
483 opal_ifgetaliases(&aliases);
484 if (0 < opal_argv_count(aliases)) {
485
486 opal_argv_append_nosize(&aliases, orte_process_info.nodename);
487 aptr = opal_argv_join(aliases, ',');
488 orte_set_attribute(&node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, aptr, OPAL_STRING);
489 free(aptr);
490 }
491 opal_argv_free(aliases);
492 }
493
494 jdata->num_procs = 1;
495 jdata->state = ORTE_JOB_STATE_RUNNING;
496
497 jdata->num_reported = 1;
498
499
500
501
502
503
504 if (ORTE_SUCCESS != (ret = orte_plm.init())) {
505 ORTE_ERROR_LOG(ret);
506 error = "orte_plm_init";
507 goto error;
508 }
509
510
511
512
513
514
515 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_ras_base_framework, 0))) {
516 ORTE_ERROR_LOG(ret);
517 error = "orte_ras_base_open";
518 goto error;
519 }
520 if (ORTE_SUCCESS != (ret = orte_ras_base_select())) {
521 ORTE_ERROR_LOG(ret);
522 error = "orte_ras_base_find_available";
523 goto error;
524 }
525 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rmaps_base_framework, 0))) {
526 ORTE_ERROR_LOG(ret);
527 error = "orte_rmaps_base_open";
528 goto error;
529 }
530 if (ORTE_SUCCESS != (ret = orte_rmaps_base_select())) {
531 ORTE_ERROR_LOG(ret);
532 error = "orte_rmaps_base_find_available";
533 goto error;
534 }
535
536
537
538
539
540
541 t = OBJ_NEW(orte_topology_t);
542 t->topo = opal_hwloc_topology;
543
544 orte_topo_signature = opal_hwloc_base_get_topo_signature(opal_hwloc_topology);
545 t->sig = strdup(orte_topo_signature);
546 opal_pointer_array_add(orte_node_topologies, t);
547 node->topology = t;
548 if (15 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
549 opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
550 opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
551 }
552
553
554
555 if (NULL == orte_coprocessors) {
556 orte_coprocessors = OBJ_NEW(opal_hash_table_t);
557 opal_hash_table_init(orte_coprocessors, orte_process_info.num_procs);
558 }
559
560 coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
561 if (NULL != coprocessors) {
562
563
564
565 sns = opal_argv_split(coprocessors, ',');
566 for (idx=0; NULL != sns[idx]; idx++) {
567
568 OPAL_HASH_STR(sns[idx], h);
569
570 opal_hash_table_set_value_uint32(orte_coprocessors, h, (void*)&(ORTE_PROC_MY_NAME->vpid));
571 }
572 opal_argv_free(sns);
573 free(coprocessors);
574 orte_coprocessors_detected = true;
575 }
576
577 coprocessors = opal_hwloc_base_check_on_coprocessor();
578 if (NULL != coprocessors) {
579
580 OPAL_HASH_STR(coprocessors, h);
581
582 opal_hash_table_set_value_uint32(orte_coprocessors, h, (void*)&(ORTE_PROC_MY_NAME->vpid));
583 orte_set_attribute(&node->attributes, ORTE_NODE_SERIAL_NUMBER, ORTE_ATTR_LOCAL, coprocessors, OPAL_STRING);
584 free(coprocessors);
585 orte_coprocessors_detected = true;
586 }
587
588
589 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
590 ORTE_ERROR_LOG(ret);
591 error = "orte_odls_base_open";
592 goto error;
593 }
594 if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
595 ORTE_ERROR_LOG(ret);
596 error = "orte_odls_base_select";
597 goto error;
598 }
599
600 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rtc_base_framework, 0))) {
601 ORTE_ERROR_LOG(ret);
602 error = "orte_rtc_base_open";
603 goto error;
604 }
605 if (ORTE_SUCCESS != (ret = orte_rtc_base_select())) {
606 ORTE_ERROR_LOG(ret);
607 error = "orte_rtc_base_select";
608 goto error;
609 }
610
611
612 orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP,
613 ORTE_RML_PERSISTENT, orte_show_help_recv, NULL);
614
615 if (orte_create_session_dirs) {
616
617
618 opal_output_set_output_file_info(orte_process_info.proc_session_dir,
619 "output-", NULL, NULL);
620
621 if( NULL == orte_process_info.jobfam_session_dir ){
622
623 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
624 goto error;
625 }
626 contact_path = opal_os_path(false, orte_process_info.jobfam_session_dir, "contact.txt", NULL);
627 OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
628 "%s writing contact file %s",
629 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
630 contact_path));
631
632 if (ORTE_SUCCESS != (ret = orte_write_hnp_contact_file(contact_path))) {
633 OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
634 "%s writing contact file failed with error %s",
635 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
636 ORTE_ERROR_NAME(ret)));
637 } else {
638 OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
639 "%s wrote contact file",
640 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
641 }
642 free(contact_path);
643 }
644
645
646 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) {
647 ORTE_ERROR_LOG(ret);
648 error = "orte_iof_base_open";
649 goto error;
650 }
651 if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
652 ORTE_ERROR_LOG(ret);
653 error = "orte_iof_base_select";
654 goto error;
655 }
656
657 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
658 ORTE_ERROR_LOG(ret);
659 error = "orte_filem_base_open";
660 goto error;
661 }
662 if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
663 ORTE_ERROR_LOG(ret);
664 error = "orte_filem_base_select";
665 goto error;
666 }
667
668
669 orte_state.add_job_state(ORTE_JOB_STATE_READY_FOR_DEBUGGERS,
670 orte_debugger_init_after_spawn,
671 ORTE_SYS_PRI);
672 orte_state.add_job_state(ORTE_JOB_STATE_DEBUGGER_DETACH,
673 orte_debugger_detached,
674 ORTE_SYS_PRI);
675
676
677
678
679 if (orte_report_events) {
680 if (ORTE_SUCCESS != (ret = orte_util_comm_connect_tool(orte_report_events_uri))) {
681 error = "could not connect to tool";
682 goto error;
683 }
684 }
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702 opal_progress_set_yield_when_idle(false);
703 return ORTE_SUCCESS;
704
705 error:
706 if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
707 orte_show_help("help-orte-runtime.txt",
708 "orte_init:startup:internal-failure",
709 true, error, ORTE_ERROR_NAME(ret), ret);
710 }
711
712 if (NULL != orte_process_info.jobfam_session_dir) {
713 contact_path = opal_os_path(false, orte_process_info.jobfam_session_dir,
714 "contact.txt", NULL);
715 unlink(contact_path);
716 free(contact_path);
717 }
718
719 orte_session_dir_finalize(ORTE_PROC_MY_NAME);
720
721 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
722 return ORTE_ERR_SILENT;
723 }
724
725 static int rte_finalize(void)
726 {
727 char *contact_path;
728 orte_job_t *jdata;
729 uint32_t key;
730 orte_ess_base_signal_t *sig;
731 unsigned int i;
732
733 if (signals_set) {
734
735 opal_event_signal_del(&epipe_handler);
736
737 opal_event_del(&term_handler);
738
739 i = 0;
740 OPAL_LIST_FOREACH(sig, &orte_ess_base_signals, orte_ess_base_signal_t) {
741 opal_event_signal_del(forward_signals_events + i);
742 ++i;
743 }
744 free (forward_signals_events);
745 forward_signals_events = NULL;
746 signals_set = false;
747 }
748
749
750 pmix_server_finalize();
751 (void) mca_base_framework_close(&opal_pmix_base_framework);
752 (void) mca_base_framework_close(&orte_filem_base_framework);
753
754 fflush(stdout);
755 fflush(stderr);
756
757 (void) mca_base_framework_close(&orte_iof_base_framework);
758 (void) mca_base_framework_close(&orte_rtc_base_framework);
759 (void) mca_base_framework_close(&orte_odls_base_framework);
760 (void) mca_base_framework_close(&orte_rmaps_base_framework);
761 (void) mca_base_framework_close(&orte_ras_base_framework);
762 (void) mca_base_framework_close(&orte_grpcomm_base_framework);
763 (void) mca_base_framework_close(&orte_routed_base_framework);
764 (void) mca_base_framework_close(&orte_plm_base_framework);
765
766
767 orte_errmgr.finalize();
768
769
770 (void) mca_base_framework_close(&opal_pstat_base_framework);
771
772
773 if (NULL != orte_process_info.jobfam_session_dir) {
774 contact_path = opal_os_path(false, orte_process_info.jobfam_session_dir,
775 "contact.txt", NULL);
776 unlink(contact_path);
777 free(contact_path);
778 }
779
780
781 (void) mca_base_framework_close(&orte_rml_base_framework);
782 (void) mca_base_framework_close(&orte_oob_base_framework);
783 (void) mca_base_framework_close(&orte_errmgr_base_framework);
784 (void) mca_base_framework_close(&orte_state_base_framework);
785
786
787 orte_session_dir_finalize(ORTE_PROC_MY_NAME);
788
789 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
790
791
792 if (orte_xml_output) {
793 fprintf(orte_xml_fp, "</mpirun>\n");
794 fflush(orte_xml_fp);
795 if (stdout != orte_xml_fp) {
796 fclose(orte_xml_fp);
797 }
798 }
799
800
801 OPAL_HASH_TABLE_FOREACH(key, uint32, jdata, orte_job_data) {
802 if (NULL != jdata) {
803 OBJ_RELEASE(jdata);
804 }
805 }
806 OBJ_RELEASE(orte_job_data);
807
808 if (NULL != orte_process_info.super.proc_hostname) {
809 free(orte_process_info.super.proc_hostname);
810 }
811 if (orte_do_not_launch) {
812 exit(0);
813 }
814
815 {
816 opal_pointer_array_t * array = orte_node_topologies;
817 int i;
818 if( array->number_free != array->size ) {
819 OPAL_THREAD_LOCK(&array->lock);
820 array->lowest_free = 0;
821 array->number_free = array->size;
822 for(i=0; i<array->size; i++) {
823 if(NULL != array->addr[i]) {
824 orte_topology_t * topo = (orte_topology_t *)array->addr[i];
825 topo->topo = NULL;
826 OBJ_RELEASE(topo);
827 }
828 array->addr[i] = NULL;
829 }
830 OPAL_THREAD_UNLOCK(&array->lock);
831 }
832 }
833 OBJ_RELEASE(orte_node_topologies);
834
835 {
836 opal_pointer_array_t * array = orte_node_pool;
837 int i;
838 orte_node_t* node = (orte_node_t *)opal_pointer_array_get_item(orte_node_pool, 0);
839 assert(NULL != node);
840 OBJ_RELEASE(node->daemon);
841 node->daemon = NULL;
842 if( array->number_free != array->size ) {
843 OPAL_THREAD_LOCK(&array->lock);
844 array->lowest_free = 0;
845 array->number_free = array->size;
846 for(i=0; i<array->size; i++) {
847 if(NULL != array->addr[i]) {
848 node= (orte_node_t*)array->addr[i];
849 OBJ_RELEASE(node);
850 }
851 array->addr[i] = NULL;
852 }
853 OPAL_THREAD_UNLOCK(&array->lock);
854 }
855 }
856 OBJ_RELEASE(orte_node_pool);
857
858 free(orte_topo_signature);
859
860 return ORTE_SUCCESS;
861 }
862
863 static void rte_abort(int status, bool report)
864 {
865
866
867
868
869
870
871
872
873
874
875 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
876
877
878
879 orte_proc_info_finalize();
880
881 exit(status);
882 }
883
884 static void clean_abort(int fd, short flags, void *arg)
885 {
886
887
888
889 if (opal_atomic_trylock(&orte_abort_inprogress_lock)) {
890 if (forcibly_die) {
891
892 orte_odls.kill_local_procs(NULL);
893
894 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
895
896 opal_pmix.finalize();
897
898 exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
899 }
900 fprintf(stderr, "%s: abort is already in progress...hit ctrl-c again to forcibly terminate\n\n", orte_basename);
901 forcibly_die = true;
902
903 opal_event_add(&term_handler, NULL);
904 return;
905 }
906
907 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
908
909
910 orte_job_term_ordered = true;
911
912
913
914 orte_execute_quiet = true;
915
916
917
918
919
920 orte_plm.terminate_orteds();;
921 }
922
923 static struct timeval current, last={0,0};
924 static bool first = true;
925
926
927
928
929
930 static void abort_signal_callback(int fd)
931 {
932 uint8_t foo = 1;
933 char *msg = "Abort is in progress...hit ctrl-c again within 5 seconds to forcibly terminate\n\n";
934
935
936
937
938 if (first) {
939 first = false;
940 gettimeofday(¤t, NULL);
941 } else {
942
943 gettimeofday(¤t, NULL);
944
945
946
947
948 if ((current.tv_sec - last.tv_sec) < 5) {
949 exit(1);
950 }
951 write(1, (void*)msg, strlen(msg));
952 }
953
954 last.tv_sec = current.tv_sec;
955
956 write(term_pipe[1], &foo, 1);
957 }
958
959
960
961
962 static int sigpipe_error_count=0;
963 static void epipe_signal_callback(int fd, short flags, void *arg)
964 {
965 sigpipe_error_count++;
966
967 if (10 < sigpipe_error_count) {
968
969 opal_output(0, "%s: SIGPIPE detected on fd %d - aborting", orte_basename, fd);
970 clean_abort(0, 0, NULL);
971 }
972
973 return;
974 }
975
976
977
978
979 static void signal_forward_callback(int fd, short event, void *arg)
980 {
981 opal_event_t *signal = (opal_event_t*)arg;
982 int signum, ret;
983
984 signum = OPAL_EVENT_SIGNAL(signal);
985 if (!orte_execute_quiet){
986 fprintf(stderr, "%s: Forwarding signal %d to job\n",
987 orte_basename, signum);
988 }
989
990
991 if (ORTE_SUCCESS != (ret = orte_plm.signal_job(ORTE_JOBID_WILDCARD, signum))) {
992 fprintf(stderr, "Signal %d could not be sent to the job (returned %d)",
993 signum, ret);
994 }
995 }