This source file includes the following definitions.
- init
- finalize
- wakeup
- hnp_abort
- job_errors
- proc_errors
- default_hnp_abort
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 #include "orte_config.h"
24
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
33
34 #include "opal/util/output.h"
35 #include "opal/dss/dss.h"
36
37 #include "orte/mca/iof/base/base.h"
38 #include "orte/mca/rml/rml.h"
39 #include "orte/mca/odls/odls.h"
40 #include "orte/mca/odls/base/base.h"
41 #include "orte/mca/odls/base/odls_private.h"
42 #include "orte/mca/plm/base/plm_private.h"
43 #include "orte/mca/plm/plm.h"
44 #include "orte/mca/rmaps/rmaps_types.h"
45 #include "orte/mca/routed/routed.h"
46 #include "orte/mca/grpcomm/grpcomm.h"
47 #include "orte/mca/ess/ess.h"
48 #include "orte/mca/state/state.h"
49
50 #include "orte/util/error_strings.h"
51 #include "orte/util/name_fns.h"
52 #include "orte/util/proc_info.h"
53 #include "orte/util/show_help.h"
54 #include "orte/util/threads.h"
55
56 #include "orte/runtime/orte_globals.h"
57 #include "orte/runtime/orte_locks.h"
58 #include "orte/runtime/orte_quit.h"
59 #include "orte/runtime/data_type_support/orte_dt_support.h"
60
61 #include "orte/mca/errmgr/errmgr.h"
62 #include "orte/mca/errmgr/base/base.h"
63 #include "orte/mca/errmgr/base/errmgr_private.h"
64
65 #include "errmgr_default_hnp.h"
66
67 static int init(void);
68 static int finalize(void);
69 static void hnp_abort(int error_code, char *fmt, ...);
70
71
72
73
74 orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
75 .init = init,
76 .finalize = finalize,
77 .logfn = orte_errmgr_base_log,
78 .abort = hnp_abort,
79 .abort_peers = orte_errmgr_base_abort_peers
80 };
81
82
83
84
85
86 static void default_hnp_abort(orte_job_t *jdata);
87 static void job_errors(int fd, short args, void *cbdata);
88 static void proc_errors(int fd, short args, void *cbdata);
89
90
91
92
93 static int init(void)
94 {
95
96 orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI);
97
98
99
100
101 orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI);
102
103
104 orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
105
106 return ORTE_SUCCESS;
107 }
108
109 static int finalize(void)
110 {
111 return ORTE_SUCCESS;
112 }
113
114 static void wakeup(int sd, short args, void *cbdata)
115 {
116
117 ORTE_ACQUIRE_OBJECT(cbdata);
118 orte_quit(0, 0, NULL);
119 }
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134 static void hnp_abort(int error_code, char *fmt, ...)
135 {
136 va_list arglist;
137 char *outmsg = NULL;
138 orte_timer_t *timer;
139
140
141 if (orte_abnormal_term_ordered) {
142 return;
143 }
144
145
146 ORTE_UPDATE_EXIT_STATUS(error_code);
147
148
149 orte_abnormal_term_ordered = true;
150
151
152 va_start(arglist, fmt);
153 if (NULL != fmt) {
154 opal_vasprintf(&outmsg, fmt, arglist);
155 }
156 va_end(arglist);
157
158
159 orte_show_help("help-errmgr-base.txt", "simple-message", true, outmsg);
160
161
162
163 if (orte_never_launched) {
164 orte_quit(0, 0, NULL);
165 return;
166 }
167
168
169 if (ORTE_SUCCESS != orte_plm.terminate_orteds()) {
170 orte_quit(0, 0, NULL);
171 return;
172 }
173
174
175
176 if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
177 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
178 return;
179 }
180 timer->tv.tv_sec = 5;
181 timer->tv.tv_usec = 0;
182 opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
183 opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
184 ORTE_POST_OBJECT(timer);
185 opal_event_evtimer_add(timer->ev, &timer->tv);
186 }
187
188
189 static void job_errors(int fd, short args, void *cbdata)
190 {
191 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
192 orte_job_t *jdata;
193 orte_job_state_t jobstate;
194 orte_exit_code_t sts;
195 orte_proc_t *aborted_proc;
196 opal_buffer_t *answer;
197 int32_t rc, ret;
198 int room, *rmptr;
199
200 ORTE_ACQUIRE_OBJECT(caddy);
201
202
203
204
205 if (orte_finalizing) {
206 return;
207 }
208
209
210 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
211
212
213
214
215 if (NULL == caddy->jdata) {
216 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
217 OBJ_RELEASE(caddy);
218 return;
219 }
220
221
222 jdata = caddy->jdata;
223 jobstate = caddy->job_state;
224 jdata->state = jobstate;
225
226 OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
227 "%s errmgr:default_hnp: job %s reported state %s",
228 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
229 ORTE_JOBID_PRINT(jdata->jobid),
230 orte_job_state_to_str(jobstate)));
231
232 if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
233 ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
234 ORTE_JOB_STATE_MAP_FAILED == jobstate ||
235 ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
236 if (1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
237
238 orte_never_launched = true;
239 }
240
241
242
243
244
245 orte_routing_is_enabled = false;
246 jdata->num_terminated = jdata->num_procs;
247
248 ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
249
250 if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
251 rc = jobstate;
252 answer = OBJ_NEW(opal_buffer_t);
253 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
254 ORTE_ERROR_LOG(ret);
255 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
256 OBJ_RELEASE(caddy);
257 return;
258 }
259 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
260 ORTE_ERROR_LOG(ret);
261 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
262 OBJ_RELEASE(caddy);
263 return;
264 }
265
266 rmptr = &room;
267 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
268 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
269 ORTE_ERROR_LOG(ret);
270 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
271 OBJ_RELEASE(caddy);
272 return;
273 }
274 }
275 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
276 "%s errmgr:hnp sending dyn error release of job %s to %s",
277 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
278 ORTE_JOBID_PRINT(jdata->jobid),
279 ORTE_NAME_PRINT(&jdata->originator)));
280 if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
281 ORTE_RML_TAG_LAUNCH_RESP,
282 orte_rml_send_callback, NULL))) {
283 ORTE_ERROR_LOG(ret);
284 OBJ_RELEASE(answer);
285 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
286 }
287 }
288 OBJ_RELEASE(caddy);
289 return;
290 }
291
292 if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
293 ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
294
295
296
297
298 aborted_proc = NULL;
299 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
300 sts = aborted_proc->exit_code;
301 if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
302 if (WIFSIGNALED(sts)) {
303 #ifdef WCOREDUMP
304 if (WCOREDUMP(sts)) {
305 orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
306 WTERMSIG(sts));
307 sts = WTERMSIG(sts);
308 } else {
309 orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
310 WTERMSIG(sts));
311 sts = WTERMSIG(sts);
312 }
313 #else
314 orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
315 WTERMSIG(sts));
316 sts = WTERMSIG(sts);
317 #endif
318 } else {
319 orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
320 WEXITSTATUS(sts));
321 sts = WEXITSTATUS(sts);
322 }
323 }
324 }
325
326
327
328 if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
329 orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
330 }
331 }
332
333
334
335
336
337
338 if (ORTE_JOB_STATE_ABORTED == jobstate &&
339 jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
340 jdata->num_procs != jdata->num_reported) {
341 orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
342 }
343
344
345 ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
346
347 orte_abnormal_term_ordered = true;
348 OBJ_RELEASE(caddy);
349 }
350
351 static void proc_errors(int fd, short args, void *cbdata)
352 {
353 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
354 orte_job_t *jdata;
355 orte_proc_t *pptr, *proct;
356 orte_process_name_t *proc = &caddy->name;
357 orte_proc_state_t state = caddy->proc_state;
358 int i;
359 int32_t i32, *i32ptr;
360
361 ORTE_ACQUIRE_OBJECT(caddy);
362
363 OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
364 "%s errmgr:default_hnp: for proc %s state %s",
365 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
366 ORTE_NAME_PRINT(proc),
367 orte_proc_state_to_str(state)));
368
369
370
371
372 if (orte_finalizing) {
373 goto cleanup;
374 }
375
376
377 if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
378
379 goto cleanup;
380 }
381 pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
382
383
384
385
386
387 if (ORTE_PROC_STATE_COMM_FAILED == state) {
388
389 if (ORTE_PROC_MY_NAME->jobid != proc->jobid) {
390
391 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
392 "%s Comm failure to non-daemon proc - ignoring it",
393 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
394 goto cleanup;
395 }
396
397 if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
398 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
399 "%s Comm failure on my own connection - ignoring it",
400 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
401 goto cleanup;
402 }
403
404 ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
405
406
407 if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
408 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
409 "%s Comm failure: daemons terminating - recording daemon %s as gone",
410 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
411
412 orte_routed.route_lost(proc);
413
414 if (0 == orte_routed.num_routes()) {
415 for (i=0; i < orte_local_children->size; i++) {
416 if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
417 ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
418
419 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
420 "%s Comm failure: at least one proc (%s) still alive",
421 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
422 ORTE_NAME_PRINT(&proct->name)));
423 goto cleanup;
424 }
425 }
426
427 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
428 "%s errmgr_hnp: all routes and children gone - ordering exit",
429 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
430 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
431 } else {
432 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
433 "%s Comm failure: %d routes remain alive",
434 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
435 (int)orte_routed.num_routes()));
436 }
437 goto cleanup;
438 }
439 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
440 "%s Comm failure: daemon %s - aborting",
441 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
442
443 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
444
445 jdata->state = ORTE_JOB_STATE_COMM_FAILED;
446
447 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
448
449 OBJ_RETAIN(pptr);
450 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
451 if (!orte_enable_recovery) {
452
453 orte_show_help("help-errmgr-base.txt", "node-died", true,
454 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
455 orte_process_info.nodename,
456 ORTE_NAME_PRINT(proc),
457 pptr->node->name);
458
459 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
460
461
462 ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
463 }
464 }
465
466
467 if (!orte_enable_recovery) {
468 default_hnp_abort(jdata);
469 }
470 goto cleanup;
471 }
472
473
474
475
476 if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
477 pptr->state = state;
478 }
479
480
481
482
483 if (orte_orteds_term_ordered) {
484 for (i=0; i < orte_local_children->size; i++) {
485 if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
486 if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
487 goto keep_going;
488 }
489 }
490 }
491
492
493 if (0 == orte_routed.num_routes()) {
494 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
495 "%s errmgr:default:hnp all routes gone - exiting",
496 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
497 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
498 }
499 }
500
501 keep_going:
502
503
504 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
505 ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE)) {
506
507 ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
508
509
510
511
512 if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
513 ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_IOF_COMPLETE);
514 }
515 goto cleanup;
516 }
517
518
519
520
521 switch (state) {
522 case ORTE_PROC_STATE_KILLED_BY_CMD:
523 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
524 "%s errmgr:hnp: proc %s killed by cmd",
525 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
526 ORTE_NAME_PRINT(proc)));
527
528
529
530 if (jdata->num_terminated >= jdata->num_procs) {
531
532 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
533 }
534
535 break;
536
537 case ORTE_PROC_STATE_ABORTED:
538 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
539 "%s errmgr:hnp: proc %s aborted",
540 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
541 ORTE_NAME_PRINT(proc)));
542 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
543 jdata->state = ORTE_JOB_STATE_ABORTED;
544
545 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
546
547 OBJ_RETAIN(pptr);
548 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
549 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
550
551
552 default_hnp_abort(jdata);
553 }
554 break;
555
556 case ORTE_PROC_STATE_ABORTED_BY_SIG:
557 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
558 "%s errmgr:hnp: proc %s aborted by signal",
559 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
560 ORTE_NAME_PRINT(proc)));
561
562 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
563
564 i32 = 0;
565 i32ptr = &i32;
566 orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
567 ++i32;
568 orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
569 if (orte_abort_non_zero_exit) {
570
571 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
572 jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
573
574 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
575
576 OBJ_RETAIN(pptr);
577 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
578 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
579
580
581 default_hnp_abort(jdata);
582 }
583 } else {
584
585 if (jdata->num_terminated >= jdata->num_procs) {
586
587 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
588 }
589 }
590 break;
591
592 case ORTE_PROC_STATE_TERM_WO_SYNC:
593 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
594 "%s errmgr:hnp: proc %s terminated without sync",
595 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
596 ORTE_NAME_PRINT(proc)));
597 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
598 jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
599
600 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
601
602 OBJ_RETAIN(pptr);
603 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
604 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
605
606
607
608
609
610 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
611
612
613 default_hnp_abort(jdata);
614 }
615 break;
616
617 case ORTE_PROC_STATE_FAILED_TO_START:
618 case ORTE_PROC_STATE_FAILED_TO_LAUNCH:
619 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
620 "%s errmgr:hnp: proc %s %s",
621 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
622 ORTE_NAME_PRINT(proc),
623 orte_proc_state_to_str(state)));
624 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
625 if (ORTE_PROC_STATE_FAILED_TO_START) {
626 jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
627 } else {
628 jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH;
629 }
630
631 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
632
633 OBJ_RETAIN(pptr);
634 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
635 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
636
637
638 default_hnp_abort(jdata);
639 }
640
641 if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
642
643 orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
644 }
645 break;
646
647 case ORTE_PROC_STATE_CALLED_ABORT:
648 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
649 "%s errmgr:hnp: proc %s called abort with exit code %d",
650 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
651 ORTE_NAME_PRINT(proc), pptr->exit_code));
652 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
653 jdata->state = ORTE_JOB_STATE_CALLED_ABORT;
654
655 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
656
657 OBJ_RETAIN(pptr);
658 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
659 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
660
661
662 default_hnp_abort(jdata);
663 }
664 break;
665
666 case ORTE_PROC_STATE_TERM_NON_ZERO:
667 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
668 "%s errmgr:hnp: proc %s exited with non-zero status %d",
669 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
670 ORTE_NAME_PRINT(proc),
671 pptr->exit_code));
672 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
673
674 i32 = 0;
675 i32ptr = &i32;
676 orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
677 ++i32;
678 orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
679 if (orte_abort_non_zero_exit) {
680 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
681 jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
682
683 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
684
685 OBJ_RETAIN(pptr);
686 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
687
688
689 default_hnp_abort(jdata);
690 }
691 } else {
692
693 if (jdata->num_terminated >= jdata->num_procs) {
694
695 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
696 }
697 }
698 break;
699
700 case ORTE_PROC_STATE_HEARTBEAT_FAILED:
701 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
702 "%s errmgr:hnp: proc %s heartbeat failed",
703 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
704 ORTE_NAME_PRINT(proc)));
705 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
706 jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED;
707
708 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
709
710 OBJ_RETAIN(pptr);
711 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
712 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
713
714
715 default_hnp_abort(jdata);
716 }
717
718 orte_routed.route_lost(proc);
719 break;
720
721 case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
722 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
723 "%s errmgr:hnp: unable to send message to proc %s",
724 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
725 ORTE_NAME_PRINT(proc)));
726
727
728
729 if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
730 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
731 break;
732 }
733 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
734
735
736 default_hnp_abort(jdata);
737 }
738 break;
739
740 case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
741 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
742 "%s errmgr:hnp: no message path to proc %s",
743 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
744 ORTE_NAME_PRINT(proc)));
745 orte_show_help("help-errmgr-base.txt", "no-path", true,
746 orte_process_info.nodename, pptr->node->name);
747
748
749
750 if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
751 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
752 break;
753 }
754 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
755
756
757 default_hnp_abort(jdata);
758 }
759 break;
760
761 case ORTE_PROC_STATE_FAILED_TO_CONNECT:
762 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
763 "%s errmgr:hnp: cannot connect to proc %s",
764 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
765 ORTE_NAME_PRINT(proc)));
766 orte_show_help("help-errmgr-base.txt", "no-connect", true,
767 orte_process_info.nodename, pptr->node->name);
768
769
770
771 if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
772 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
773 break;
774 }
775 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
776
777
778 default_hnp_abort(jdata);
779 }
780 break;
781
782 default:
783
784 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
785 "%s errmgr:hnp: proc %s default error %s",
786 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
787 ORTE_NAME_PRINT(proc),
788 orte_proc_state_to_str(state)));
789 if (jdata->num_terminated == jdata->num_procs) {
790 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
791 }
792 break;
793 }
794
795 if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
796 ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
797 }
798
799 cleanup:
800 OBJ_RELEASE(caddy);
801 }
802
803
804
805
806 static void default_hnp_abort(orte_job_t *jdata)
807 {
808 int rc;
809 int32_t i32, *i32ptr;
810
811
812 if (opal_atomic_trylock(&orte_abort_inprogress_lock)) {
813 OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
814 "%s errmgr:default_hnp: abort in progress, ignoring abort on job %s",
815 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
816 ORTE_JOBID_PRINT(jdata->jobid)));
817 return;
818 }
819
820 OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
821 "%s errmgr:default_hnp: abort called on job %s",
822 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
823 ORTE_JOBID_PRINT(jdata->jobid)));
824
825
826 orte_job_term_ordered = true;
827 orte_enable_recovery = false;
828
829
830
831
832
833 if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
834 orte_abnormal_term_ordered = true;
835 }
836
837 i32 = 0;
838 i32ptr = &i32;
839 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32)) {
840
841 orte_show_help("help-errmgr-base.txt", "normal-termination-but", true,
842 (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "Primary" : "Child",
843 (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
844 i32, (1 == i32) ? "process returned\na non-zero exit code" :
845 "processes returned\nnon-zero exit codes");
846 }
847
848 OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
849 "%s errmgr:default_hnp: ordering orted termination",
850 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
851
852
853
854
855 if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) {
856 ORTE_ERROR_LOG(rc);
857 }
858 }