This source file includes the following definitions.
- init
- finalize
- wakeup
- orted_abort
- job_errors
- proc_errors
- any_live_children
- pack_state_for_proc
- pack_state_update
- failed_start
- killprocs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#include "orte_config.h"

#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>

#include "opal/util/output.h"
#include "opal/dss/dss.h"

#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"

#include "orte/mca/iof/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/state/state.h"

#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_quit.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"

#include "errmgr_default_orted.h"
58
59
60
61
/*
 * Module entry points implemented in this file and published through
 * orte_errmgr_default_orted_module.
 */
static int  init(void);
static int  finalize(void);
static void orted_abort(int error_code, char *fmt, ...);
65
66
67
68
69 orte_errmgr_base_module_t orte_errmgr_default_orted_module = {
70 .init = init,
71 .finalize = finalize,
72 .logfn = orte_errmgr_base_log,
73 .abort = orted_abort,
74 .abort_peers = orte_errmgr_base_abort_peers
75 };
76
77
78 static bool any_live_children(orte_jobid_t job);
79 static int pack_state_update(opal_buffer_t *alert, orte_job_t *jobdat);
80 static int pack_state_for_proc(opal_buffer_t *alert, orte_proc_t *child);
81 static void failed_start(orte_job_t *jobdat);
82 static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
83
84 static void job_errors(int fd, short args, void *cbdata);
85 static void proc_errors(int fd, short args, void *cbdata);
86
87
88
89
90 static int init(void)
91 {
92
93 orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI);
94
95
96
97
98 orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI);
99
100
101 orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
102
103 return ORTE_SUCCESS;
104 }
105
106 static int finalize(void)
107 {
108 return ORTE_SUCCESS;
109 }
110
/*
 * Timer callback armed by orted_abort(): once the timer fires, shut the
 * daemon down via orte_quit().
 *
 * sd and args are the event-library callback arguments and are not used;
 * cbdata is only passed through ORTE_ACQUIRE_OBJECT.
 */
static void wakeup(int sd, short args, void *cbdata)
{
    (void)sd;
    (void)args;

    ORTE_ACQUIRE_OBJECT(cbdata);

    orte_quit(0, 0, NULL);
}
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131 static void orted_abort(int error_code, char *fmt, ...)
132 {
133 va_list arglist;
134 char *outmsg = NULL;
135 orte_plm_cmd_flag_t cmd;
136 opal_buffer_t *alert;
137 orte_vpid_t null=ORTE_VPID_INVALID;
138 orte_proc_state_t state = ORTE_PROC_STATE_CALLED_ABORT;
139 orte_timer_t *timer;
140 int rc;
141
142
143 if (orte_abnormal_term_ordered) {
144 return;
145 }
146
147
148 orte_abnormal_term_ordered = true;
149
150
151 va_start(arglist, fmt);
152 if (NULL != fmt) {
153 opal_vasprintf(&outmsg, fmt, arglist);
154 }
155 va_end(arglist);
156
157
158 orte_show_help("help-errmgr-base.txt", "simple-message", true, outmsg);
159
160
161 alert = OBJ_NEW(opal_buffer_t);
162
163 cmd = ORTE_PLM_UPDATE_PROC_STATE;
164 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
165 ORTE_ERROR_LOG(rc);
166 OBJ_RELEASE(alert);
167 goto cleanup;
168 }
169
170 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->jobid, 1, ORTE_JOBID))) {
171 ORTE_ERROR_LOG(rc);
172 OBJ_RELEASE(alert);
173 goto cleanup;
174 }
175
176 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->vpid, 1, ORTE_VPID))) {
177 ORTE_ERROR_LOG(rc);
178 OBJ_RELEASE(alert);
179 goto cleanup;
180 }
181
182 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &orte_process_info.pid, 1, OPAL_PID))) {
183 ORTE_ERROR_LOG(rc);
184 OBJ_RELEASE(alert);
185 goto cleanup;
186 }
187
188 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &state, 1, ORTE_PROC_STATE))) {
189 ORTE_ERROR_LOG(rc);
190 OBJ_RELEASE(alert);
191 goto cleanup;
192 }
193
194 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &error_code, 1, ORTE_EXIT_CODE))) {
195 ORTE_ERROR_LOG(rc);
196 OBJ_RELEASE(alert);
197 goto cleanup;
198 }
199
200 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
201 ORTE_ERROR_LOG(rc);
202 OBJ_RELEASE(alert);
203 goto cleanup;
204 }
205
206
207 if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
208 ORTE_RML_TAG_PLM,
209 orte_rml_send_callback, NULL))) {
210 ORTE_ERROR_LOG(rc);
211 OBJ_RELEASE(alert);
212
213 orte_quit(0, 0, NULL);
214 return;
215 }
216
217 cleanup:
218
219
220 if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
221 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
222 return;
223 }
224 timer->tv.tv_sec = 5;
225 timer->tv.tv_usec = 0;
226 opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
227 opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
228 ORTE_POST_OBJECT(timer);
229 opal_event_evtimer_add(timer->ev, &timer->tv);
230
231 }
232
233 static void job_errors(int fd, short args, void *cbdata)
234 {
235 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
236 orte_job_t *jdata;
237 orte_job_state_t jobstate;
238 int rc;
239 orte_plm_cmd_flag_t cmd;
240 opal_buffer_t *alert;
241
242 ORTE_ACQUIRE_OBJECT(caddy);
243
244
245
246
247 if (orte_finalizing) {
248 return;
249 }
250
251
252
253
254 if (NULL == caddy->jdata) {
255 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
256 OBJ_RELEASE(caddy);
257 return;
258 }
259
260
261 jdata = caddy->jdata;
262 jobstate = caddy->job_state;
263 jdata->state = jobstate;
264
265 OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
266 "%s errmgr:default_orted: job %s reported error state %s",
267 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
268 ORTE_JOBID_PRINT(jdata->jobid),
269 orte_job_state_to_str(jobstate)));
270
271 switch (jobstate) {
272 case ORTE_JOB_STATE_FAILED_TO_START:
273 failed_start(jdata);
274 break;
275 case ORTE_JOB_STATE_COMM_FAILED:
276
277 killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
278
279 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
280 goto cleanup;
281 break;
282 case ORTE_JOB_STATE_HEARTBEAT_FAILED:
283
284 goto cleanup;
285 break;
286
287 default:
288 break;
289 }
290 alert = OBJ_NEW(opal_buffer_t);
291
292 cmd = ORTE_PLM_UPDATE_PROC_STATE;
293 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
294 ORTE_ERROR_LOG(rc);
295 OBJ_RELEASE(alert);
296 goto cleanup;
297 }
298
299 if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
300 ORTE_ERROR_LOG(rc);
301 OBJ_RELEASE(alert);
302 goto cleanup;
303 }
304
305 if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
306 ORTE_RML_TAG_PLM,
307 orte_rml_send_callback, NULL))) {
308 ORTE_ERROR_LOG(rc);
309 OBJ_RELEASE(alert);
310 }
311
312 cleanup:
313 OBJ_RELEASE(caddy);
314 }
315
316 static void proc_errors(int fd, short args, void *cbdata)
317 {
318 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
319 orte_job_t *jdata;
320 orte_process_name_t *proc = &caddy->name;
321 orte_proc_state_t state = caddy->proc_state;
322 orte_proc_t *child, *ptr;
323 opal_buffer_t *alert;
324 orte_plm_cmd_flag_t cmd;
325 int rc=ORTE_SUCCESS;
326 int i;
327 orte_wait_tracker_t *t2;
328
329 ORTE_ACQUIRE_OBJECT(caddy);
330
331 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
332 "%s errmgr:default_orted:proc_errors process %s error state %s",
333 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
334 ORTE_NAME_PRINT(proc),
335 orte_proc_state_to_str(state)));
336
337
338
339
340 if (orte_finalizing) {
341 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
342 "%s errmgr:default_orted:proc_errors finalizing - ignoring error",
343 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
344 goto cleanup;
345 }
346
347
348 if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
349 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
350 "%s errmgr:default_orted:proc_errors heartbeat failed - ignoring error",
351 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
352 goto cleanup;
353 }
354
355
356
357
358 if (ORTE_PROC_STATE_LIFELINE_LOST == state ||
359 ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state ||
360 ORTE_PROC_STATE_NO_PATH_TO_TARGET == state ||
361 ORTE_PROC_STATE_PEER_UNKNOWN == state ||
362 ORTE_PROC_STATE_FAILED_TO_CONNECT == state) {
363 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
364 "%s errmgr:orted lifeline lost or unable to communicate - exiting",
365 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
366
367 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
368
369 killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
370
371
372
373 orte_quit(0, 0, NULL);
374 goto cleanup;
375 }
376
377
378 if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
379
380 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
381 "%s errmgr:default_orted:proc_errors NULL jdata - ignoring error",
382 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
383 goto cleanup;
384 }
385
386 if (ORTE_PROC_STATE_COMM_FAILED == state) {
387
388 if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
389 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
390 "%s errmgr:default_orted:proc_errors comm_failed to self - ignoring error",
391 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
392 goto cleanup;
393 }
394
395 if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
396
397
398
399
400 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
401 "%s errmgr:default_orted:proc_errors comm_failed to non-daemon - handling as waitpid",
402 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
403
404 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
405 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
406 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
407 goto cleanup;
408 }
409
410 t2 = OBJ_NEW(orte_wait_tracker_t);
411 OBJ_RETAIN(child);
412 t2->child = child;
413 t2->evb = orte_event_base;
414 opal_event_set(t2->evb, &t2->ev, -1,
415 OPAL_EV_WRITE, orte_odls_base_default_wait_local_proc, t2);
416 opal_event_set_priority(&t2->ev, ORTE_MSG_PRI);
417 opal_event_active(&t2->ev, OPAL_EV_WRITE, 1);
418 goto cleanup;
419 }
420 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
421 "%s errmgr:default:orted daemon %s exited",
422 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
423 ORTE_NAME_PRINT(proc)));
424
425 if (orte_orteds_term_ordered) {
426
427 for (i=0; i < orte_local_children->size; i++) {
428 if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
429 if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
430 OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
431 "%s errmgr:default:orted[%s(%d)] proc %s is alive",
432 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
433 __FILE__, __LINE__,
434 ORTE_NAME_PRINT(&child->name)));
435 goto cleanup;
436 }
437 }
438 }
439
440
441 if (0 == orte_routed.num_routes()) {
442 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
443 "%s errmgr:default:orted all routes gone - exiting",
444 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
445 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
446 } else {
447 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
448 "%s errmgr:default:orted not exiting, num_routes() == %d",
449 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
450 (int)orte_routed.num_routes()));
451 }
452 }
453
454 goto cleanup;
455 }
456
457 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
458 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
459 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
460 goto cleanup;
461 }
462
463
464
465 if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_LOCAL)) {
466 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
467 "%s errmgr:default_orted:proc_errors proc is not local - ignoring error",
468 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
469 goto cleanup;
470 }
471
472 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
473 "%s errmgr:default_orted got state %s for proc %s",
474 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
475 orte_proc_state_to_str(state),
476 ORTE_NAME_PRINT(proc)));
477
478 if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
479
480 child->state = state;
481
482
483 if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
484 alert = OBJ_NEW(opal_buffer_t);
485
486 cmd = ORTE_PLM_UPDATE_PROC_STATE;
487 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
488 ORTE_ERROR_LOG(rc);
489 return;
490 }
491
492
493
494 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
495 ORTE_ERROR_LOG(rc);
496 return;
497 }
498
499
500 if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
501 ORTE_ERROR_LOG(rc);
502 return;
503 }
504
505 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
506 "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)",
507 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
508 ORTE_NAME_PRINT(&child->name),
509 jdata->num_local_procs));
510 if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
511 ORTE_RML_TAG_PLM,
512 orte_rml_send_callback, NULL))) {
513 ORTE_ERROR_LOG(rc);
514 OBJ_RELEASE(alert);
515 }
516
517 orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
518 }
519
520 if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
521 ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
522 !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
523 ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
524 }
525 goto cleanup;
526 }
527
528 if (ORTE_PROC_STATE_FAILED_TO_START == state ||
529 ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) {
530
531 child->state = state;
532
533 jdata->num_terminated++;
534
535
536
537
538
539
540 if (jdata->num_local_procs == jdata->num_terminated) {
541
542 if (ORTE_PROC_STATE_FAILED_TO_START == state) {
543 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
544 } else {
545 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
546 }
547 }
548 goto cleanup;
549 }
550
551 if (ORTE_PROC_STATE_TERMINATED < state) {
552
553
554
555 if (orte_orteds_term_ordered) {
556
557
558
559 if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
560 ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
561 }
562 if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
563 ORTE_FLAG_SET(child, ORTE_PROC_FLAG_RECORDED);
564 jdata->num_terminated++;
565 }
566 for (i=0; i < orte_local_children->size; i++) {
567 if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
568 if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
569 goto keep_going;
570 }
571 }
572 }
573
574
575 if (0 == orte_routed.num_routes()) {
576 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
577 "%s errmgr:default:orted all routes gone - exiting",
578 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
579 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
580 }
581
582 goto cleanup;
583 }
584
585 keep_going:
586
587
588
589
590 if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
591 alert = OBJ_NEW(opal_buffer_t);
592
593 cmd = ORTE_PLM_UPDATE_PROC_STATE;
594 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
595 ORTE_ERROR_LOG(rc);
596 return;
597 }
598
599
600
601 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
602 ORTE_ERROR_LOG(rc);
603 return;
604 }
605 child->state = state;
606
607 if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
608 ORTE_ERROR_LOG(rc);
609 return;
610 }
611 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
612 "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
613 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
614 ORTE_NAME_PRINT(&child->name),
615 jdata->num_local_procs));
616
617 if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
618 ORTE_RML_TAG_PLM,
619 orte_rml_send_callback, NULL))) {
620 ORTE_ERROR_LOG(rc);
621 }
622
623 orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
624 }
625
626 if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
627 ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
628 !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
629 ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
630 }
631 goto cleanup;
632 }
633
634
635 if (!any_live_children(proc->jobid)) {
636 alert = OBJ_NEW(opal_buffer_t);
637
638 cmd = ORTE_PLM_UPDATE_PROC_STATE;
639 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
640 ORTE_ERROR_LOG(rc);
641 return;
642 }
643
644 if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
645 ORTE_ERROR_LOG(rc);
646 return;
647 }
648
649 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
650 "%s errmgr:default_orted reporting all procs in %s terminated",
651 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
652 ORTE_JOBID_PRINT(jdata->jobid)));
653
654
655 for (i=0; i < orte_local_children->size; i++) {
656 if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
657 continue;
658 }
659 if (jdata->jobid == ptr->name.jobid) {
660 opal_pointer_array_set_item(orte_local_children, i, NULL);
661 OBJ_RELEASE(ptr);
662 }
663 }
664
665
666 orte_session_dir_cleanup(jdata->jobid);
667
668
669 OBJ_RELEASE(jdata);
670
671
672 if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
673 ORTE_RML_TAG_PLM,
674 orte_rml_send_callback, NULL))) {
675 ORTE_ERROR_LOG(rc);
676 }
677 return;
678 }
679
680 cleanup:
681 OBJ_RELEASE(caddy);
682 }
683
684
685
686
687 static bool any_live_children(orte_jobid_t job)
688 {
689 int i;
690 orte_proc_t *child;
691
692 for (i=0; i < orte_local_children->size; i++) {
693 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
694 continue;
695 }
696
697 if ((job == child->name.jobid || ORTE_JOBID_WILDCARD == job) &&
698 ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
699 return true;
700 }
701 }
702
703
704 return false;
705
706 }
707
708 static int pack_state_for_proc(opal_buffer_t *alert, orte_proc_t *child)
709 {
710 int rc;
711
712
713 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name.vpid), 1, ORTE_VPID))) {
714 ORTE_ERROR_LOG(rc);
715 return rc;
716 }
717
718 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) {
719 ORTE_ERROR_LOG(rc);
720 return rc;
721 }
722
723 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) {
724 ORTE_ERROR_LOG(rc);
725 return rc;
726 }
727
728 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE))) {
729 ORTE_ERROR_LOG(rc);
730 return rc;
731 }
732
733 return ORTE_SUCCESS;
734 }
735
736 static int pack_state_update(opal_buffer_t *alert, orte_job_t *jobdat)
737 {
738 int rc, i;
739 orte_proc_t *child;
740 orte_vpid_t null=ORTE_VPID_INVALID;
741
742
743 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) {
744 ORTE_ERROR_LOG(rc);
745 return rc;
746 }
747 for (i=0; i < orte_local_children->size; i++) {
748 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
749 continue;
750 }
751
752 if (child->name.jobid == jobdat->jobid) {
753 if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
754 ORTE_ERROR_LOG(rc);
755 return rc;
756 }
757 }
758 }
759
760 if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
761 ORTE_ERROR_LOG(rc);
762 return rc;
763 }
764
765 return ORTE_SUCCESS;
766 }
767
768 static void failed_start(orte_job_t *jobdat)
769 {
770 int i;
771 orte_proc_t *child;
772
773
774 jobdat->state = ORTE_JOB_STATE_FAILED_TO_START;
775
776 for (i=0; i < orte_local_children->size; i++) {
777 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
778 continue;
779 }
780
781 if (child->name.jobid == jobdat->jobid) {
782 if (ORTE_PROC_STATE_FAILED_TO_START == child->state) {
783
784
785
786
787 ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
788
789 ORTE_FLAG_SET(child, ORTE_PROC_FLAG_WAITPID);
790 }
791 }
792 }
793 OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
794 "%s errmgr:hnp: job %s reported incomplete start",
795 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
796 ORTE_JOBID_PRINT(jobdat->jobid)));
797 return;
798 }
799
800 static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
801 {
802 opal_pointer_array_t cmd;
803 orte_proc_t proc;
804 int rc;
805
806 if (ORTE_JOBID_WILDCARD == job
807 && ORTE_VPID_WILDCARD == vpid) {
808 if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
809 ORTE_ERROR_LOG(rc);
810 }
811 return;
812 }
813
814 OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
815 OBJ_CONSTRUCT(&proc, orte_proc_t);
816 proc.name.jobid = job;
817 proc.name.vpid = vpid;
818 opal_pointer_array_add(&cmd, &proc);
819 if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
820 ORTE_ERROR_LOG(rc);
821 }
822 OBJ_DESTRUCT(&cmd);
823 OBJ_DESTRUCT(&proc);
824 }