This source file includes the following definitions.
- init
- finalize
- wakeup
- hnp_abort
- job_errors
- proc_errors
- default_hnp_abort
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
#include "orte_config.h"

#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
  33 
  34 #include "opal/util/output.h"
  35 #include "opal/dss/dss.h"
  36 
  37 #include "orte/mca/iof/base/base.h"
  38 #include "orte/mca/rml/rml.h"
  39 #include "orte/mca/odls/odls.h"
  40 #include "orte/mca/odls/base/base.h"
  41 #include "orte/mca/odls/base/odls_private.h"
  42 #include "orte/mca/plm/base/plm_private.h"
  43 #include "orte/mca/plm/plm.h"
  44 #include "orte/mca/rmaps/rmaps_types.h"
  45 #include "orte/mca/routed/routed.h"
  46 #include "orte/mca/grpcomm/grpcomm.h"
  47 #include "orte/mca/ess/ess.h"
  48 #include "orte/mca/state/state.h"
  49 
  50 #include "orte/util/error_strings.h"
  51 #include "orte/util/name_fns.h"
  52 #include "orte/util/proc_info.h"
  53 #include "orte/util/show_help.h"
  54 #include "orte/util/threads.h"
  55 
  56 #include "orte/runtime/orte_globals.h"
  57 #include "orte/runtime/orte_locks.h"
  58 #include "orte/runtime/orte_quit.h"
  59 #include "orte/runtime/data_type_support/orte_dt_support.h"
  60 
  61 #include "orte/mca/errmgr/errmgr.h"
  62 #include "orte/mca/errmgr/base/base.h"
  63 #include "orte/mca/errmgr/base/errmgr_private.h"
  64 
  65 #include "errmgr_default_hnp.h"
  66 
  67 static int init(void);
  68 static int finalize(void);
  69 static void hnp_abort(int error_code, char *fmt, ...);
  70 
  71 
  72 
  73 
  74 orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
  75     .init = init,
  76     .finalize = finalize,
  77     .logfn = orte_errmgr_base_log,
  78     .abort = hnp_abort,
  79     .abort_peers = orte_errmgr_base_abort_peers
  80 };
  81 
  82 
  83 
  84 
  85 
  86 static void default_hnp_abort(orte_job_t *jdata);
  87 static void job_errors(int fd, short args, void *cbdata);
  88 static void proc_errors(int fd, short args, void *cbdata);
  89 
  90 
  91 
  92 
  93 static int init(void)
  94 {
  95     
  96     orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI);
  97 
  98     
  99 
 100 
 101     orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI);
 102 
 103     
 104     orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
 105 
 106     return ORTE_SUCCESS;
 107 }
 108 
 109 static int finalize(void)
 110 {
 111     return ORTE_SUCCESS;
 112 }
 113 
 114 static void wakeup(int sd, short args, void *cbdata)
 115 {
 116     
 117     ORTE_ACQUIRE_OBJECT(cbdata);
 118     orte_quit(0, 0, NULL);
 119 }
 120 
 121 
 122 
 123 
 124 
 125 
 126 
 127 
 128 
 129 
 130 
 131 
 132 
 133 
 134 static void hnp_abort(int error_code, char *fmt, ...)
 135 {
 136     va_list arglist;
 137     char *outmsg = NULL;
 138     orte_timer_t *timer;
 139 
 140     
 141     if (orte_abnormal_term_ordered) {
 142         return;
 143     }
 144 
 145     
 146     ORTE_UPDATE_EXIT_STATUS(error_code);
 147 
 148     
 149     orte_abnormal_term_ordered = true;
 150 
 151     
 152     va_start(arglist, fmt);
 153     if (NULL != fmt) {
 154         opal_vasprintf(&outmsg, fmt, arglist);
 155     }
 156     va_end(arglist);
 157 
 158     
 159     orte_show_help("help-errmgr-base.txt", "simple-message", true, outmsg);
 160 
 161     
 162 
 163     if (orte_never_launched) {
 164         orte_quit(0, 0, NULL);
 165         return;
 166     }
 167 
 168     
 169     if (ORTE_SUCCESS != orte_plm.terminate_orteds()) {
 170         orte_quit(0, 0, NULL);
 171         return;
 172     }
 173 
 174     
 175 
 176     if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
 177         ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
 178         return;
 179     }
 180     timer->tv.tv_sec = 5;
 181     timer->tv.tv_usec = 0;
 182     opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
 183     opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
 184     ORTE_POST_OBJECT(timer);
 185     opal_event_evtimer_add(timer->ev, &timer->tv);
 186 }
 187 
 188 
 189 static void job_errors(int fd, short args, void *cbdata)
 190 {
 191     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 192     orte_job_t *jdata;
 193     orte_job_state_t jobstate;
 194     orte_exit_code_t sts;
 195     orte_proc_t *aborted_proc;
 196     opal_buffer_t *answer;
 197     int32_t rc, ret;
 198     int room, *rmptr;
 199 
 200     ORTE_ACQUIRE_OBJECT(caddy);
 201 
 202     
 203 
 204 
 205     if (orte_finalizing) {
 206         return;
 207     }
 208 
 209     
 210     ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 211 
 212     
 213 
 214 
 215     if (NULL == caddy->jdata) {
 216         ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
 217         OBJ_RELEASE(caddy);
 218         return;
 219     }
 220 
 221     
 222     jdata = caddy->jdata;
 223     jobstate = caddy->job_state;
 224     jdata->state = jobstate;
 225 
 226     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 227                          "%s errmgr:default_hnp: job %s reported state %s",
 228                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 229                          ORTE_JOBID_PRINT(jdata->jobid),
 230                          orte_job_state_to_str(jobstate)));
 231 
 232     if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
 233         ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
 234         ORTE_JOB_STATE_MAP_FAILED == jobstate ||
 235         ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
 236         if (1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
 237             
 238             orte_never_launched = true;
 239         }
 240         
 241 
 242 
 243 
 244 
 245         orte_routing_is_enabled = false;
 246         jdata->num_terminated = jdata->num_procs;
 247         
 248         ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
 249         
 250         if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
 251             rc = jobstate;
 252             answer = OBJ_NEW(opal_buffer_t);
 253             if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
 254                 ORTE_ERROR_LOG(ret);
 255                 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 256                 OBJ_RELEASE(caddy);
 257                 return;
 258             }
 259             if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
 260                 ORTE_ERROR_LOG(ret);
 261                 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 262                 OBJ_RELEASE(caddy);
 263                 return;
 264             }
 265             
 266             rmptr = &room;
 267             if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
 268                 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
 269                     ORTE_ERROR_LOG(ret);
 270                     ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 271                     OBJ_RELEASE(caddy);
 272                     return;
 273                 }
 274             }
 275             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 276                                  "%s errmgr:hnp sending dyn error release of job %s to %s",
 277                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 278                                  ORTE_JOBID_PRINT(jdata->jobid),
 279                                  ORTE_NAME_PRINT(&jdata->originator)));
 280             if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
 281                                                    ORTE_RML_TAG_LAUNCH_RESP,
 282                                                    orte_rml_send_callback, NULL))) {
 283                 ORTE_ERROR_LOG(ret);
 284                 OBJ_RELEASE(answer);
 285                 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 286             }
 287         }
 288         OBJ_RELEASE(caddy);
 289         return;
 290     }
 291 
 292     if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
 293         ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
 294         
 295 
 296 
 297 
 298         aborted_proc = NULL;
 299         if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
 300             sts = aborted_proc->exit_code;
 301             if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
 302                 if (WIFSIGNALED(sts)) { 
 303 #ifdef WCOREDUMP
 304                     if (WCOREDUMP(sts)) {
 305                         orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
 306                                        WTERMSIG(sts));
 307                         sts = WTERMSIG(sts);
 308                     } else {
 309                         orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
 310                                        WTERMSIG(sts));
 311                         sts = WTERMSIG(sts);
 312                     }
 313 #else
 314                     orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
 315                                    WTERMSIG(sts));
 316                     sts = WTERMSIG(sts);
 317 #endif 
 318                 } else {
 319                     orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
 320                                    WEXITSTATUS(sts));
 321                     sts = WEXITSTATUS(sts);
 322                 }
 323             }
 324         }
 325         
 326 
 327 
 328         if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
 329             orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
 330         }
 331     }
 332 
 333     
 334 
 335 
 336 
 337 
 338     if (ORTE_JOB_STATE_ABORTED == jobstate &&
 339         jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
 340         jdata->num_procs != jdata->num_reported) {
 341         orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
 342     }
 343 
 344     
 345     ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
 346     
 347     orte_abnormal_term_ordered = true;
 348     OBJ_RELEASE(caddy);
 349 }
 350 
 351 static void proc_errors(int fd, short args, void *cbdata)
 352 {
 353     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 354     orte_job_t *jdata;
 355     orte_proc_t *pptr, *proct;
 356     orte_process_name_t *proc = &caddy->name;
 357     orte_proc_state_t state = caddy->proc_state;
 358     int i;
 359     int32_t i32, *i32ptr;
 360 
 361     ORTE_ACQUIRE_OBJECT(caddy);
 362 
 363     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 364                          "%s errmgr:default_hnp: for proc %s state %s",
 365                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 366                          ORTE_NAME_PRINT(proc),
 367                          orte_proc_state_to_str(state)));
 368 
 369     
 370 
 371 
 372     if (orte_finalizing) {
 373         goto cleanup;
 374     }
 375 
 376     
 377     if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
 378         
 379         goto cleanup;
 380     }
 381     pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
 382 
 383     
 384 
 385 
 386 
 387     if (ORTE_PROC_STATE_COMM_FAILED == state) {
 388         
 389         if (ORTE_PROC_MY_NAME->jobid != proc->jobid) {
 390             
 391             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 392                                  "%s Comm failure to non-daemon proc - ignoring it",
 393                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 394             goto cleanup;
 395         }
 396         
 397         if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
 398             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 399                                  "%s Comm failure on my own connection - ignoring it",
 400                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 401             goto cleanup;
 402         }
 403         
 404         ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
 405         
 406 
 407         if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
 408             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 409                                  "%s Comm failure: daemons terminating - recording daemon %s as gone",
 410                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
 411             
 412             orte_routed.route_lost(proc);
 413             
 414             if (0 == orte_routed.num_routes()) {
 415                 for (i=0; i < orte_local_children->size; i++) {
 416                     if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
 417                         ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
 418                         
 419                         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 420                                              "%s Comm failure: at least one proc (%s) still alive",
 421                                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 422                                              ORTE_NAME_PRINT(&proct->name)));
 423                         goto cleanup;
 424                     }
 425                 }
 426                 
 427                 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 428                                      "%s errmgr_hnp: all routes and children gone - ordering exit",
 429                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 430                 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 431             } else {
 432                 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 433                                      "%s Comm failure: %d routes remain alive",
 434                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 435                                      (int)orte_routed.num_routes()));
 436             }
 437             goto cleanup;
 438         }
 439         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 440                              "%s Comm failure: daemon %s - aborting",
 441                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
 442         
 443         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 444             
 445             jdata->state = ORTE_JOB_STATE_COMM_FAILED;
 446             
 447             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 448             
 449             OBJ_RETAIN(pptr);
 450             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 451             if (!orte_enable_recovery) {
 452                 
 453                 orte_show_help("help-errmgr-base.txt", "node-died", true,
 454                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 455                                orte_process_info.nodename,
 456                                ORTE_NAME_PRINT(proc),
 457                                pptr->node->name);
 458                 
 459                 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 460                 
 461 
 462                 ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
 463             }
 464         }
 465         
 466 
 467         if (!orte_enable_recovery) {
 468             default_hnp_abort(jdata);
 469         }
 470         goto cleanup;
 471     }
 472 
 473     
 474 
 475 
 476     if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
 477         pptr->state = state;
 478     }
 479 
 480     
 481 
 482 
 483     if (orte_orteds_term_ordered) {
 484         for (i=0; i < orte_local_children->size; i++) {
 485             if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
 486                 if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
 487                     goto keep_going;
 488                 }
 489             }
 490         }
 491         
 492 
 493         if (0 == orte_routed.num_routes()) {
 494             OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 495                                  "%s errmgr:default:hnp all routes gone - exiting",
 496                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 497             ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 498         }
 499     }
 500 
 501   keep_going:
 502     
 503 
 504     if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
 505         ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE)) {
 506         
 507         ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
 508         
 509 
 510 
 511 
 512         if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
 513             ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_IOF_COMPLETE);
 514         }
 515         goto cleanup;
 516     }
 517 
 518     
 519 
 520 
 521     switch (state) {
 522     case ORTE_PROC_STATE_KILLED_BY_CMD:
 523         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 524                              "%s errmgr:hnp: proc %s killed by cmd",
 525                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 526                              ORTE_NAME_PRINT(proc)));
 527         
 528 
 529 
 530         if (jdata->num_terminated >= jdata->num_procs) {
 531             
 532             ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
 533         }
 534         
 535         break;
 536 
 537     case ORTE_PROC_STATE_ABORTED:
 538         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 539                              "%s errmgr:hnp: proc %s aborted",
 540                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 541                              ORTE_NAME_PRINT(proc)));
 542         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 543             jdata->state = ORTE_JOB_STATE_ABORTED;
 544             
 545             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 546             
 547             OBJ_RETAIN(pptr);
 548             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 549             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 550             
 551 
 552             default_hnp_abort(jdata);
 553         }
 554         break;
 555 
 556     case ORTE_PROC_STATE_ABORTED_BY_SIG:
 557         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 558                              "%s errmgr:hnp: proc %s aborted by signal",
 559                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 560                              ORTE_NAME_PRINT(proc)));
 561 
 562         ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 563         
 564         i32 = 0;
 565         i32ptr = &i32;
 566         orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
 567         ++i32;
 568         orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
 569         if (orte_abort_non_zero_exit) {
 570 
 571             if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 572                 jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
 573                 
 574                 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 575                 
 576                 OBJ_RETAIN(pptr);
 577                 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 578                 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 579                 
 580 
 581                 default_hnp_abort(jdata);
 582             }
 583         } else {
 584             
 585             if (jdata->num_terminated >= jdata->num_procs) {
 586                 
 587                 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
 588             }
 589         }
 590         break;
 591 
 592     case ORTE_PROC_STATE_TERM_WO_SYNC:
 593         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 594                              "%s errmgr:hnp: proc %s terminated without sync",
 595                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 596                              ORTE_NAME_PRINT(proc)));
 597         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 598             jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
 599             
 600             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 601             
 602             OBJ_RETAIN(pptr);
 603             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 604             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 605             
 606 
 607 
 608 
 609 
 610             ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 611             
 612 
 613             default_hnp_abort(jdata);
 614         }
 615         break;
 616 
 617     case ORTE_PROC_STATE_FAILED_TO_START:
 618     case ORTE_PROC_STATE_FAILED_TO_LAUNCH:
 619         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 620                              "%s errmgr:hnp: proc %s %s",
 621                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 622                              ORTE_NAME_PRINT(proc),
 623                              orte_proc_state_to_str(state)));
 624         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 625             if (ORTE_PROC_STATE_FAILED_TO_START) {
 626                 jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
 627             } else {
 628                 jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH;
 629             }
 630             
 631             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 632             
 633             OBJ_RETAIN(pptr);
 634             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 635             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 636             
 637 
 638             default_hnp_abort(jdata);
 639         }
 640         
 641         if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
 642             
 643             orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
 644         }
 645         break;
 646 
 647     case ORTE_PROC_STATE_CALLED_ABORT:
 648         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 649                              "%s errmgr:hnp: proc %s called abort with exit code %d",
 650                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 651                              ORTE_NAME_PRINT(proc), pptr->exit_code));
 652         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 653             jdata->state = ORTE_JOB_STATE_CALLED_ABORT;
 654             
 655             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 656             
 657             OBJ_RETAIN(pptr);
 658             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 659             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 660             
 661 
 662             default_hnp_abort(jdata);
 663         }
 664         break;
 665 
 666     case ORTE_PROC_STATE_TERM_NON_ZERO:
 667         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 668                              "%s errmgr:hnp: proc %s exited with non-zero status %d",
 669                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 670                              ORTE_NAME_PRINT(proc),
 671                              pptr->exit_code));
 672         ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 673         
 674         i32 = 0;
 675         i32ptr = &i32;
 676         orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
 677         ++i32;
 678         orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
 679         if (orte_abort_non_zero_exit) {
 680             if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 681                 jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
 682                 
 683                 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 684                 
 685                 OBJ_RETAIN(pptr);
 686                 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 687                 
 688 
 689                 default_hnp_abort(jdata);
 690             }
 691         } else {
 692             
 693             if (jdata->num_terminated >= jdata->num_procs) {
 694                 
 695                 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
 696             }
 697         }
 698         break;
 699 
 700     case ORTE_PROC_STATE_HEARTBEAT_FAILED:
 701         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 702                              "%s errmgr:hnp: proc %s heartbeat failed",
 703                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 704                              ORTE_NAME_PRINT(proc)));
 705         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 706             jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED;
 707             
 708             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 709             
 710             OBJ_RETAIN(pptr);
 711             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 712             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 713             
 714 
 715             default_hnp_abort(jdata);
 716         }
 717         
 718         orte_routed.route_lost(proc);
 719         break;
 720 
 721     case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
 722         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 723                              "%s errmgr:hnp: unable to send message to proc %s",
 724                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 725                              ORTE_NAME_PRINT(proc)));
 726         
 727 
 728 
 729         if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
 730             ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 731             break;
 732         }
 733         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 734             
 735 
 736             default_hnp_abort(jdata);
 737         }
 738         break;
 739 
 740     case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
 741         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 742                              "%s errmgr:hnp: no message path to proc %s",
 743                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 744                              ORTE_NAME_PRINT(proc)));
 745         orte_show_help("help-errmgr-base.txt", "no-path", true,
 746                        orte_process_info.nodename, pptr->node->name);
 747         
 748 
 749 
 750         if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
 751             ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 752             break;
 753         }
 754         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 755             
 756 
 757             default_hnp_abort(jdata);
 758         }
 759         break;
 760 
 761     case ORTE_PROC_STATE_FAILED_TO_CONNECT:
 762         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 763                              "%s errmgr:hnp: cannot connect to proc %s",
 764                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 765                              ORTE_NAME_PRINT(proc)));
 766         orte_show_help("help-errmgr-base.txt", "no-connect", true,
 767                        orte_process_info.nodename, pptr->node->name);
 768         
 769 
 770 
 771         if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
 772             ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 773             break;
 774         }
 775         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 776             
 777 
 778             default_hnp_abort(jdata);
 779         }
 780         break;
 781 
 782     default:
 783         
 784         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 785                              "%s errmgr:hnp: proc %s default error %s",
 786                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 787                              ORTE_NAME_PRINT(proc),
 788                              orte_proc_state_to_str(state)));
 789         if (jdata->num_terminated == jdata->num_procs) {
 790             ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
 791         }
 792         break;
 793     }
 794     
 795     if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
 796         ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
 797     }
 798 
 799  cleanup:
 800     OBJ_RELEASE(caddy);
 801 }
 802 
 803 
 804 
 805 
 806 static void default_hnp_abort(orte_job_t *jdata)
 807 {
 808     int rc;
 809     int32_t i32, *i32ptr;
 810 
 811     
 812     if (opal_atomic_trylock(&orte_abort_inprogress_lock)) { 
 813         OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 814                              "%s errmgr:default_hnp: abort in progress, ignoring abort on job %s",
 815                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 816                              ORTE_JOBID_PRINT(jdata->jobid)));
 817         return;
 818     }
 819 
 820     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 821                          "%s errmgr:default_hnp: abort called on job %s",
 822                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 823                          ORTE_JOBID_PRINT(jdata->jobid)));
 824 
 825     
 826     orte_job_term_ordered = true;
 827     orte_enable_recovery = false;
 828 
 829     
 830 
 831 
 832 
 833     if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
 834         orte_abnormal_term_ordered = true;
 835     }
 836 
 837     i32 = 0;
 838     i32ptr = &i32;
 839     if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32)) {
 840         
 841         orte_show_help("help-errmgr-base.txt", "normal-termination-but", true,
 842                        (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "Primary" : "Child",
 843                        (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
 844                        i32, (1 == i32) ? "process returned\na non-zero exit code" :
 845                        "processes returned\nnon-zero exit codes");
 846     }
 847 
 848     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 849                          "%s errmgr:default_hnp: ordering orted termination",
 850                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 851 
 852     
 853 
 854 
 855     if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) {
 856         ORTE_ERROR_LOG(rc);
 857     }
 858 }