root/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. init
  2. finalize
  3. wakeup
  4. hnp_abort
  5. job_errors
  6. proc_errors
  7. default_hnp_abort

   1 /*
   2  * Copyright (c) 2009-2011 The Trustees of Indiana University.
   3  *                         All rights reserved.
   4  * Copyright (c) 2010      Cisco Systems, Inc.  All rights reserved.
   5  * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
   6  * Copyright (c) 2004-2011 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2011      Oracle and/or all its affiliates.  All rights reserved.
  10  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
  11  *                         All rights reserved.
  12  * Copyright (c) 2014-2019 Intel, Inc.  All rights reserved.
  13  * Copyright (c) 2017      IBM Corporation.  All rights reserved.
  14  * Copyright (c) 2018      Research Organization for Information Science
  15  *                         and Technology (RIST).  All rights reserved.
  16  * $COPYRIGHT$
  17  *
  18  * Additional copyrights may follow
  19  *
  20  * $HEADER$
  21  */
  22 
  23 #include "orte_config.h"
  24 
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
  33 
  34 #include "opal/util/output.h"
  35 #include "opal/dss/dss.h"
  36 
  37 #include "orte/mca/iof/base/base.h"
  38 #include "orte/mca/rml/rml.h"
  39 #include "orte/mca/odls/odls.h"
  40 #include "orte/mca/odls/base/base.h"
  41 #include "orte/mca/odls/base/odls_private.h"
  42 #include "orte/mca/plm/base/plm_private.h"
  43 #include "orte/mca/plm/plm.h"
  44 #include "orte/mca/rmaps/rmaps_types.h"
  45 #include "orte/mca/routed/routed.h"
  46 #include "orte/mca/grpcomm/grpcomm.h"
  47 #include "orte/mca/ess/ess.h"
  48 #include "orte/mca/state/state.h"
  49 
  50 #include "orte/util/error_strings.h"
  51 #include "orte/util/name_fns.h"
  52 #include "orte/util/proc_info.h"
  53 #include "orte/util/show_help.h"
  54 #include "orte/util/threads.h"
  55 
  56 #include "orte/runtime/orte_globals.h"
  57 #include "orte/runtime/orte_locks.h"
  58 #include "orte/runtime/orte_quit.h"
  59 #include "orte/runtime/data_type_support/orte_dt_support.h"
  60 
  61 #include "orte/mca/errmgr/errmgr.h"
  62 #include "orte/mca/errmgr/base/base.h"
  63 #include "orte/mca/errmgr/base/errmgr_private.h"
  64 
  65 #include "errmgr_default_hnp.h"
  66 
  67 static int init(void);
  68 static int finalize(void);
  69 static void hnp_abort(int error_code, char *fmt, ...);
  70 
  71 /******************
  72  * default_hnp module
  73  ******************/
  74 orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
  75     .init = init,
  76     .finalize = finalize,
  77     .logfn = orte_errmgr_base_log,
  78     .abort = hnp_abort,
  79     .abort_peers = orte_errmgr_base_abort_peers
  80 };
  81 
  82 
  83 /*
  84  * Local functions
  85  */
  86 static void default_hnp_abort(orte_job_t *jdata);
  87 static void job_errors(int fd, short args, void *cbdata);
  88 static void proc_errors(int fd, short args, void *cbdata);
  89 
  90 /**********************
  91  * From DEFAULT_HNP
  92  **********************/
  93 static int init(void)
  94 {
  95     /* setup state machine to trap job errors */
  96     orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI);
  97 
  98     /* set the lost connection state to run at MSG priority so
  99      * we can process any last messages from the proc
 100      */
 101     orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI);
 102 
 103     /* setup state machine to trap proc errors */
 104     orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
 105 
 106     return ORTE_SUCCESS;
 107 }
 108 
 109 static int finalize(void)
 110 {
 111     return ORTE_SUCCESS;
 112 }
 113 
 114 static void wakeup(int sd, short args, void *cbdata)
 115 {
 116     /* nothing more we can do */
 117     ORTE_ACQUIRE_OBJECT(cbdata);
 118     orte_quit(0, 0, NULL);
 119 }
 120 
 121 /* this function only gets called when FORCED_TERMINATE
 122  * has been invoked, which means that there is some
 123  * internal failure (e.g., to pack/unpack a correct value).
 124  * We could just exit, but that doesn't result in any
 125  * meaningful error message to the user. Likewise, just
 126  * printing something to stdout/stderr won't necessarily
 127  * get back to the user. Instead, we will send an error
 128  * report to mpirun and give it a chance to order our
 129  * termination. In order to ensure we _do_ terminate,
 130  * we set a timer - if it fires before we receive the
 131  * termination command, then we will exit on our own. This
 132  * protects us in the case that the failure is in the
 133  * messaging system itself */
 134 static void hnp_abort(int error_code, char *fmt, ...)
 135 {
 136     va_list arglist;
 137     char *outmsg = NULL;
 138     orte_timer_t *timer;
 139 
 140     /* only do this once */
 141     if (orte_abnormal_term_ordered) {
 142         return;
 143     }
 144 
 145     /* ensure we exit with non-zero status */
 146     ORTE_UPDATE_EXIT_STATUS(error_code);
 147 
 148     /* set the aborting flag */
 149     orte_abnormal_term_ordered = true;
 150 
 151     /* If there was a message, construct it */
 152     va_start(arglist, fmt);
 153     if (NULL != fmt) {
 154         opal_vasprintf(&outmsg, fmt, arglist);
 155     }
 156     va_end(arglist);
 157 
 158     /* use the show-help system to get the message out */
 159     orte_show_help("help-errmgr-base.txt", "simple-message", true, outmsg);
 160 
 161     /* this could have happened very early, so see if it happened
 162      * before we started anything - if so, we can just finalize */
 163     if (orte_never_launched) {
 164         orte_quit(0, 0, NULL);
 165         return;
 166     }
 167 
 168     /* tell the daemons to terminate */
 169     if (ORTE_SUCCESS != orte_plm.terminate_orteds()) {
 170         orte_quit(0, 0, NULL);
 171         return;
 172     }
 173 
 174     /* set a timer for exiting - this also gives the message a chance
 175      * to get out! */
 176     if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
 177         ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
 178         return;
 179     }
 180     timer->tv.tv_sec = 5;
 181     timer->tv.tv_usec = 0;
 182     opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
 183     opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
 184     ORTE_POST_OBJECT(timer);
 185     opal_event_evtimer_add(timer->ev, &timer->tv);
 186 }
 187 
 188 
 189 static void job_errors(int fd, short args, void *cbdata)
 190 {
 191     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 192     orte_job_t *jdata;
 193     orte_job_state_t jobstate;
 194     orte_exit_code_t sts;
 195     orte_proc_t *aborted_proc;
 196     opal_buffer_t *answer;
 197     int32_t rc, ret;
 198     int room, *rmptr;
 199 
 200     ORTE_ACQUIRE_OBJECT(caddy);
 201 
 202     /*
 203      * if orte is trying to shutdown, just let it
 204      */
 205     if (orte_finalizing) {
 206         return;
 207     }
 208 
 209     /* ensure we have an error exit status */
 210     ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 211 
 212     /* if the jdata is NULL, then we abort as this
 213      * is reporting an unrecoverable error
 214      */
 215     if (NULL == caddy->jdata) {
 216         ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
 217         OBJ_RELEASE(caddy);
 218         return;
 219     }
 220 
 221     /* update the state */
 222     jdata = caddy->jdata;
 223     jobstate = caddy->job_state;
 224     jdata->state = jobstate;
 225 
 226     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 227                          "%s errmgr:default_hnp: job %s reported state %s",
 228                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 229                          ORTE_JOBID_PRINT(jdata->jobid),
 230                          orte_job_state_to_str(jobstate)));
 231 
 232     if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
 233         ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
 234         ORTE_JOB_STATE_MAP_FAILED == jobstate ||
 235         ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
 236         if (1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
 237             /* this is the primary job */
 238             orte_never_launched = true;
 239         }
 240         /* disable routing as we may not have performed the daemon
 241          * wireup - e.g., in a managed environment, all the daemons
 242          * "phone home", but don't actually wireup into the routed
 243          * network until they receive the launch message
 244          */
 245         orte_routing_is_enabled = false;
 246         jdata->num_terminated = jdata->num_procs;
 247         /* activate the terminated state so we can exit */
 248         ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
 249         /* if it was a dynamic spawn, then we better tell them this didn't work */
 250         if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
 251             rc = jobstate;
 252             answer = OBJ_NEW(opal_buffer_t);
 253             if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
 254                 ORTE_ERROR_LOG(ret);
 255                 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 256                 OBJ_RELEASE(caddy);
 257                 return;
 258             }
 259             if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
 260                 ORTE_ERROR_LOG(ret);
 261                 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 262                 OBJ_RELEASE(caddy);
 263                 return;
 264             }
 265             /* pack the room number */
 266             rmptr = &room;
 267             if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
 268                 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
 269                     ORTE_ERROR_LOG(ret);
 270                     ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 271                     OBJ_RELEASE(caddy);
 272                     return;
 273                 }
 274             }
 275             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 276                                  "%s errmgr:hnp sending dyn error release of job %s to %s",
 277                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 278                                  ORTE_JOBID_PRINT(jdata->jobid),
 279                                  ORTE_NAME_PRINT(&jdata->originator)));
 280             if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
 281                                                    ORTE_RML_TAG_LAUNCH_RESP,
 282                                                    orte_rml_send_callback, NULL))) {
 283                 ORTE_ERROR_LOG(ret);
 284                 OBJ_RELEASE(answer);
 285                 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 286             }
 287         }
 288         OBJ_RELEASE(caddy);
 289         return;
 290     }
 291 
 292     if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
 293         ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
 294         /* the job object for this job will have been NULL'd
 295          * in the array if the job was solely local. If it isn't
 296          * NULL, then we need to tell everyone else to die
 297          */
 298         aborted_proc = NULL;
 299         if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
 300             sts = aborted_proc->exit_code;
 301             if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
 302                 if (WIFSIGNALED(sts)) { /* died on signal */
 303 #ifdef WCOREDUMP
 304                     if (WCOREDUMP(sts)) {
 305                         orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
 306                                        WTERMSIG(sts));
 307                         sts = WTERMSIG(sts);
 308                     } else {
 309                         orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
 310                                        WTERMSIG(sts));
 311                         sts = WTERMSIG(sts);
 312                     }
 313 #else
 314                     orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
 315                                    WTERMSIG(sts));
 316                     sts = WTERMSIG(sts);
 317 #endif /* WCOREDUMP */
 318                 } else {
 319                     orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
 320                                    WEXITSTATUS(sts));
 321                     sts = WEXITSTATUS(sts);
 322                 }
 323             }
 324         }
 325         /* if this is the daemon job, then we need to ensure we
 326          * output an error message indicating we couldn't launch the
 327          * daemons */
 328         if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
 329             orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
 330         }
 331     }
 332 
 333     /* if the daemon job aborted and we haven't heard from everyone yet,
 334      * then this could well have been caused by a daemon not finding
 335      * a way back to us. In this case, output a message indicating a daemon
 336      * died without reporting. Otherwise, say nothing as we
 337      * likely already output an error message */
 338     if (ORTE_JOB_STATE_ABORTED == jobstate &&
 339         jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
 340         jdata->num_procs != jdata->num_reported) {
 341         orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
 342     }
 343 
 344     /* abort the job */
 345     ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
 346     /* set the global abnormal exit flag  */
 347     orte_abnormal_term_ordered = true;
 348     OBJ_RELEASE(caddy);
 349 }
 350 
 351 static void proc_errors(int fd, short args, void *cbdata)
 352 {
 353     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 354     orte_job_t *jdata;
 355     orte_proc_t *pptr, *proct;
 356     orte_process_name_t *proc = &caddy->name;
 357     orte_proc_state_t state = caddy->proc_state;
 358     int i;
 359     int32_t i32, *i32ptr;
 360 
 361     ORTE_ACQUIRE_OBJECT(caddy);
 362 
 363     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 364                          "%s errmgr:default_hnp: for proc %s state %s",
 365                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 366                          ORTE_NAME_PRINT(proc),
 367                          orte_proc_state_to_str(state)));
 368 
 369     /*
 370      * if orte is trying to shutdown, just let it
 371      */
 372     if (orte_finalizing) {
 373         goto cleanup;
 374     }
 375 
 376     /* get the job object */
 377     if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
 378         /* could be a race condition */
 379         goto cleanup;
 380     }
 381     pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
 382 
 383     /* we MUST handle a communication failure before doing anything else
 384      * as it requires some special care to avoid normal termination issues
 385      * for local application procs
 386      */
 387     if (ORTE_PROC_STATE_COMM_FAILED == state) {
 388         /* is this to a daemon? */
 389         if (ORTE_PROC_MY_NAME->jobid != proc->jobid) {
 390             /* nope - ignore it */
 391             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 392                                  "%s Comm failure to non-daemon proc - ignoring it",
 393                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 394             goto cleanup;
 395         }
 396         /* if this is my own connection, ignore it */
 397         if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
 398             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 399                                  "%s Comm failure on my own connection - ignoring it",
 400                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 401             goto cleanup;
 402         }
 403         /* mark the daemon as gone */
 404         ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
 405         /* if we have ordered orteds to terminate or abort
 406          * is in progress, record it */
 407         if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
 408             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 409                                  "%s Comm failure: daemons terminating - recording daemon %s as gone",
 410                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
 411             /* remove from dependent routes, if it is one */
 412             orte_routed.route_lost(proc);
 413             /* if all my routes and local children are gone, then terminate ourselves */
 414             if (0 == orte_routed.num_routes()) {
 415                 for (i=0; i < orte_local_children->size; i++) {
 416                     if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
 417                         ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
 418                         /* at least one is still alive */
 419                         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 420                                              "%s Comm failure: at least one proc (%s) still alive",
 421                                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 422                                              ORTE_NAME_PRINT(&proct->name)));
 423                         goto cleanup;
 424                     }
 425                 }
 426                 /* call our appropriate exit procedure */
 427                 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 428                                      "%s errmgr_hnp: all routes and children gone - ordering exit",
 429                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 430                 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 431             } else {
 432                 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 433                                      "%s Comm failure: %d routes remain alive",
 434                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 435                                      (int)orte_routed.num_routes()));
 436             }
 437             goto cleanup;
 438         }
 439         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 440                              "%s Comm failure: daemon %s - aborting",
 441                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
 442         /* record the first one to fail */
 443         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 444             /* mark the daemon job as failed */
 445             jdata->state = ORTE_JOB_STATE_COMM_FAILED;
 446             /* point to the lowest rank to cause the problem */
 447             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 448             /* retain the object so it doesn't get free'd */
 449             OBJ_RETAIN(pptr);
 450             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 451             if (!orte_enable_recovery) {
 452                 /* output an error message so the user knows what happened */
 453                 orte_show_help("help-errmgr-base.txt", "node-died", true,
 454                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 455                                orte_process_info.nodename,
 456                                ORTE_NAME_PRINT(proc),
 457                                pptr->node->name);
 458                 /* update our exit code */
 459                 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 460                 /* just in case the exit code hadn't been set, do it here - this
 461                  * won't override any reported exit code */
 462                 ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
 463             }
 464         }
 465         /* if recovery is enabled, then we are done - otherwise,
 466          * abort the system */
 467         if (!orte_enable_recovery) {
 468             default_hnp_abort(jdata);
 469         }
 470         goto cleanup;
 471     }
 472 
 473     /* update the proc state - can get multiple reports on a proc
 474      * depending on circumstances, so ensure we only do this once
 475      */
 476     if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
 477         pptr->state = state;
 478     }
 479 
 480     /* if we were ordered to terminate, mark this proc as dead and see if
 481      * any of our routes or local children remain alive - if not, then
 482      * terminate ourselves. */
 483     if (orte_orteds_term_ordered) {
 484         for (i=0; i < orte_local_children->size; i++) {
 485             if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
 486                 if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
 487                     goto keep_going;
 488                 }
 489             }
 490         }
 491         /* if all my routes and children are gone, then terminate
 492            ourselves nicely (i.e., this is a normal termination) */
 493         if (0 == orte_routed.num_routes()) {
 494             OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 495                                  "%s errmgr:default:hnp all routes gone - exiting",
 496                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 497             ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 498         }
 499     }
 500 
 501   keep_going:
 502     /* if this is a continuously operating job, then there is nothing more
 503      * to do - we let the job continue to run */
 504     if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
 505         ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE)) {
 506         /* always mark the waitpid as having fired */
 507         ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
 508         /* if this is a remote proc, we won't hear anything more about it
 509          * as the default behavior would be to terminate the job. So be sure to
 510          * mark the IOF as having completed too so we correctly mark this proc
 511          * as dead and notify everyone as required */
 512         if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
 513             ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_IOF_COMPLETE);
 514         }
 515         goto cleanup;
 516     }
 517 
 518     /* ensure we record the failed proc properly so we can report
 519      * the error once we terminate
 520      */
 521     switch (state) {
 522     case ORTE_PROC_STATE_KILLED_BY_CMD:
 523         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 524                              "%s errmgr:hnp: proc %s killed by cmd",
 525                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 526                              ORTE_NAME_PRINT(proc)));
 527         /* we ordered this proc to die, so it isn't an abnormal termination
 528          * and we don't flag it as such
 529          */
 530         if (jdata->num_terminated >= jdata->num_procs) {
 531             /* this job has terminated */
 532             ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
 533         }
 534         /* don't abort the job as this isn't an abnormal termination */
 535         break;
 536 
 537     case ORTE_PROC_STATE_ABORTED:
 538         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 539                              "%s errmgr:hnp: proc %s aborted",
 540                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 541                              ORTE_NAME_PRINT(proc)));
 542         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 543             jdata->state = ORTE_JOB_STATE_ABORTED;
 544             /* point to the first rank to cause the problem */
 545             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 546             /* retain the object so it doesn't get free'd */
 547             OBJ_RETAIN(pptr);
 548             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 549             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 550             /* abnormal termination - abort, but only do it once
 551              * to avoid creating a lot of confusion */
 552             default_hnp_abort(jdata);
 553         }
 554         break;
 555 
 556     case ORTE_PROC_STATE_ABORTED_BY_SIG:
 557         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 558                              "%s errmgr:hnp: proc %s aborted by signal",
 559                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 560                              ORTE_NAME_PRINT(proc)));
 561 
 562         ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 563         /* track the number of non-zero exits */
 564         i32 = 0;
 565         i32ptr = &i32;
 566         orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
 567         ++i32;
 568         orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
 569         if (orte_abort_non_zero_exit) {
 570 
 571             if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 572                 jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
 573                 /* point to the first rank to cause the problem */
 574                 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 575                 /* retain the object so it doesn't get free'd */
 576                 OBJ_RETAIN(pptr);
 577                 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 578                 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 579                 /* abnormal termination - abort, but only do it once
 580                  * to avoid creating a lot of confusion */
 581                 default_hnp_abort(jdata);
 582             }
 583         } else {
 584             /* user requested we consider this normal termination */
 585             if (jdata->num_terminated >= jdata->num_procs) {
 586                 /* this job has terminated */
 587                 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
 588             }
 589         }
 590         break;
 591 
 592     case ORTE_PROC_STATE_TERM_WO_SYNC:
 593         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 594                              "%s errmgr:hnp: proc %s terminated without sync",
 595                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 596                              ORTE_NAME_PRINT(proc)));
 597         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 598             jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
 599             /* point to the first rank to cause the problem */
 600             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 601             /* retain the object so it doesn't get free'd */
 602             OBJ_RETAIN(pptr);
 603             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 604             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 605             /* now treat a special case - if the proc exit'd without a required
 606              * sync, it may have done so with a zero exit code. We want to ensure
 607              * that the user realizes there was an error, so in this -one- case,
 608              * we overwrite the process' exit code with the default error code
 609              */
 610             ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 611             /* abnormal termination - abort, but only do it once
 612              * to avoid creating a lot of confusion */
 613             default_hnp_abort(jdata);
 614         }
 615         break;
 616 
 617     case ORTE_PROC_STATE_FAILED_TO_START:
 618     case ORTE_PROC_STATE_FAILED_TO_LAUNCH:
 619         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 620                              "%s errmgr:hnp: proc %s %s",
 621                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 622                              ORTE_NAME_PRINT(proc),
 623                              orte_proc_state_to_str(state)));
 624         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 625             if (ORTE_PROC_STATE_FAILED_TO_START) {
 626                 jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
 627             } else {
 628                 jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH;
 629             }
 630             /* point to the first rank to cause the problem */
 631             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 632             /* retain the object so it doesn't get free'd */
 633             OBJ_RETAIN(pptr);
 634             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 635             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 636             /* abnormal termination - abort, but only do it once
 637              * to avoid creating a lot of confusion */
 638             default_hnp_abort(jdata);
 639         }
 640         /* if this was a daemon, report it */
 641         if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
 642             /* output a message indicating we failed to launch a daemon */
 643             orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
 644         }
 645         break;
 646 
 647     case ORTE_PROC_STATE_CALLED_ABORT:
 648         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 649                              "%s errmgr:hnp: proc %s called abort with exit code %d",
 650                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 651                              ORTE_NAME_PRINT(proc), pptr->exit_code));
 652         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 653             jdata->state = ORTE_JOB_STATE_CALLED_ABORT;
 654             /* point to the first proc to cause the problem */
 655             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 656             /* retain the object so it doesn't get free'd */
 657             OBJ_RETAIN(pptr);
 658             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 659             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 660             /* abnormal termination - abort, but only do it once
 661              * to avoid creating a lot of confusion */
 662             default_hnp_abort(jdata);
 663         }
 664         break;
 665 
 666     case ORTE_PROC_STATE_TERM_NON_ZERO:
 667         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 668                              "%s errmgr:hnp: proc %s exited with non-zero status %d",
 669                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 670                              ORTE_NAME_PRINT(proc),
 671                              pptr->exit_code));
 672         ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 673         /* track the number of non-zero exits */
 674         i32 = 0;
 675         i32ptr = &i32;
 676         orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
 677         ++i32;
 678         orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
 679         if (orte_abort_non_zero_exit) {
 680             if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 681                 jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
 682                 /* point to the first rank to cause the problem */
 683                 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 684                 /* retain the object so it doesn't get free'd */
 685                 OBJ_RETAIN(pptr);
 686                 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 687                 /* abnormal termination - abort, but only do it once
 688                  * to avoid creating a lot of confusion */
 689                 default_hnp_abort(jdata);
 690             }
 691         } else {
 692             /* user requested we consider this normal termination */
 693             if (jdata->num_terminated >= jdata->num_procs) {
 694                 /* this job has terminated */
 695                 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
 696             }
 697         }
 698         break;
 699 
 700     case ORTE_PROC_STATE_HEARTBEAT_FAILED:
 701         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 702                              "%s errmgr:hnp: proc %s heartbeat failed",
 703                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 704                              ORTE_NAME_PRINT(proc)));
 705         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 706             jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED;
 707             /* point to the first rank to cause the problem */
 708             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
 709             /* retain the object so it doesn't get free'd */
 710             OBJ_RETAIN(pptr);
 711             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
 712             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
 713             /* abnormal termination - abort, but only do it once
 714              * to avoid creating a lot of confusion */
 715             default_hnp_abort(jdata);
 716         }
 717         /* remove from dependent routes, if it is one */
 718         orte_routed.route_lost(proc);
 719         break;
 720 
 721     case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
 722         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 723                              "%s errmgr:hnp: unable to send message to proc %s",
 724                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 725                              ORTE_NAME_PRINT(proc)));
 726         /* if this proc is one of my daemons, then we are truly
 727          * hosed - so just exit out
 728          */
 729         if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
 730             ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 731             break;
 732         }
 733         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 734             /* abnormal termination - abort, but only do it once
 735              * to avoid creating a lot of confusion */
 736             default_hnp_abort(jdata);
 737         }
 738         break;
 739 
 740     case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
 741         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 742                              "%s errmgr:hnp: no message path to proc %s",
 743                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 744                              ORTE_NAME_PRINT(proc)));
 745         orte_show_help("help-errmgr-base.txt", "no-path", true,
 746                        orte_process_info.nodename, pptr->node->name);
 747         /* if this proc is one of my daemons, then we are truly
 748          * hosed - so just exit out
 749          */
 750         if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
 751             ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 752             break;
 753         }
 754         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 755             /* abnormal termination - abort, but only do it once
 756              * to avoid creating a lot of confusion */
 757             default_hnp_abort(jdata);
 758         }
 759         break;
 760 
 761     case ORTE_PROC_STATE_FAILED_TO_CONNECT:
 762         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 763                              "%s errmgr:hnp: cannot connect to proc %s",
 764                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 765                              ORTE_NAME_PRINT(proc)));
 766         orte_show_help("help-errmgr-base.txt", "no-connect", true,
 767                        orte_process_info.nodename, pptr->node->name);
 768         /* if this proc is one of my daemons, then we are truly
 769          * hosed - so just exit out
 770          */
 771         if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
 772             ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 773             break;
 774         }
 775         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 776             /* abnormal termination - abort, but only do it once
 777              * to avoid creating a lot of confusion */
 778             default_hnp_abort(jdata);
 779         }
 780         break;
 781 
 782     default:
 783         /* shouldn't get this, but terminate job if required */
 784         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 785                              "%s errmgr:hnp: proc %s default error %s",
 786                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 787                              ORTE_NAME_PRINT(proc),
 788                              orte_proc_state_to_str(state)));
 789         if (jdata->num_terminated == jdata->num_procs) {
 790             ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
 791         }
 792         break;
 793     }
 794     /* if the waitpid fired, be sure to let the state machine know */
 795     if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
 796         ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
 797     }
 798 
 799  cleanup:
 800     OBJ_RELEASE(caddy);
 801 }
 802 
 803 /*****************
 804  * Local Functions
 805  *****************/
 806 static void default_hnp_abort(orte_job_t *jdata)
 807 {
 808     int rc;
 809     int32_t i32, *i32ptr;
 810 
 811     /* if we are already in progress, then ignore this call */
 812     if (opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
 813         OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 814                              "%s errmgr:default_hnp: abort in progress, ignoring abort on job %s",
 815                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 816                              ORTE_JOBID_PRINT(jdata->jobid)));
 817         return;
 818     }
 819 
 820     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 821                          "%s errmgr:default_hnp: abort called on job %s",
 822                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 823                          ORTE_JOBID_PRINT(jdata->jobid)));
 824 
 825     /* set control params to indicate we are terminating */
 826     orte_job_term_ordered = true;
 827     orte_enable_recovery = false;
 828 
 829     /* if it is the daemon job that aborted, then we need
 830      * to flag an abnormal term - otherwise, just abort
 831      * the job cleanly
 832      */
 833     if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
 834         orte_abnormal_term_ordered = true;
 835     }
 836 
 837     i32 = 0;
 838     i32ptr = &i32;
 839     if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32)) {
 840         /* warn user */
 841         orte_show_help("help-errmgr-base.txt", "normal-termination-but", true,
 842                        (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "Primary" : "Child",
 843                        (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
 844                        i32, (1 == i32) ? "process returned\na non-zero exit code" :
 845                        "processes returned\nnon-zero exit codes");
 846     }
 847 
 848     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 849                          "%s errmgr:default_hnp: ordering orted termination",
 850                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 851 
 852     /* tell the plm to terminate the orteds - they will automatically
 853      * kill their local procs
 854      */
 855     if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) {
 856         ORTE_ERROR_LOG(rc);
 857     }
 858 }

/* [<][>][^][v][top][bottom][index][help] */