root/orte/mca/errmgr/default_orted/errmgr_default_orted.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. init
  2. finalize
  3. wakeup
  4. orted_abort
  5. job_errors
  6. proc_errors
  7. any_live_children
  8. pack_state_for_proc
  9. pack_state_update
  10. failed_start
  11. killprocs

   1 /*
   2  * Copyright (c) 2009-2010 The Trustees of Indiana University.
   3  *                         All rights reserved.
   4  * Copyright (c) 2010-2013 Cisco Systems, Inc.  All rights reserved.
   5  * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
   6  * Copyright (c) 2004-2011 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
  10  *                         All rights reserved.
  11  * Copyright (c) 2014-2019 Intel, Inc.  All rights reserved.
  12  * Copyright (c) 2017      IBM Corporation. All rights reserved.
  13  * $COPYRIGHT$
  14  *
  15  * Additional copyrights may follow
  16  *
  17  * $HEADER$
  18  */
  19 
  20 #include "orte_config.h"
  21 
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#include <stdlib.h>
#include <string.h>
  27 
  28 #include "opal/util/output.h"
  29 #include "opal/dss/dss.h"
  30 
  31 #include "orte/util/error_strings.h"
  32 #include "orte/util/name_fns.h"
  33 #include "orte/util/proc_info.h"
  34 #include "orte/util/session_dir.h"
  35 #include "orte/util/show_help.h"
  36 #include "orte/util/threads.h"
  37 
  38 #include "orte/mca/iof/base/base.h"
  39 #include "orte/mca/rml/rml.h"
  40 #include "orte/mca/odls/odls.h"
  41 #include "orte/mca/odls/base/base.h"
  42 #include "orte/mca/odls/base/odls_private.h"
  43 #include "orte/mca/plm/plm_types.h"
  44 #include "orte/mca/routed/routed.h"
  45 #include "orte/mca/ess/ess.h"
  46 #include "orte/mca/state/state.h"
  47 
  48 #include "orte/runtime/orte_wait.h"
  49 #include "orte/runtime/orte_quit.h"
  50 #include "orte/runtime/orte_globals.h"
  51 #include "orte/runtime/data_type_support/orte_dt_support.h"
  52 
  53 #include "orte/mca/errmgr/errmgr.h"
  54 #include "orte/mca/errmgr/base/base.h"
  55 #include "orte/mca/errmgr/base/errmgr_private.h"
  56 
  57 #include "errmgr_default_orted.h"
  58 
  59 /*
  60  * Module functions: Global
  61  */
  62 static int init(void);
  63 static int finalize(void);
  64 static void orted_abort(int error_code, char *fmt, ...);
  65 
  66 /******************
  67  * default_orted module
  68  ******************/
  69 orte_errmgr_base_module_t orte_errmgr_default_orted_module = {
  70     .init = init,
  71     .finalize = finalize,
  72     .logfn = orte_errmgr_base_log,
  73     .abort = orted_abort,
  74     .abort_peers = orte_errmgr_base_abort_peers
  75 };
  76 
  77 /* Local functions */
  78 static bool any_live_children(orte_jobid_t job);
  79 static int pack_state_update(opal_buffer_t *alert, orte_job_t *jobdat);
  80 static int pack_state_for_proc(opal_buffer_t *alert, orte_proc_t *child);
  81 static void failed_start(orte_job_t *jobdat);
  82 static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
  83 
  84 static void job_errors(int fd, short args, void *cbdata);
  85 static void proc_errors(int fd, short args, void *cbdata);
  86 
  87 /************************
  88  * API Definitions
  89  ************************/
  90 static int init(void)
  91 {
  92     /* setup state machine to trap job errors */
  93     orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI);
  94 
  95     /* set the lost connection state to run at MSG priority so
  96      * we can process any last messages from the proc
  97      */
  98     orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI);
  99 
 100     /* setup state machine to trap proc errors */
 101     orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
 102 
 103     return ORTE_SUCCESS;
 104 }
 105 
 106 static int finalize(void)
 107 {
 108     return ORTE_SUCCESS;
 109 }
 110 
 111 static void wakeup(int sd, short args, void *cbdata)
 112 {
 113     /* nothing more we can do */
 114     ORTE_ACQUIRE_OBJECT(cbdata);
 115     orte_quit(0, 0, NULL);
 116 }
 117 
 118 /* this function only gets called when FORCED_TERMINATE
 119  * has been invoked, which means that there is some
 120  * internal failure (e.g., to pack/unpack a correct value).
 121  * We could just exit, but that doesn't result in any
 122  * meaningful error message to the user. Likewise, just
 123  * printing something to stdout/stderr won't necessarily
 124  * get back to the user. Instead, we will send an error
 125  * report to mpirun and give it a chance to order our
 126  * termination. In order to ensure we _do_ terminate,
 127  * we set a timer - if it fires before we receive the
 128  * termination command, then we will exit on our own. This
 129  * protects us in the case that the failure is in the
 130  * messaging system itself */
 131 static void orted_abort(int error_code, char *fmt, ...)
 132 {
 133     va_list arglist;
 134     char *outmsg = NULL;
 135     orte_plm_cmd_flag_t cmd;
 136     opal_buffer_t *alert;
 137     orte_vpid_t null=ORTE_VPID_INVALID;
 138     orte_proc_state_t state = ORTE_PROC_STATE_CALLED_ABORT;
 139     orte_timer_t *timer;
 140     int rc;
 141 
 142     /* only do this once */
 143     if (orte_abnormal_term_ordered) {
 144         return;
 145     }
 146 
 147     /* set the aborting flag */
 148     orte_abnormal_term_ordered = true;
 149 
 150     /* If there was a message, construct it */
 151     va_start(arglist, fmt);
 152     if (NULL != fmt) {
 153         opal_vasprintf(&outmsg, fmt, arglist);
 154     }
 155     va_end(arglist);
 156 
 157     /* use the show-help system to get the message out */
 158     orte_show_help("help-errmgr-base.txt", "simple-message", true, outmsg);
 159 
 160     /* tell the HNP we are in distress */
 161     alert = OBJ_NEW(opal_buffer_t);
 162     /* pack update state command */
 163     cmd = ORTE_PLM_UPDATE_PROC_STATE;
 164     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
 165         ORTE_ERROR_LOG(rc);
 166         OBJ_RELEASE(alert);
 167         goto cleanup;
 168     }
 169     /* pack the jobid */
 170     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->jobid, 1, ORTE_JOBID))) {
 171         ORTE_ERROR_LOG(rc);
 172         OBJ_RELEASE(alert);
 173         goto cleanup;
 174     }
 175     /* pack our vpid */
 176     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->vpid, 1, ORTE_VPID))) {
 177         ORTE_ERROR_LOG(rc);
 178         OBJ_RELEASE(alert);
 179         goto cleanup;
 180     }
 181     /* pack our pid */
 182     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &orte_process_info.pid, 1, OPAL_PID))) {
 183         ORTE_ERROR_LOG(rc);
 184         OBJ_RELEASE(alert);
 185         goto cleanup;
 186     }
 187     /* pack our state */
 188     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &state, 1, ORTE_PROC_STATE))) {
 189         ORTE_ERROR_LOG(rc);
 190         OBJ_RELEASE(alert);
 191         goto cleanup;
 192     }
 193     /* pack our exit code */
 194     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &error_code, 1, ORTE_EXIT_CODE))) {
 195         ORTE_ERROR_LOG(rc);
 196         OBJ_RELEASE(alert);
 197         goto cleanup;
 198     }
 199     /* flag that this job is complete so the receiver can know */
 200     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
 201         ORTE_ERROR_LOG(rc);
 202         OBJ_RELEASE(alert);
 203         goto cleanup;
 204     }
 205 
 206     /* send it */
 207     if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
 208                                           ORTE_RML_TAG_PLM,
 209                                           orte_rml_send_callback, NULL))) {
 210         ORTE_ERROR_LOG(rc);
 211         OBJ_RELEASE(alert);
 212         /* we can't communicate, so give up */
 213         orte_quit(0, 0, NULL);
 214         return;
 215     }
 216 
 217   cleanup:
 218     /* set a timer for exiting - this also gives the message a chance
 219      * to get out! */
 220     if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
 221         ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
 222         return;
 223     }
 224     timer->tv.tv_sec = 5;
 225     timer->tv.tv_usec = 0;
 226     opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
 227     opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
 228     ORTE_POST_OBJECT(timer);
 229     opal_event_evtimer_add(timer->ev, &timer->tv);
 230 
 231 }
 232 
 233 static void job_errors(int fd, short args, void *cbdata)
 234 {
 235     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 236     orte_job_t *jdata;
 237     orte_job_state_t jobstate;
 238     int rc;
 239     orte_plm_cmd_flag_t cmd;
 240     opal_buffer_t *alert;
 241 
 242     ORTE_ACQUIRE_OBJECT(caddy);
 243 
 244     /*
 245      * if orte is trying to shutdown, just let it
 246      */
 247     if (orte_finalizing) {
 248         return;
 249     }
 250 
 251     /* if the jdata is NULL, then we abort as this
 252      * is reporting an unrecoverable error
 253      */
 254     if (NULL == caddy->jdata) {
 255         ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
 256         OBJ_RELEASE(caddy);
 257         return;
 258     }
 259 
 260     /* update the state */
 261     jdata = caddy->jdata;
 262     jobstate = caddy->job_state;
 263     jdata->state = jobstate;
 264 
 265     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 266                          "%s errmgr:default_orted: job %s reported error state %s",
 267                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 268                          ORTE_JOBID_PRINT(jdata->jobid),
 269                          orte_job_state_to_str(jobstate)));
 270 
 271     switch (jobstate) {
 272     case ORTE_JOB_STATE_FAILED_TO_START:
 273         failed_start(jdata);
 274         break;
 275     case ORTE_JOB_STATE_COMM_FAILED:
 276         /* kill all local procs */
 277         killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
 278         /* order termination */
 279         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 280         goto cleanup;
 281         break;
 282     case ORTE_JOB_STATE_HEARTBEAT_FAILED:
 283         /* let the HNP handle this */
 284         goto cleanup;
 285         break;
 286 
 287     default:
 288         break;
 289     }
 290     alert = OBJ_NEW(opal_buffer_t);
 291     /* pack update state command */
 292     cmd = ORTE_PLM_UPDATE_PROC_STATE;
 293     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
 294         ORTE_ERROR_LOG(rc);
 295         OBJ_RELEASE(alert);
 296         goto cleanup;
 297     }
 298     /* pack the job info */
 299     if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
 300         ORTE_ERROR_LOG(rc);
 301         OBJ_RELEASE(alert);
 302         goto cleanup;
 303     }
 304     /* send it */
 305     if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
 306                                           ORTE_RML_TAG_PLM,
 307                                           orte_rml_send_callback, NULL))) {
 308         ORTE_ERROR_LOG(rc);
 309         OBJ_RELEASE(alert);
 310     }
 311 
 312  cleanup:
 313     OBJ_RELEASE(caddy);
 314 }
 315 
 316 static void proc_errors(int fd, short args, void *cbdata)
 317 {
 318     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 319     orte_job_t *jdata;
 320     orte_process_name_t *proc = &caddy->name;
 321     orte_proc_state_t state = caddy->proc_state;
 322     orte_proc_t *child, *ptr;
 323     opal_buffer_t *alert;
 324     orte_plm_cmd_flag_t cmd;
 325     int rc=ORTE_SUCCESS;
 326     int i;
 327     orte_wait_tracker_t *t2;
 328 
 329     ORTE_ACQUIRE_OBJECT(caddy);
 330 
 331     OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 332                          "%s errmgr:default_orted:proc_errors process %s error state %s",
 333                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 334                          ORTE_NAME_PRINT(proc),
 335                          orte_proc_state_to_str(state)));
 336 
 337     /*
 338      * if orte is trying to shutdown, just let it
 339      */
 340     if (orte_finalizing) {
 341         OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 342                              "%s errmgr:default_orted:proc_errors finalizing - ignoring error",
 343                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 344         goto cleanup;
 345     }
 346 
 347     /* if this is a heartbeat failure, let the HNP handle it */
 348     if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
 349         OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 350                              "%s errmgr:default_orted:proc_errors heartbeat failed - ignoring error",
 351                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 352         goto cleanup;
 353     }
 354 
 355     /* if this was a failed comm, then see if it was to our
 356      * lifeline
 357      */
 358     if (ORTE_PROC_STATE_LIFELINE_LOST == state ||
 359         ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state ||
 360         ORTE_PROC_STATE_NO_PATH_TO_TARGET == state ||
 361         ORTE_PROC_STATE_PEER_UNKNOWN == state ||
 362         ORTE_PROC_STATE_FAILED_TO_CONNECT == state) {
 363         OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 364                              "%s errmgr:orted lifeline lost or unable to communicate - exiting",
 365                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 366         /* set our exit status */
 367         ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 368         /* kill our children */
 369         killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
 370         /* terminate - our routed children will see
 371          * us leave and automatically die
 372          */
 373         orte_quit(0, 0, NULL);
 374         goto cleanup;
 375     }
 376 
 377     /* get the job object */
 378     if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
 379         /* must already be complete */
 380         OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 381                              "%s errmgr:default_orted:proc_errors NULL jdata - ignoring error",
 382                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 383         goto cleanup;
 384     }
 385 
 386     if (ORTE_PROC_STATE_COMM_FAILED == state) {
 387         /* if it is our own connection, ignore it */
 388         if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
 389             OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 390                                  "%s errmgr:default_orted:proc_errors comm_failed to self - ignoring error",
 391                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 392             goto cleanup;
 393         }
 394         /* was it a daemon? */
 395         if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
 396             /* nope - we can't seem to trust that we will catch the waitpid
 397              * in this situation, so push this over to be handled as if
 398              * it were a waitpid trigger so we don't create a bunch of
 399              * duplicate code */
 400             OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 401                                  "%s errmgr:default_orted:proc_errors comm_failed to non-daemon - handling as waitpid",
 402                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 403             /* get the proc_t */
 404             if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
 405                 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 406                 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 407                 goto cleanup;
 408             }
 409             /* leave the exit code alone - process this as a waitpid */
 410             t2 = OBJ_NEW(orte_wait_tracker_t);
 411             OBJ_RETAIN(child);  // protect against race conditions
 412             t2->child = child;
 413             t2->evb = orte_event_base;
 414             opal_event_set(t2->evb, &t2->ev, -1,
 415                            OPAL_EV_WRITE, orte_odls_base_default_wait_local_proc, t2);
 416             opal_event_set_priority(&t2->ev, ORTE_MSG_PRI);
 417             opal_event_active(&t2->ev, OPAL_EV_WRITE, 1);
 418             goto cleanup;
 419         }
 420         OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 421                              "%s errmgr:default:orted daemon %s exited",
 422                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 423                              ORTE_NAME_PRINT(proc)));
 424 
 425         if (orte_orteds_term_ordered) {
 426             /* are any of my children still alive */
 427             for (i=0; i < orte_local_children->size; i++) {
 428                 if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
 429                     if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
 430                         OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
 431                                              "%s errmgr:default:orted[%s(%d)] proc %s is alive",
 432                                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 433                                              __FILE__, __LINE__,
 434                                              ORTE_NAME_PRINT(&child->name)));
 435                         goto cleanup;
 436                     }
 437                 }
 438             }
 439             /* if all my routes and children are gone, then terminate
 440                ourselves nicely (i.e., this is a normal termination) */
 441             if (0 == orte_routed.num_routes()) {
 442                 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 443                                      "%s errmgr:default:orted all routes gone - exiting",
 444                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 445                 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 446             } else {
 447                 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 448                                      "%s errmgr:default:orted not exiting, num_routes() == %d",
 449                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 450                                      (int)orte_routed.num_routes()));
 451             }
 452         }
 453         /* if not, then we can continue */
 454         goto cleanup;
 455     }
 456 
 457     if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
 458         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 459         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 460         goto cleanup;
 461     }
 462     /* if this is not a local proc for this job, we can
 463      * ignore this call
 464      */
 465     if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_LOCAL)) {
 466         OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 467                              "%s errmgr:default_orted:proc_errors proc is not local - ignoring error",
 468                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 469         goto cleanup;
 470     }
 471 
 472     OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 473                          "%s errmgr:default_orted got state %s for proc %s",
 474                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 475                          orte_proc_state_to_str(state),
 476                          ORTE_NAME_PRINT(proc)));
 477 
 478     if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
 479         /* update the state */
 480         child->state = state;
 481         /* report this as abnormal termination to the HNP, unless we already have
 482          * done so for this job */
 483         if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
 484             alert = OBJ_NEW(opal_buffer_t);
 485             /* pack update state command */
 486             cmd = ORTE_PLM_UPDATE_PROC_STATE;
 487             if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
 488                 ORTE_ERROR_LOG(rc);
 489                 return;
 490             }
 491             /* pack only the data for this proc - have to start with the jobid
 492              * so the receiver can unpack it correctly
 493              */
 494             if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
 495                 ORTE_ERROR_LOG(rc);
 496                 return;
 497             }
 498 
 499             /* now pack the child's info */
 500             if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
 501                 ORTE_ERROR_LOG(rc);
 502                 return;
 503             }
 504             /* send it */
 505             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 506                                  "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)",
 507                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 508                                  ORTE_NAME_PRINT(&child->name),
 509                                  jdata->num_local_procs));
 510             if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
 511                                                   ORTE_RML_TAG_PLM,
 512                                                   orte_rml_send_callback, NULL))) {
 513                 ORTE_ERROR_LOG(rc);
 514                 OBJ_RELEASE(alert);
 515             }
 516             /* mark that we notified the HNP for this job so we don't do it again */
 517             orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
 518         }
 519         /* if the proc has terminated, notify the state machine */
 520         if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
 521             ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
 522             !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
 523             ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
 524         }
 525         goto cleanup;
 526     }
 527 
 528     if (ORTE_PROC_STATE_FAILED_TO_START == state ||
 529         ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) {
 530         /* update the proc state */
 531         child->state = state;
 532         /* count the proc as having "terminated" */
 533         jdata->num_terminated++;
 534         /* leave the error report in this case to the
 535          * state machine, which will receive notice
 536          * when all local procs have attempted to start
 537          * so that we send a consolidated error report
 538          * back to the HNP
 539          */
 540         if (jdata->num_local_procs == jdata->num_terminated) {
 541             /* let the state machine know */
 542             if (ORTE_PROC_STATE_FAILED_TO_START == state) {
 543                 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
 544             } else {
 545                 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
 546             }
 547         }
 548         goto cleanup;
 549     }
 550 
 551     if (ORTE_PROC_STATE_TERMINATED < state) {
 552         /* if we were ordered to terminate, see if
 553          * any of our routes or local children remain alive - if not, then
 554          * terminate ourselves. */
 555         if (orte_orteds_term_ordered) {
 556             /* mark the child as no longer alive and update the counters, if necessary.
 557              * we have to do this here as we aren't going to send this to the state
 558              * machine, and we want to keep the bookkeeping accurate just in case */
 559             if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
 560                 ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
 561             }
 562             if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
 563                 ORTE_FLAG_SET(child, ORTE_PROC_FLAG_RECORDED);
 564                 jdata->num_terminated++;
 565             }
 566             for (i=0; i < orte_local_children->size; i++) {
 567                 if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
 568                     if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
 569                         goto keep_going;
 570                     }
 571                 }
 572             }
 573             /* if all my routes and children are gone, then terminate
 574                ourselves nicely (i.e., this is a normal termination) */
 575             if (0 == orte_routed.num_routes()) {
 576                 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
 577                                      "%s errmgr:default:orted all routes gone - exiting",
 578                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 579                 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 580             }
 581             /* no need to alert the HNP - we are already on our way out */
 582             goto cleanup;
 583         }
 584 
 585     keep_going:
 586         /* if the job hasn't completed and the state is abnormally
 587          * terminated, then we need to alert the HNP right away - but
 588          * only do this once!
 589          */
 590         if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
 591             alert = OBJ_NEW(opal_buffer_t);
 592             /* pack update state command */
 593             cmd = ORTE_PLM_UPDATE_PROC_STATE;
 594             if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
 595                 ORTE_ERROR_LOG(rc);
 596                 return;
 597             }
 598             /* pack only the data for this proc - have to start with the jobid
 599              * so the receiver can unpack it correctly
 600              */
 601             if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
 602                 ORTE_ERROR_LOG(rc);
 603                 return;
 604             }
 605             child->state = state;
 606             /* now pack the child's info */
 607             if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
 608                 ORTE_ERROR_LOG(rc);
 609                 return;
 610             }
 611             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 612                                  "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
 613                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 614                                  ORTE_NAME_PRINT(&child->name),
 615                                  jdata->num_local_procs));
 616             /* send it */
 617             if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
 618                                                   ORTE_RML_TAG_PLM,
 619                                                   orte_rml_send_callback, NULL))) {
 620                 ORTE_ERROR_LOG(rc);
 621             }
 622             /* mark that we notified the HNP for this job so we don't do it again */
 623             orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
 624         }
 625         /* if the proc has terminated, notify the state machine */
 626         if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
 627             ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
 628             !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
 629             ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
 630         }
 631         goto cleanup;
 632     }
 633 
 634     /* only other state is terminated - see if anyone is left alive */
 635     if (!any_live_children(proc->jobid)) {
 636         alert = OBJ_NEW(opal_buffer_t);
 637         /* pack update state command */
 638         cmd = ORTE_PLM_UPDATE_PROC_STATE;
 639         if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
 640             ORTE_ERROR_LOG(rc);
 641             return;
 642         }
 643         /* pack the data for the job */
 644         if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
 645             ORTE_ERROR_LOG(rc);
 646             return;
 647         }
 648 
 649         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
 650                              "%s errmgr:default_orted reporting all procs in %s terminated",
 651                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 652                              ORTE_JOBID_PRINT(jdata->jobid)));
 653 
 654         /* remove all of this job's children from the global list */
 655         for (i=0; i < orte_local_children->size; i++) {
 656             if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
 657                 continue;
 658             }
 659             if (jdata->jobid == ptr->name.jobid) {
 660                 opal_pointer_array_set_item(orte_local_children, i, NULL);
 661                 OBJ_RELEASE(ptr);
 662             }
 663         }
 664 
 665         /* ensure the job's local session directory tree is removed */
 666         orte_session_dir_cleanup(jdata->jobid);
 667 
 668         /* remove this job from our local job data since it is complete */
 669         OBJ_RELEASE(jdata);
 670 
 671         /* send it */
 672         if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
 673                                               ORTE_RML_TAG_PLM,
 674                                               orte_rml_send_callback, NULL))) {
 675             ORTE_ERROR_LOG(rc);
 676         }
 677         return;
 678     }
 679 
 680   cleanup:
 681     OBJ_RELEASE(caddy);
 682 }
 683 
 684 /*****************
 685  * Local Functions
 686  *****************/
 687 static bool any_live_children(orte_jobid_t job)
 688 {
 689     int i;
 690     orte_proc_t *child;
 691 
 692     for (i=0; i < orte_local_children->size; i++) {
 693         if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
 694             continue;
 695         }
 696         /* is this child part of the specified job? */
 697         if ((job == child->name.jobid || ORTE_JOBID_WILDCARD == job) &&
 698             ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
 699             return true;
 700         }
 701     }
 702 
 703     /* if we get here, then nobody is left alive from that job */
 704     return false;
 705 
 706 }
 707 
 708 static int pack_state_for_proc(opal_buffer_t *alert, orte_proc_t *child)
 709 {
 710     int rc;
 711 
 712     /* pack the child's vpid */
 713     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name.vpid), 1, ORTE_VPID))) {
 714         ORTE_ERROR_LOG(rc);
 715         return rc;
 716     }
 717     /* pack the pid */
 718     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) {
 719         ORTE_ERROR_LOG(rc);
 720         return rc;
 721     }
 722     /* pack its state */
 723     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) {
 724         ORTE_ERROR_LOG(rc);
 725         return rc;
 726     }
 727     /* pack its exit code */
 728     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE))) {
 729         ORTE_ERROR_LOG(rc);
 730         return rc;
 731     }
 732 
 733     return ORTE_SUCCESS;
 734 }
 735 
 736 static int pack_state_update(opal_buffer_t *alert, orte_job_t *jobdat)
 737 {
 738     int rc, i;
 739     orte_proc_t *child;
 740     orte_vpid_t null=ORTE_VPID_INVALID;
 741 
 742     /* pack the jobid */
 743     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) {
 744         ORTE_ERROR_LOG(rc);
 745         return rc;
 746     }
 747     for (i=0; i < orte_local_children->size; i++) {
 748         if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
 749             continue;
 750         }
 751         /* if this child is part of the job... */
 752         if (child->name.jobid == jobdat->jobid) {
 753             if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
 754                 ORTE_ERROR_LOG(rc);
 755                 return rc;
 756             }
 757         }
 758     }
 759     /* flag that this job is complete so the receiver can know */
 760     if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
 761         ORTE_ERROR_LOG(rc);
 762         return rc;
 763     }
 764 
 765     return ORTE_SUCCESS;
 766 }
 767 
 768 static void failed_start(orte_job_t *jobdat)
 769 {
 770     int i;
 771     orte_proc_t *child;
 772 
 773     /* set the state */
 774     jobdat->state = ORTE_JOB_STATE_FAILED_TO_START;
 775 
 776     for (i=0; i < orte_local_children->size; i++) {
 777         if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
 778             continue;
 779         }
 780         /* is this child part of the specified job? */
 781         if (child->name.jobid == jobdat->jobid) {
 782             if (ORTE_PROC_STATE_FAILED_TO_START == child->state) {
 783                 /* this proc never launched - flag that the iof
 784                  * is complete or else we will hang waiting for
 785                  * pipes to close that were never opened
 786                  */
 787                 ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
 788                 /* ditto for waitpid */
 789                 ORTE_FLAG_SET(child, ORTE_PROC_FLAG_WAITPID);
 790             }
 791         }
 792     }
 793     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 794                          "%s errmgr:hnp: job %s reported incomplete start",
 795                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 796                          ORTE_JOBID_PRINT(jobdat->jobid)));
 797     return;
 798 }
 799 
 800 static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
 801 {
 802     opal_pointer_array_t cmd;
 803     orte_proc_t proc;
 804     int rc;
 805 
 806     if (ORTE_JOBID_WILDCARD == job
 807         && ORTE_VPID_WILDCARD == vpid) {
 808         if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
 809             ORTE_ERROR_LOG(rc);
 810         }
 811         return;
 812     }
 813 
 814     OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
 815     OBJ_CONSTRUCT(&proc, orte_proc_t);
 816     proc.name.jobid = job;
 817     proc.name.vpid = vpid;
 818     opal_pointer_array_add(&cmd, &proc);
 819     if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
 820         ORTE_ERROR_LOG(rc);
 821     }
 822     OBJ_DESTRUCT(&cmd);
 823     OBJ_DESTRUCT(&proc);
 824 }

/* [<][>][^][v][top][bottom][index][help] */