root/orte/mca/errmgr/default_app/errmgr_default_app.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. register_cbfunc
  2. notify_cbfunc
  3. init
  4. finalize
  5. proc_errors
  6. abort_peers

   1 /*
   2  * Copyright (c) 2009-2011 The Trustees of Indiana University.
   3  *                         All rights reserved.
   4  *
   5  * Copyright (c) 2010      Cisco Systems, Inc.  All rights reserved.
   6  *
   7  * Copyright (c) 2004-2018 The University of Tennessee and The University
   8  *                         of Tennessee Research Foundation.  All rights
   9  *                         reserved.
  10  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
  11  *                         All rights reserved.
  12  * Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
  13  * $COPYRIGHT$
  14  *
  15  * Additional copyrights may follow
  16  *
  17  * $HEADER$
  18  */
  19 
  20 #include "orte_config.h"
  21 
  22 #include <sys/types.h>
  23 #ifdef HAVE_UNISTD_H
  24 #include <unistd.h>
  25 #endif  /* HAVE_UNISTD_H */
  26 #include <string.h>
  27 
  28 #include "opal/util/output.h"
  29 #include "opal/dss/dss.h"
  30 #include "opal/mca/pmix/pmix.h"
  31 
  32 #include "orte/util/error_strings.h"
  33 #include "orte/util/name_fns.h"
  34 #include "orte/util/show_help.h"
  35 #include "orte/util/threads.h"
  36 #include "orte/runtime/orte_globals.h"
  37 #include "orte/runtime/orte_wait.h"
  38 #include "orte/mca/rml/rml.h"
  39 #include "orte/mca/odls/odls_types.h"
  40 #include "orte/mca/state/state.h"
  41 
  42 #include "orte/mca/errmgr/base/base.h"
  43 #include "orte/mca/errmgr/base/errmgr_private.h"
  44 #include "errmgr_default_app.h"
  45 
  46 /*
  47  * Module functions: file-local (static) entry points, forward-declared so the module struct below can reference them
  48  */
  49  static int init(void);       /* installs the proc-error state handler and registers the PMIx event handler */
  50  static int finalize(void);   /* deregisters the PMIx event handler if one is recorded */
  51 
  52  static int abort_peers(orte_process_name_t *procs,
  53                         orte_std_cntr_t num_procs,
  54                         int error_code);   /* procs and num_procs are not used by the implementation in this file */
  55 
  56 /******************
  57  * default_app module: function table for this errmgr component (non-static, so visible outside this file)
  58  ******************/
  59  orte_errmgr_base_module_t orte_errmgr_default_app_module = {
  60     .init = init,
  61     .finalize = finalize,
  62     .logfn = orte_errmgr_base_log,      /* not defined in this file */
  63     .abort = orte_errmgr_base_abort,    /* not defined in this file; also called by abort_peers() */
  64     .abort_peers = abort_peers
  65 };
  66 
  67 static void proc_errors(int fd, short args, void *cbdata);   /* handler that init() attaches to ORTE_PROC_STATE_ERROR */
  68 
  69 static size_t myerrhandle = SIZE_MAX;   /* PMIx event handler reference; SIZE_MAX is the "nothing to deregister" sentinel checked by finalize() */
  70 
  71 static void register_cbfunc(int status, size_t errhndler, void *cbdata)
  72 {
  73     orte_lock_t *lk = (orte_lock_t*)cbdata;
  74     myerrhandle = errhndler;
  75     ORTE_POST_OBJECT(lk);
  76     ORTE_WAKEUP_THREAD(lk);
  77 }
  78 
  79 static void notify_cbfunc(int status,
  80                           const opal_process_name_t *source,
  81                           opal_list_t *info, opal_list_t *results,
  82                           opal_pmix_notification_complete_fn_t cbfunc, void *cbdata)
  83 {
  84     orte_proc_state_t state;
  85 
  86     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
  87                         "%s errmgr:default_app: pmix event handler called with status %s",
  88                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
  89                         ORTE_ERROR_NAME(status)));
  90 
  91     /* we must convert the incoming status into an equivalent state
  92      * so we can activate the state machine */
  93     switch(status) {
  94         case OPAL_ERR_PROC_ABORTED:
  95             state = ORTE_PROC_STATE_ABORTED;
  96             break;
  97         case OPAL_ERR_PROC_REQUESTED_ABORT:
  98             state = ORTE_PROC_STATE_CALLED_ABORT;
  99             break;
 100         default:
 101             state = ORTE_PROC_STATE_TERMINATED;
 102     }
 103 
 104     /* push it into our event base */
 105     ORTE_ACTIVATE_PROC_STATE((orte_process_name_t*)source, state);
 106 
 107     /* let the caller know we processed this, but allow the
 108      * chain to continue */
 109     if (NULL != cbfunc) {
 110         cbfunc(ORTE_SUCCESS, NULL, NULL, NULL, cbdata);
 111     }
 112 }
 113 
 114 /************************
 115  * API Definitions
 116  ************************/
 117  static int init(void)
 118  {
 119     opal_list_t directives;
 120     orte_lock_t lock;
 121     opal_value_t *kv;
 122 
 123     /* setup state machine to trap proc errors */
 124     orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
 125 
 126     /* tie the default PMIx event handler back to us */
 127     ORTE_CONSTRUCT_LOCK(&lock);
 128     OBJ_CONSTRUCT(&directives, opal_list_t);
 129     kv = OBJ_NEW(opal_value_t);
 130     kv->key = strdup(OPAL_PMIX_EVENT_HDLR_NAME);
 131     kv->type = OPAL_STRING;
 132     kv->data.string = strdup("ORTE-APP-DEFAULT");
 133     opal_list_append(&directives, &kv->super);
 134     opal_pmix.register_evhandler(NULL, &directives, notify_cbfunc, register_cbfunc, (void*)&lock);
 135     ORTE_WAIT_THREAD(&lock);
 136     ORTE_DESTRUCT_LOCK(&lock);
 137     OPAL_LIST_DESTRUCT(&directives);
 138 
 139     return ORTE_SUCCESS;
 140 }
 141 
 142 static int finalize(void)
 143 {
 144     if (SIZE_MAX != myerrhandle) {
 145         opal_pmix.deregister_evhandler(myerrhandle, NULL, NULL);
 146         myerrhandle = SIZE_MAX;
 147     }
 148     return ORTE_SUCCESS;
 149 }
 150 
 151 static void proc_errors(int fd, short args, void *cbdata)
 152 {
 153     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 154     char *nodename;
 155 
 156     ORTE_ACQUIRE_OBJECT(caddy);
 157 
 158     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 159                         "%s errmgr:default_app: proc %s state %s",
 160                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 161                         ORTE_NAME_PRINT(&caddy->name),
 162                         orte_proc_state_to_str(caddy->proc_state)));
 163 
 164     /*
 165      * if orte is trying to shutdown, just let it
 166      */
 167      if (orte_finalizing) {
 168         OBJ_RELEASE(caddy);
 169         return;
 170     }
 171 
 172     if (ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == caddy->proc_state) {
 173         /* we can't send a message - print a message */
 174         nodename = orte_get_proc_hostname(&caddy->name);
 175         orte_show_help("help-errmgr-base",
 176                        "undeliverable-msg",
 177                        true, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 178                        orte_process_info.nodename,
 179                        ORTE_NAME_PRINT(&caddy->name),
 180                        (NULL == nodename) ? "Unknown" : nodename);
 181         /* flag that we must abnormally terminate as far as the
 182          * RTE is concerned
 183          */
 184          orte_abnormal_term_ordered = true;
 185      } else if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
 186         /* we need to die, so mark us so */
 187         orte_abnormal_term_ordered = true;
 188     }
 189 
 190     OBJ_RELEASE(caddy);
 191 }
 192 
 193 static int abort_peers(orte_process_name_t *procs,
 194                        orte_std_cntr_t num_procs,
 195                        int error_code)
 196 {
 197     /* just abort */
 198     if (0 < opal_output_get_verbosity(orte_errmgr_base_framework.framework_output)) {
 199         orte_errmgr_base_abort(error_code, "%s called abort_peers",
 200                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
 201     } else {
 202         orte_errmgr_base_abort(error_code, NULL);
 203     }
 204     return ORTE_SUCCESS;
 205 }

/* [<][>][^][v][top][bottom][index][help] */