This source file includes the following definitions.
- register_cbfunc
- notify_cbfunc
- init
- finalize
- proc_errors
- abort_peers
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 #include "orte_config.h"
  21 
  22 #include <sys/types.h>
  23 #ifdef HAVE_UNISTD_H
  24 #include <unistd.h>
  25 #endif  
  26 #include <string.h>
  27 
  28 #include "opal/util/output.h"
  29 #include "opal/dss/dss.h"
  30 #include "opal/mca/pmix/pmix.h"
  31 
  32 #include "orte/util/error_strings.h"
  33 #include "orte/util/name_fns.h"
  34 #include "orte/util/show_help.h"
  35 #include "orte/util/threads.h"
  36 #include "orte/runtime/orte_globals.h"
  37 #include "orte/runtime/orte_wait.h"
  38 #include "orte/mca/rml/rml.h"
  39 #include "orte/mca/odls/odls_types.h"
  40 #include "orte/mca/state/state.h"
  41 
  42 #include "orte/mca/errmgr/base/base.h"
  43 #include "orte/mca/errmgr/base/errmgr_private.h"
  44 #include "errmgr_default_app.h"
  45 
  46 
  47 
  48 
/* Forward declarations of the module entry points defined later in this file. */
static int init(void);
static int finalize(void);

static int abort_peers(orte_process_name_t *procs,
                       orte_std_cntr_t num_procs,
                       int error_code);

/*
 * Function table exported by this component.
 *
 * init, finalize and abort_peers are the static functions defined in this
 * file; logfn and abort are bound directly to the shared
 * orte_errmgr_base_log and orte_errmgr_base_abort implementations.
 */
orte_errmgr_base_module_t orte_errmgr_default_app_module = {
    .init = init,
    .finalize = finalize,
    .logfn = orte_errmgr_base_log,
    .abort = orte_errmgr_base_abort,
    .abort_peers = abort_peers
};
  66 
/* State-machine callback; init() installs it for ORTE_PROC_STATE_ERROR. */
static void proc_errors(int fd, short args, void *cbdata);

/*
 * Reference to the registered PMIx event handler.  SIZE_MAX is the
 * "nothing registered" sentinel: finalize() only deregisters when the value
 * differs from SIZE_MAX and resets it to SIZE_MAX afterwards.
 */
static size_t myerrhandle = SIZE_MAX;
  70 
  71 static void register_cbfunc(int status, size_t errhndler, void *cbdata)
  72 {
  73     orte_lock_t *lk = (orte_lock_t*)cbdata;
  74     myerrhandle = errhndler;
  75     ORTE_POST_OBJECT(lk);
  76     ORTE_WAKEUP_THREAD(lk);
  77 }
  78 
  79 static void notify_cbfunc(int status,
  80                           const opal_process_name_t *source,
  81                           opal_list_t *info, opal_list_t *results,
  82                           opal_pmix_notification_complete_fn_t cbfunc, void *cbdata)
  83 {
  84     orte_proc_state_t state;
  85 
  86     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
  87                         "%s errmgr:default_app: pmix event handler called with status %s",
  88                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
  89                         ORTE_ERROR_NAME(status)));
  90 
  91     
  92 
  93     switch(status) {
  94         case OPAL_ERR_PROC_ABORTED:
  95             state = ORTE_PROC_STATE_ABORTED;
  96             break;
  97         case OPAL_ERR_PROC_REQUESTED_ABORT:
  98             state = ORTE_PROC_STATE_CALLED_ABORT;
  99             break;
 100         default:
 101             state = ORTE_PROC_STATE_TERMINATED;
 102     }
 103 
 104     
 105     ORTE_ACTIVATE_PROC_STATE((orte_process_name_t*)source, state);
 106 
 107     
 108 
 109     if (NULL != cbfunc) {
 110         cbfunc(ORTE_SUCCESS, NULL, NULL, NULL, cbdata);
 111     }
 112 }
 113 
 114 
 115 
 116 
 117  static int init(void)
 118  {
 119     opal_list_t directives;
 120     orte_lock_t lock;
 121     opal_value_t *kv;
 122 
 123     
 124     orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
 125 
 126     
 127     ORTE_CONSTRUCT_LOCK(&lock);
 128     OBJ_CONSTRUCT(&directives, opal_list_t);
 129     kv = OBJ_NEW(opal_value_t);
 130     kv->key = strdup(OPAL_PMIX_EVENT_HDLR_NAME);
 131     kv->type = OPAL_STRING;
 132     kv->data.string = strdup("ORTE-APP-DEFAULT");
 133     opal_list_append(&directives, &kv->super);
 134     opal_pmix.register_evhandler(NULL, &directives, notify_cbfunc, register_cbfunc, (void*)&lock);
 135     ORTE_WAIT_THREAD(&lock);
 136     ORTE_DESTRUCT_LOCK(&lock);
 137     OPAL_LIST_DESTRUCT(&directives);
 138 
 139     return ORTE_SUCCESS;
 140 }
 141 
 142 static int finalize(void)
 143 {
 144     if (SIZE_MAX != myerrhandle) {
 145         opal_pmix.deregister_evhandler(myerrhandle, NULL, NULL);
 146         myerrhandle = SIZE_MAX;
 147     }
 148     return ORTE_SUCCESS;
 149 }
 150 
 151 static void proc_errors(int fd, short args, void *cbdata)
 152 {
 153     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 154     char *nodename;
 155 
 156     ORTE_ACQUIRE_OBJECT(caddy);
 157 
 158     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
 159                         "%s errmgr:default_app: proc %s state %s",
 160                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 161                         ORTE_NAME_PRINT(&caddy->name),
 162                         orte_proc_state_to_str(caddy->proc_state)));
 163 
 164     
 165 
 166 
 167      if (orte_finalizing) {
 168         OBJ_RELEASE(caddy);
 169         return;
 170     }
 171 
 172     if (ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == caddy->proc_state) {
 173         
 174         nodename = orte_get_proc_hostname(&caddy->name);
 175         orte_show_help("help-errmgr-base",
 176                        "undeliverable-msg",
 177                        true, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 178                        orte_process_info.nodename,
 179                        ORTE_NAME_PRINT(&caddy->name),
 180                        (NULL == nodename) ? "Unknown" : nodename);
 181         
 182 
 183 
 184          orte_abnormal_term_ordered = true;
 185      } else if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
 186         
 187         orte_abnormal_term_ordered = true;
 188     }
 189 
 190     OBJ_RELEASE(caddy);
 191 }
 192 
 193 static int abort_peers(orte_process_name_t *procs,
 194                        orte_std_cntr_t num_procs,
 195                        int error_code)
 196 {
 197     
 198     if (0 < opal_output_get_verbosity(orte_errmgr_base_framework.framework_output)) {
 199         orte_errmgr_base_abort(error_code, "%s called abort_peers",
 200                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
 201     } else {
 202         orte_errmgr_base_abort(error_code, NULL);
 203     }
 204     return ORTE_SUCCESS;
 205 }