This source file includes the following definitions.
- register_cbfunc
- notify_cbfunc
- init
- finalize
- proc_errors
- abort_peers
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 #include "orte_config.h"
21
22 #include <sys/types.h>
23 #ifdef HAVE_UNISTD_H
24 #include <unistd.h>
25 #endif
26 #include <string.h>
27
28 #include "opal/util/output.h"
29 #include "opal/dss/dss.h"
30 #include "opal/mca/pmix/pmix.h"
31
32 #include "orte/util/error_strings.h"
33 #include "orte/util/name_fns.h"
34 #include "orte/util/show_help.h"
35 #include "orte/util/threads.h"
36 #include "orte/runtime/orte_globals.h"
37 #include "orte/runtime/orte_wait.h"
38 #include "orte/mca/rml/rml.h"
39 #include "orte/mca/odls/odls_types.h"
40 #include "orte/mca/state/state.h"
41
42 #include "orte/mca/errmgr/base/base.h"
43 #include "orte/mca/errmgr/base/errmgr_private.h"
44 #include "errmgr_default_app.h"
45
46
47
48
49 static int init(void);
50 static int finalize(void);
51
52 static int abort_peers(orte_process_name_t *procs,
53 orte_std_cntr_t num_procs,
54 int error_code);
55
56
57
58
59 orte_errmgr_base_module_t orte_errmgr_default_app_module = {
60 .init = init,
61 .finalize = finalize,
62 .logfn = orte_errmgr_base_log,
63 .abort = orte_errmgr_base_abort,
64 .abort_peers = abort_peers
65 };
66
/* State-machine callback registered by init() for ORTE_PROC_STATE_ERROR. */
static void proc_errors(int fd, short args, void *cbdata);

/*
 * Reference of the registered PMIx event handler. SIZE_MAX is the
 * "nothing registered" sentinel that finalize() tests against.
 */
static size_t myerrhandle = SIZE_MAX;
70
71 static void register_cbfunc(int status, size_t errhndler, void *cbdata)
72 {
73 orte_lock_t *lk = (orte_lock_t*)cbdata;
74 myerrhandle = errhndler;
75 ORTE_POST_OBJECT(lk);
76 ORTE_WAKEUP_THREAD(lk);
77 }
78
79 static void notify_cbfunc(int status,
80 const opal_process_name_t *source,
81 opal_list_t *info, opal_list_t *results,
82 opal_pmix_notification_complete_fn_t cbfunc, void *cbdata)
83 {
84 orte_proc_state_t state;
85
86 OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
87 "%s errmgr:default_app: pmix event handler called with status %s",
88 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
89 ORTE_ERROR_NAME(status)));
90
91
92
93 switch(status) {
94 case OPAL_ERR_PROC_ABORTED:
95 state = ORTE_PROC_STATE_ABORTED;
96 break;
97 case OPAL_ERR_PROC_REQUESTED_ABORT:
98 state = ORTE_PROC_STATE_CALLED_ABORT;
99 break;
100 default:
101 state = ORTE_PROC_STATE_TERMINATED;
102 }
103
104
105 ORTE_ACTIVATE_PROC_STATE((orte_process_name_t*)source, state);
106
107
108
109 if (NULL != cbfunc) {
110 cbfunc(ORTE_SUCCESS, NULL, NULL, NULL, cbdata);
111 }
112 }
113
114
115
116
117 static int init(void)
118 {
119 opal_list_t directives;
120 orte_lock_t lock;
121 opal_value_t *kv;
122
123
124 orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
125
126
127 ORTE_CONSTRUCT_LOCK(&lock);
128 OBJ_CONSTRUCT(&directives, opal_list_t);
129 kv = OBJ_NEW(opal_value_t);
130 kv->key = strdup(OPAL_PMIX_EVENT_HDLR_NAME);
131 kv->type = OPAL_STRING;
132 kv->data.string = strdup("ORTE-APP-DEFAULT");
133 opal_list_append(&directives, &kv->super);
134 opal_pmix.register_evhandler(NULL, &directives, notify_cbfunc, register_cbfunc, (void*)&lock);
135 ORTE_WAIT_THREAD(&lock);
136 ORTE_DESTRUCT_LOCK(&lock);
137 OPAL_LIST_DESTRUCT(&directives);
138
139 return ORTE_SUCCESS;
140 }
141
142 static int finalize(void)
143 {
144 if (SIZE_MAX != myerrhandle) {
145 opal_pmix.deregister_evhandler(myerrhandle, NULL, NULL);
146 myerrhandle = SIZE_MAX;
147 }
148 return ORTE_SUCCESS;
149 }
150
151 static void proc_errors(int fd, short args, void *cbdata)
152 {
153 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
154 char *nodename;
155
156 ORTE_ACQUIRE_OBJECT(caddy);
157
158 OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
159 "%s errmgr:default_app: proc %s state %s",
160 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
161 ORTE_NAME_PRINT(&caddy->name),
162 orte_proc_state_to_str(caddy->proc_state)));
163
164
165
166
167 if (orte_finalizing) {
168 OBJ_RELEASE(caddy);
169 return;
170 }
171
172 if (ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == caddy->proc_state) {
173
174 nodename = orte_get_proc_hostname(&caddy->name);
175 orte_show_help("help-errmgr-base",
176 "undeliverable-msg",
177 true, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
178 orte_process_info.nodename,
179 ORTE_NAME_PRINT(&caddy->name),
180 (NULL == nodename) ? "Unknown" : nodename);
181
182
183
184 orte_abnormal_term_ordered = true;
185 } else if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
186
187 orte_abnormal_term_ordered = true;
188 }
189
190 OBJ_RELEASE(caddy);
191 }
192
193 static int abort_peers(orte_process_name_t *procs,
194 orte_std_cntr_t num_procs,
195 int error_code)
196 {
197
198 if (0 < opal_output_get_verbosity(orte_errmgr_base_framework.framework_output)) {
199 orte_errmgr_base_abort(error_code, "%s called abort_peers",
200 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
201 } else {
202 orte_errmgr_base_abort(error_code, NULL);
203 }
204 return ORTE_SUCCESS;
205 }