This source file includes the following definitions.
- orte_quit
- orte_print_aborted_job
- dump_aborted_procs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 #include "orte_config.h"
27 #include "orte/constants.h"
28
29 #include <string.h>
30 #include <stdio.h>
31 #ifdef HAVE_UNISTD_H
32 #include <unistd.h>
33 #endif
34 #ifdef HAVE_SYS_PARAM_H
35 #include <sys/param.h>
36 #endif
37 #include <errno.h>
38 #include <signal.h>
39 #include <ctype.h>
40 #ifdef HAVE_SYS_TYPES_H
41 #include <sys/types.h>
42 #endif
43 #ifdef HAVE_SYS_WAIT_H
44 #include <sys/wait.h>
45 #endif
46 #ifdef HAVE_SYS_TIME_H
47 #include <sys/time.h>
48 #endif
49
50 #include "orte/mca/plm/plm.h"
51 #include "orte/mca/errmgr/errmgr.h"
52 #include "orte/mca/routed/routed.h"
53 #include "orte/mca/state/state.h"
54
55 #include "orte/util/session_dir.h"
56 #include "orte/util/show_help.h"
57 #include "orte/util/threads.h"
58
59 #include "orte/runtime/runtime.h"
60 #include "orte/runtime/orte_globals.h"
61 #include "orte/runtime/orte_quit.h"
62 #include "orte/runtime/orte_locks.h"
63 #include "orte/runtime/orte_data_server.h"
64
65
66
67
/*
 * File-local bookkeeping for the exit-time error report.
 *
 * The three counters are filled in by dump_aborted_procs() while it walks
 * the procs of each job, and are read by orte_quit() to print the
 * "total processes ..." summary lines.
 */
/* procs found in ORTE_PROC_STATE_ABORTED */
68 static int num_aborted = 0;
/* procs found in ORTE_PROC_STATE_ABORTED_BY_SIG or
 * ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED */
69 static int num_killed = 0;
/* procs found in ORTE_PROC_STATE_FAILED_TO_START or
 * ORTE_PROC_STATE_FAILED_TO_LAUNCH */
70 static int num_failed_start = 0;
/* set to true by orte_quit() just before it dumps the aborted procs;
 * orte_quit() skips the report when this is already true */
71 static bool errors_reported = false;
72
/* defined at the bottom of this file; forward-declared for orte_quit() */
73 static void dump_aborted_procs(void);
74
75 void orte_quit(int fd, short args, void *cbdata)
76 {
77 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
78
79 ORTE_ACQUIRE_OBJECT(caddy);
80
81
82 if (NULL != caddy) {
83 OBJ_RELEASE(caddy);
84 }
85
86
87 if (opal_atomic_trylock(&orte_quit_lock)) {
88 return;
89 }
90
91
92
93
94 if (ORTE_PROC_IS_HNP && !errors_reported) {
95 if (0 != orte_exit_status && !orte_execute_quiet) {
96 errors_reported = true;
97
98 dump_aborted_procs();
99
100
101 if (num_failed_start > 1) {
102 if (orte_xml_output) {
103 fprintf(orte_xml_fp, "<stderr>");
104 }
105 fprintf(orte_xml_fp, "%d total process%s failed to start",
106 num_failed_start, ((num_failed_start > 1) ? "es" : ""));
107 if (orte_xml_output) {
108 fprintf(orte_xml_fp, "
</stderr>");
109 }
110 fprintf(orte_xml_fp, "\n");
111 }
112 if (num_aborted > 1) {
113 if (orte_xml_output) {
114 fprintf(orte_xml_fp, "<stderr>");
115 }
116 fprintf(orte_xml_fp, "%d total process%s aborted",
117 num_aborted, ((num_aborted > 1) ? "es" : ""));
118 if (orte_xml_output) {
119 fprintf(orte_xml_fp, "
</stderr>");
120 }
121 fprintf(orte_xml_fp, "\n");
122 }
123 if (num_killed > 1) {
124 if (orte_xml_output) {
125 fprintf(orte_xml_fp, "<stderr>");
126 }
127 fprintf(orte_xml_fp, "%d total process%s killed (some possibly by %s during cleanup)",
128 num_killed, ((num_killed > 1) ? "es" : ""), orte_basename);
129 if (orte_xml_output) {
130 fprintf(orte_xml_fp, "
</stderr>");
131 }
132 fprintf(orte_xml_fp, "\n");
133 }
134 }
135 }
136
137
138
139
140 orte_event_base_active = false;
141 ORTE_POST_OBJECT(orte_event_base_active);
142
143 opal_event_base_loopbreak(orte_event_base);
144 }
145
146 int orte_print_aborted_job(orte_job_t *job,
147 orte_app_context_t *approc,
148 orte_proc_t *proc,
149 orte_node_t *node)
150 {
151 if (ORTE_JOB_STATE_FAILED_TO_START == job->state ||
152 ORTE_JOB_STATE_FAILED_TO_LAUNCH == job->state) {
153 switch (proc->exit_code) {
154 case ORTE_ERR_SILENT:
155
156 break;
157 case ORTE_ERR_SYS_LIMITS_PIPES:
158 orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
159 orte_basename, node->name,
160 (unsigned long)proc->name.vpid);
161 break;
162 case ORTE_ERR_PIPE_SETUP_FAILURE:
163 orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
164 orte_basename, node->name,
165 (unsigned long)proc->name.vpid);
166 break;
167 case ORTE_ERR_SYS_LIMITS_CHILDREN:
168 orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
169 orte_basename, node->name,
170 (unsigned long)proc->name.vpid);
171 break;
172 case ORTE_ERR_FAILED_GET_TERM_ATTRS:
173 orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
174 orte_basename, node->name,
175 (unsigned long)proc->name.vpid);
176 break;
177 case ORTE_ERR_WDIR_NOT_FOUND:
178 orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
179 orte_basename, approc->cwd,
180 node->name, (unsigned long)proc->name.vpid);
181 break;
182 case ORTE_ERR_EXE_NOT_FOUND:
183 orte_show_help("help-orterun.txt", "orterun:exe-not-found", true,
184 orte_basename,
185 (unsigned long)proc->name.vpid,
186 orte_basename,
187 orte_basename,
188 node->name,
189 approc->app);
190 break;
191 case ORTE_ERR_EXE_NOT_ACCESSIBLE:
192 orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
193 orte_basename, approc->app, node->name,
194 (unsigned long)proc->name.vpid);
195 break;
196 case ORTE_ERR_MULTIPLE_AFFINITIES:
197 orte_show_help("help-orterun.txt",
198 "orterun:multiple-paffinity-schemes", true, NULL);
199 break;
200 case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
201 orte_show_help("help-orterun.txt",
202 "orterun:topo-not-supported",
203 true, orte_process_info.nodename, "rankfile containing a slot_list of ",
204 NULL, approc->app);
205 break;
206 case ORTE_ERR_INVALID_NODE_RANK:
207 orte_show_help("help-orterun.txt",
208 "orterun:invalid-node-rank", true);
209 break;
210 case ORTE_ERR_INVALID_LOCAL_RANK:
211 orte_show_help("help-orterun.txt",
212 "orterun:invalid-local-rank", true);
213 break;
214 case ORTE_ERR_NOT_ENOUGH_CORES:
215 orte_show_help("help-orterun.txt",
216 "orterun:not-enough-resources", true,
217 "sockets", node->name,
218 "bind-to-core", approc->app);
219 break;
220 case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED:
221 orte_show_help("help-orterun.txt",
222 "orterun:topo-not-supported",
223 true, node->name, "bind-to-core", "",
224 approc->app);
225 break;
226 case ORTE_ERR_INVALID_PHYS_CPU:
227 orte_show_help("help-orterun.txt",
228 "orterun:invalid-phys-cpu", true);
229 break;
230 case ORTE_ERR_NOT_ENOUGH_SOCKETS:
231 orte_show_help("help-orterun.txt",
232 "orterun:not-enough-resources", true,
233 "sockets", node->name,
234 "bind-to-socket", approc->app);
235 break;
236 case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED:
237 orte_show_help("help-orterun.txt",
238 "orterun:topo-not-supported",
239 true, node->name, "bind-to-socket", "",
240 approc->app);
241 break;
242 case ORTE_ERR_MODULE_NOT_FOUND:
243 orte_show_help("help-orterun.txt",
244 "orterun:paffinity-missing-module",
245 true, node->name);
246 break;
247 case ORTE_ERR_SLOT_LIST_RANGE:
248 orte_show_help("help-orterun.txt",
249 "orterun:invalid-slot-list-range",
250 true, node->name, NULL);
251 break;
252 case ORTE_ERR_PIPE_READ_FAILURE:
253 orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
254 orte_basename, node->name, (unsigned long)proc->name.vpid);
255 break;
256 case ORTE_ERR_SOCKET_NOT_AVAILABLE:
257 orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
258 orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
259 (unsigned long)proc->name.vpid);
260 break;
261
262 default:
263 if (0 != proc->exit_code) {
264 orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
265 orte_basename, proc->exit_code, ORTE_ERROR_NAME(proc->exit_code),
266 node->name, (unsigned long)proc->name.vpid);
267 } else {
268 orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
269 orte_basename, node->name);
270 }
271 return ORTE_SUCCESS;
272 }
273 } else if (ORTE_JOB_STATE_ABORTED == job->state) {
274 orte_show_help("help-orterun.txt", "orterun:proc-ordered-abort", true,
275 orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
276 node->name, orte_basename);
277 return ORTE_SUCCESS;
278 } else if (ORTE_JOB_STATE_ABORTED_BY_SIG == job->state) {
279 #ifdef HAVE_STRSIGNAL
280 if (NULL != strsignal(WTERMSIG(proc->exit_code))) {
281 orte_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", true,
282 orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
283 node->name, WTERMSIG(proc->exit_code),
284 strsignal(WTERMSIG(proc->exit_code)));
285 } else {
286 #endif
287 orte_show_help("help-orterun.txt", "orterun:proc-aborted", true,
288 orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
289 node->name, WTERMSIG(proc->exit_code));
290 #ifdef HAVE_STRSIGNAL
291 }
292 #endif
293 return ORTE_SUCCESS;
294 } else if (ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state) {
295 orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true,
296 orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
297 node->name, orte_basename, orte_basename);
298 return ORTE_SUCCESS;
299 } else if (ORTE_JOB_STATE_COMM_FAILED == job->state) {
300 orte_show_help("help-orterun.txt", "orterun:proc-comm-failed", true,
301 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
302 ORTE_NAME_PRINT(&proc->name), node->name);
303 return ORTE_SUCCESS;
304 } else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) {
305 switch (proc->exit_code) {
306 case ORTE_ERR_MEM_LIMIT_EXCEEDED:
307 orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true,
308 ORTE_NAME_PRINT(&proc->name), node->name);
309 break;
310 case ORTE_ERR_PROC_STALLED:
311 orte_show_help("help-orterun.txt", "orterun:proc-stalled", true);
312 break;
313
314 default:
315 orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true);
316 }
317 return ORTE_SUCCESS;
318 } else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) {
319 orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true,
320 orte_basename, ORTE_NAME_PRINT(&proc->name), node->name);
321 return ORTE_SUCCESS;
322 } else if (orte_abort_non_zero_exit &&
323 ORTE_JOB_STATE_NON_ZERO_TERM == job->state) {
324 orte_show_help("help-orterun.txt", "orterun:non-zero-exit", true,
325 orte_basename, ORTE_NAME_PRINT(&proc->name), proc->exit_code);
326 return ORTE_SUCCESS;
327 }
328
329
330 return ORTE_ERR_NOT_FOUND;
331 }
332
333
334
335
336
337
338 static void dump_aborted_procs(void)
339 {
340 orte_std_cntr_t n;
341 orte_job_t *job;
342 orte_std_cntr_t i;
343 orte_proc_t *proc, *pptr;
344 orte_app_context_t *approc;
345 orte_node_t *node;
346 uint32_t key;
347 void *nptr;
348
349
350 n = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&job, &nptr);
351 while (OPAL_SUCCESS == n) {
352 if (NULL == job || job->jobid == ORTE_PROC_MY_NAME->jobid) {
353 goto next;
354 }
355 if (ORTE_JOB_STATE_UNDEF != job->state &&
356 ORTE_JOB_STATE_INIT != job->state &&
357 ORTE_JOB_STATE_RUNNING != job->state &&
358 ORTE_JOB_STATE_TERMINATED != job->state &&
359 ORTE_JOB_STATE_ABORT_ORDERED != job->state) {
360
361
362 for (i=0; i < job->procs->size; i++) {
363 if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(job->procs, i))) {
364
365 break;
366 }
367 if (ORTE_PROC_STATE_FAILED_TO_START == pptr->state ||
368 ORTE_PROC_STATE_FAILED_TO_LAUNCH == pptr->state) {
369 ++num_failed_start;
370 } else if (ORTE_PROC_STATE_ABORTED == pptr->state) {
371 ++num_aborted;
372 } else if (ORTE_PROC_STATE_ABORTED_BY_SIG == pptr->state) {
373 ++num_killed;
374 } else if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == pptr->state) {
375 ++num_killed;
376 }
377 }
378
379
380 proc = NULL;
381 if (!orte_get_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC, (void**)&proc, OPAL_PTR) ||
382 NULL == proc) {
383 goto next;
384 }
385
386 approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx);
387 node = proc->node;
388 if (ORTE_SUCCESS == orte_print_aborted_job(job, approc, proc, node)) {
389 break;
390 }
391 }
392 next:
393 n = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&job, nptr, &nptr);
394 }
395 }