root/orte/runtime/orte_quit.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. orte_quit
  2. orte_print_aborted_job
  3. dump_aborted_procs

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2008 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2006-2013 Cisco Systems, Inc.  All rights reserved.
  14  * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
  15  * Copyright (c) 2007-2015 Los Alamos National Security, LLC.  All rights
  16  *                         reserved.
  17  * Copyright (c) 2012      Oak Ridge National Labs.  All rights reserved.
  18  * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
  19  * $COPYRIGHT$
  20  *
  21  * Additional copyrights may follow
  22  *
  23  * $HEADER$
  24  */
  25 
  26 #include "orte_config.h"
  27 #include "orte/constants.h"
  28 
  29 #include <string.h>
  30 #include <stdio.h>
  31 #ifdef HAVE_UNISTD_H
  32 #include <unistd.h>
  33 #endif
  34 #ifdef HAVE_SYS_PARAM_H
  35 #include <sys/param.h>
  36 #endif
  37 #include <errno.h>
  38 #include <signal.h>
  39 #include <ctype.h>
  40 #ifdef HAVE_SYS_TYPES_H
  41 #include <sys/types.h>
  42 #endif  /* HAVE_SYS_TYPES_H */
  43 #ifdef HAVE_SYS_WAIT_H
  44 #include <sys/wait.h>
  45 #endif  /* HAVE_SYS_WAIT_H */
  46 #ifdef HAVE_SYS_TIME_H
  47 #include <sys/time.h>
  48 #endif  /* HAVE_SYS_TIME_H */
  49 
  50 #include "orte/mca/plm/plm.h"
  51 #include "orte/mca/errmgr/errmgr.h"
  52 #include "orte/mca/routed/routed.h"
  53 #include "orte/mca/state/state.h"
  54 
  55 #include "orte/util/session_dir.h"
  56 #include "orte/util/show_help.h"
  57 #include "orte/util/threads.h"
  58 
  59 #include "orte/runtime/runtime.h"
  60 #include "orte/runtime/orte_globals.h"
  61 #include "orte/runtime/orte_quit.h"
  62 #include "orte/runtime/orte_locks.h"
  63 #include "orte/runtime/orte_data_server.h"
  64 
/*
 * Globals
 *
 * The three counters are filled in by dump_aborted_procs() and read by
 * orte_quit() to decide whether a "N total processes ..." summary line
 * should be printed.
 */
/* procs found in ORTE_PROC_STATE_ABORTED */
static int num_aborted = 0;
/* procs found in ORTE_PROC_STATE_ABORTED_BY_SIG or
 * ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED */
static int num_killed = 0;
/* procs found in ORTE_PROC_STATE_FAILED_TO_START or
 * ORTE_PROC_STATE_FAILED_TO_LAUNCH */
static int num_failed_start = 0;
/* set by orte_quit() once it has reported errors, so the report
 * is not repeated */
static bool errors_reported = false;

/* forward declaration - defined at the bottom of this file */
static void dump_aborted_procs(void);
  74 
  75 void orte_quit(int fd, short args, void *cbdata)
  76 {
  77     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
  78 
  79     ORTE_ACQUIRE_OBJECT(caddy);
  80 
  81     /* cleanup */
  82     if (NULL != caddy) {
  83         OBJ_RELEASE(caddy);
  84     }
  85 
  86     /* check one-time lock to protect against "bounce" */
  87     if (opal_atomic_trylock(&orte_quit_lock)) { /* returns 1 if already locked */
  88         return;
  89     }
  90 
  91     /* if we are the hnp and haven't already reported it, then
  92      * report any errors
  93      */
  94     if (ORTE_PROC_IS_HNP && !errors_reported) {
  95         if (0 != orte_exit_status && !orte_execute_quiet) {
  96             errors_reported = true;
  97             /* abnormal termination of some kind */
  98             dump_aborted_procs();
  99             /* If we showed more abort messages than were allowed,
 100                show a followup message here */
 101             if (num_failed_start > 1) {
 102                 if (orte_xml_output) {
 103                     fprintf(orte_xml_fp, "<stderr>");
 104                 }
 105                 fprintf(orte_xml_fp, "%d total process%s failed to start",
 106                         num_failed_start, ((num_failed_start > 1) ? "es" : ""));
 107                 if (orte_xml_output) {
 108                     fprintf(orte_xml_fp, "&#010;</stderr>");
 109                 }
 110                 fprintf(orte_xml_fp, "\n");
 111             }
 112             if (num_aborted > 1) {
 113                 if (orte_xml_output) {
 114                     fprintf(orte_xml_fp, "<stderr>");
 115                 }
 116                 fprintf(orte_xml_fp, "%d total process%s aborted",
 117                         num_aborted, ((num_aborted > 1) ? "es" : ""));
 118                 if (orte_xml_output) {
 119                     fprintf(orte_xml_fp, "&#010;</stderr>");
 120                 }
 121                 fprintf(orte_xml_fp, "\n");
 122             }
 123             if (num_killed > 1) {
 124                 if (orte_xml_output) {
 125                     fprintf(orte_xml_fp, "<stderr>");
 126                 }
 127                 fprintf(orte_xml_fp, "%d total process%s killed (some possibly by %s during cleanup)",
 128                         num_killed, ((num_killed > 1) ? "es" : ""), orte_basename);
 129                 if (orte_xml_output) {
 130                     fprintf(orte_xml_fp, "&#010;</stderr>");
 131                 }
 132                 fprintf(orte_xml_fp, "\n");
 133             }
 134         }
 135     }
 136 
 137     /* flag that the event lib should no longer be looped
 138      * so we will exit
 139      */
 140     orte_event_base_active = false;
 141     ORTE_POST_OBJECT(orte_event_base_active);
 142     /* break out of the event loop */
 143     opal_event_base_loopbreak(orte_event_base);
 144 }
 145 
 146 int orte_print_aborted_job(orte_job_t *job,
 147                            orte_app_context_t *approc,
 148                            orte_proc_t *proc,
 149                            orte_node_t *node)
 150 {
 151     if (ORTE_JOB_STATE_FAILED_TO_START == job->state ||
 152         ORTE_JOB_STATE_FAILED_TO_LAUNCH == job->state) {
 153         switch (proc->exit_code) {
 154         case ORTE_ERR_SILENT:
 155             /* say nothing - it was already reported */
 156             break;
 157         case ORTE_ERR_SYS_LIMITS_PIPES:
 158             orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
 159                            orte_basename, node->name,
 160                            (unsigned long)proc->name.vpid);
 161             break;
 162         case ORTE_ERR_PIPE_SETUP_FAILURE:
 163             orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
 164                            orte_basename, node->name,
 165                            (unsigned long)proc->name.vpid);
 166             break;
 167         case ORTE_ERR_SYS_LIMITS_CHILDREN:
 168             orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
 169                            orte_basename, node->name,
 170                            (unsigned long)proc->name.vpid);
 171             break;
 172         case ORTE_ERR_FAILED_GET_TERM_ATTRS:
 173             orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
 174                            orte_basename, node->name,
 175                            (unsigned long)proc->name.vpid);
 176             break;
 177         case ORTE_ERR_WDIR_NOT_FOUND:
 178             orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
 179                            orte_basename, approc->cwd,
 180                            node->name, (unsigned long)proc->name.vpid);
 181             break;
 182         case ORTE_ERR_EXE_NOT_FOUND:
 183             orte_show_help("help-orterun.txt", "orterun:exe-not-found", true,
 184                            orte_basename,
 185                            (unsigned long)proc->name.vpid,
 186                            orte_basename,
 187                            orte_basename,
 188                            node->name,
 189                            approc->app);
 190             break;
 191         case ORTE_ERR_EXE_NOT_ACCESSIBLE:
 192             orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
 193                            orte_basename, approc->app, node->name,
 194                            (unsigned long)proc->name.vpid);
 195             break;
 196         case ORTE_ERR_MULTIPLE_AFFINITIES:
 197             orte_show_help("help-orterun.txt",
 198                            "orterun:multiple-paffinity-schemes", true, NULL);
 199             break;
 200         case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
 201             orte_show_help("help-orterun.txt",
 202                            "orterun:topo-not-supported",
 203                            true, orte_process_info.nodename, "rankfile containing a slot_list of ",
 204                            NULL, approc->app);
 205             break;
 206         case ORTE_ERR_INVALID_NODE_RANK:
 207             orte_show_help("help-orterun.txt",
 208                            "orterun:invalid-node-rank", true);
 209             break;
 210         case ORTE_ERR_INVALID_LOCAL_RANK:
 211             orte_show_help("help-orterun.txt",
 212                            "orterun:invalid-local-rank", true);
 213             break;
 214         case ORTE_ERR_NOT_ENOUGH_CORES:
 215             orte_show_help("help-orterun.txt",
 216                            "orterun:not-enough-resources", true,
 217                            "sockets", node->name,
 218                            "bind-to-core", approc->app);
 219             break;
 220         case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED:
 221             orte_show_help("help-orterun.txt",
 222                            "orterun:topo-not-supported",
 223                            true, node->name, "bind-to-core", "",
 224                            approc->app);
 225             break;
 226         case ORTE_ERR_INVALID_PHYS_CPU:
 227             orte_show_help("help-orterun.txt",
 228                            "orterun:invalid-phys-cpu", true);
 229             break;
 230         case ORTE_ERR_NOT_ENOUGH_SOCKETS:
 231             orte_show_help("help-orterun.txt",
 232                            "orterun:not-enough-resources", true,
 233                            "sockets", node->name,
 234                            "bind-to-socket", approc->app);
 235             break;
 236         case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED:
 237             orte_show_help("help-orterun.txt",
 238                            "orterun:topo-not-supported",
 239                            true, node->name, "bind-to-socket", "",
 240                            approc->app);
 241             break;
 242         case ORTE_ERR_MODULE_NOT_FOUND:
 243             orte_show_help("help-orterun.txt",
 244                            "orterun:paffinity-missing-module",
 245                            true, node->name);
 246             break;
 247         case ORTE_ERR_SLOT_LIST_RANGE:
 248             orte_show_help("help-orterun.txt",
 249                            "orterun:invalid-slot-list-range",
 250                            true, node->name, NULL);
 251             break;
 252         case ORTE_ERR_PIPE_READ_FAILURE:
 253             orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
 254                            orte_basename, node->name, (unsigned long)proc->name.vpid);
 255             break;
 256         case ORTE_ERR_SOCKET_NOT_AVAILABLE:
 257             orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
 258                            orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
 259                            (unsigned long)proc->name.vpid);
 260             break;
 261 
 262         default:
 263             if (0 != proc->exit_code) {
 264                 orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
 265                                orte_basename, proc->exit_code, ORTE_ERROR_NAME(proc->exit_code),
 266                                node->name, (unsigned long)proc->name.vpid);
 267             } else {
 268                 orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
 269                                orte_basename, node->name);
 270             }
 271         return ORTE_SUCCESS;
 272         }
 273     } else if (ORTE_JOB_STATE_ABORTED == job->state) {
 274         orte_show_help("help-orterun.txt", "orterun:proc-ordered-abort", true,
 275                        orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
 276                        node->name, orte_basename);
 277         return ORTE_SUCCESS;
 278     } else if (ORTE_JOB_STATE_ABORTED_BY_SIG == job->state) {  /* aborted by signal */
 279 #ifdef HAVE_STRSIGNAL
 280         if (NULL != strsignal(WTERMSIG(proc->exit_code))) {
 281             orte_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", true,
 282                            orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
 283                            node->name, WTERMSIG(proc->exit_code),
 284                            strsignal(WTERMSIG(proc->exit_code)));
 285         } else {
 286 #endif
 287             orte_show_help("help-orterun.txt", "orterun:proc-aborted", true,
 288                            orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
 289                            node->name, WTERMSIG(proc->exit_code));
 290 #ifdef HAVE_STRSIGNAL
 291         }
 292 #endif
 293         return ORTE_SUCCESS;
 294     } else if (ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state) { /* proc exited w/o finalize */
 295         orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true,
 296                        orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
 297                        node->name, orte_basename, orte_basename);
 298         return ORTE_SUCCESS;
 299     } else if (ORTE_JOB_STATE_COMM_FAILED == job->state) {
 300         orte_show_help("help-orterun.txt", "orterun:proc-comm-failed", true,
 301                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 302                        ORTE_NAME_PRINT(&proc->name), node->name);
 303         return ORTE_SUCCESS;
 304     } else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) {
 305         switch (proc->exit_code) {
 306         case ORTE_ERR_MEM_LIMIT_EXCEEDED:
 307             orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true,
 308                            ORTE_NAME_PRINT(&proc->name), node->name);
 309             break;
 310         case ORTE_ERR_PROC_STALLED:
 311             orte_show_help("help-orterun.txt", "orterun:proc-stalled", true);
 312             break;
 313 
 314         default:
 315             orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true);
 316         }
 317         return ORTE_SUCCESS;
 318     } else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) {
 319         orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true,
 320                        orte_basename, ORTE_NAME_PRINT(&proc->name), node->name);
 321         return ORTE_SUCCESS;
 322     } else if (orte_abort_non_zero_exit &&
 323                ORTE_JOB_STATE_NON_ZERO_TERM == job->state) {
 324         orte_show_help("help-orterun.txt", "orterun:non-zero-exit", true,
 325                        orte_basename, ORTE_NAME_PRINT(&proc->name), proc->exit_code);
 326         return ORTE_SUCCESS;
 327     }
 328 
 329     /* nothing here */
 330     return ORTE_ERR_NOT_FOUND;
 331 }
 332 
 333 /*
 334  * On abnormal termination - dump the
 335  * exit status of the aborted procs.
 336  */
 337 
 338 static void dump_aborted_procs(void)
 339 {
 340     orte_std_cntr_t n;
 341     orte_job_t *job;
 342     orte_std_cntr_t i;
 343     orte_proc_t *proc, *pptr;
 344     orte_app_context_t *approc;
 345     orte_node_t *node;
 346     uint32_t key;
 347     void *nptr;
 348 
 349     /* find the job that caused the problem */
 350     n = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&job, &nptr);
 351     while (OPAL_SUCCESS == n) {
 352         if (NULL == job || job->jobid == ORTE_PROC_MY_NAME->jobid) {
 353             goto next;
 354         }
 355         if (ORTE_JOB_STATE_UNDEF != job->state &&
 356             ORTE_JOB_STATE_INIT != job->state &&
 357             ORTE_JOB_STATE_RUNNING != job->state &&
 358             ORTE_JOB_STATE_TERMINATED != job->state &&
 359             ORTE_JOB_STATE_ABORT_ORDERED != job->state) {
 360 
 361             /* cycle through and count the number that were killed or aborted */
 362             for (i=0; i < job->procs->size; i++) {
 363                 if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(job->procs, i))) {
 364                     /* array is left-justfied - we are done */
 365                     break;
 366                 }
 367                 if (ORTE_PROC_STATE_FAILED_TO_START == pptr->state ||
 368                     ORTE_PROC_STATE_FAILED_TO_LAUNCH == pptr->state) {
 369                     ++num_failed_start;
 370                 } else if (ORTE_PROC_STATE_ABORTED == pptr->state) {
 371                     ++num_aborted;
 372                 } else if (ORTE_PROC_STATE_ABORTED_BY_SIG == pptr->state) {
 373                     ++num_killed;
 374                 } else if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == pptr->state) {
 375                     ++num_killed;
 376                 }
 377             }
 378 
 379             /* see if there is a guilty party */
 380             proc = NULL;
 381             if (!orte_get_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC, (void**)&proc, OPAL_PTR) ||
 382                 NULL == proc) {
 383                 goto next;
 384             }
 385 
 386             approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx);
 387             node = proc->node;
 388             if (ORTE_SUCCESS == orte_print_aborted_job(job, approc, proc, node)) {
 389                 break;
 390             }
 391         }
 392       next:
 393         n = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&job, nptr, &nptr);
 394     }
 395 }

/* [<][>][^][v][top][bottom][index][help] */