root/ompi/errhandler/errhandler_predefined.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. ompi_mpi_errors_are_fatal_comm_handler
  2. ompi_mpi_errors_are_fatal_file_handler
  3. ompi_mpi_errors_are_fatal_win_handler
  4. ompi_mpi_errors_return_comm_handler
  5. ompi_mpi_errors_return_file_handler
  6. ompi_mpi_errors_return_win_handler
  7. out
  8. backend_fatal_aggregate
  9. backend_fatal_no_aggregate
  10. backend_fatal

   1 /*
   2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2014 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2006      University of Houston. All rights reserved.
  13  * Copyright (c) 2008-2018 Cisco Systems, Inc.  All rights reserved
  14  * Copyright (c) 2009      Sun Microsystems, Inc.  All rights reserved.
  15  * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
  16  * Copyright (c) 2012      Los Alamos National Security, LLC.
  17  *                         All rights reserved.
  18  * Copyright (c) 2016      Intel, Inc.  All rights reserved.
  19  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
  20  * $COPYRIGHT$
  21  *
  22  * Additional copyrights may follow
  23  *
  24  * $HEADER$
  25  */
  26 
  27 #include "ompi_config.h"
  28 #include <stdlib.h>
  29 #include <stdarg.h>
  30 #ifdef HAVE_UNISTD_H
  31 #include <unistd.h>
  32 #endif
  33 #ifdef HAVE_SYS_PARAM_H
  34 #include <sys/param.h>
  35 #endif
  36 
  37 #include "opal/util/show_help.h"
  38 #include "ompi/mca/rte/rte.h"
  39 #include "ompi/errhandler/errhandler_predefined.h"
  40 #include "ompi/errhandler/errcode.h"
  41 #include "ompi/communicator/communicator.h"
  42 #include "ompi/file/file.h"
  43 #include "ompi/win/win.h"
  44 #include "opal/util/printf.h"
  45 #include "opal/util/output.h"
  46 
  47 /*
  48  * Local functions
  49  */
  50 static void backend_fatal(char *type, struct ompi_communicator_t *comm,
  51                           char *name, int *error_code, va_list arglist);
  52 static void out(char *str, char *arg);
  53 
  54 
  55 void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
  56                                             int *error_code, ...)
  57 {
  58   char *name;
  59   struct ompi_communicator_t *abort_comm;
  60   va_list arglist;
  61 
  62   va_start(arglist, error_code);
  63 
  64   if (NULL != comm) {
  65       name = (*comm)->c_name;
  66       abort_comm = *comm;
  67   } else {
  68       name = NULL;
  69       abort_comm = NULL;
  70   }
  71   backend_fatal("communicator", abort_comm, name, error_code, arglist);
  72   va_end(arglist);
  73 }
  74 
  75 
  76 void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
  77                                             int *error_code, ...)
  78 {
  79   char *name;
  80   struct ompi_communicator_t *abort_comm;
  81   va_list arglist;
  82 
  83   va_start(arglist, error_code);
  84 
  85   if (NULL != file) {
  86       name = (*file)->f_filename;
  87       abort_comm = (*file)->f_comm;
  88   } else {
  89       name = NULL;
  90       abort_comm = NULL;
  91   }
  92   backend_fatal("file", abort_comm, name, error_code, arglist);
  93   va_end(arglist);
  94 }
  95 
  96 
  97 void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
  98                                            int *error_code, ...)
  99 {
 100   char *name;
 101   struct ompi_communicator_t *abort_comm = NULL;
 102   va_list arglist;
 103 
 104   va_start(arglist, error_code);
 105 
 106   if (NULL != win) {
 107       name = (*win)->w_name;
 108   } else {
 109       name = NULL;
 110   }
 111   backend_fatal("win", abort_comm, name, error_code, arglist);
 112   va_end(arglist);
 113 }
 114 
 115 void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,
 116                                          int *error_code, ...)
 117 {
 118     /* Don't need anything more -- just need this function to exist */
 119     /* Silence some compiler warnings */
 120 
 121     va_list arglist;
 122     va_start(arglist, error_code);
 123     va_end(arglist);
 124 }
 125 
 126 
 127 void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,
 128                                          int *error_code, ...)
 129 {
 130     /* Don't need anything more -- just need this function to exist */
 131     /* Silence some compiler warnings */
 132 
 133     va_list arglist;
 134     va_start(arglist, error_code);
 135     va_end(arglist);
 136 }
 137 
 138 
 139 void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,
 140                                         int *error_code, ...)
 141 {
 142     /* Don't need anything more -- just need this function to exist */
 143     /* Silence some compiler warnings */
 144 
 145     va_list arglist;
 146     va_start(arglist, error_code);
 147     va_end(arglist);
 148 }
 149 
 150 
 151 static void out(char *str, char *arg)
 152 {
 153     if (ompi_rte_initialized &&
 154         ompi_mpi_state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
 155         if (NULL != arg) {
 156             opal_output(0, str, arg);
 157         } else {
 158             opal_output(0, "%s", str);
 159         }
 160     } else {
 161         if (NULL != arg) {
 162             fprintf(stderr, str, arg);
 163         } else {
 164             fprintf(stderr, "%s", str);
 165         }
 166     }
 167 }
 168 
 169 /*
 170  * Use opal_show_help() to aggregate the error messages (i.e., show it
 171  * once rather than N times).
 172  *
 173  * Note that this function will only be invoked for errors during the
 174  * MPI application (i.e., after MPI_INIT and before MPI_FINALIZE).  So
 175  * there's no need to handle the pre-MPI_INIT and post-MPI_FINALIZE
 176  * errors here.
 177  */
 178 static void backend_fatal_aggregate(char *type,
 179                                     struct ompi_communicator_t *comm,
 180                                     char *name, int *error_code,
 181                                     va_list arglist)
 182 {
 183     char *arg = NULL, *prefix = NULL, *err_msg = NULL;
 184     const char* const unknown_error_code = "Error code: %d (no associated error message)";
 185     const char* const unknown_error = "Unknown error";
 186     const char* const unknown_prefix = "[?:?]";
 187     bool generated = false;
 188 
 189     // these do not own what they point to; they're
 190     // here to avoid repeating expressions such as
 191     // (NULL == foo) ? unknown_foo : foo
 192     const char* usable_prefix = unknown_prefix;
 193     const char* usable_err_msg = unknown_error;
 194 
 195     arg = va_arg(arglist, char*);
 196     va_end(arglist);
 197 
 198     if (opal_asprintf(&prefix, "[%s:%05d]",
 199                  ompi_process_info.nodename,
 200                  (int) ompi_process_info.pid) == -1) {
 201         prefix = NULL;
 202         // non-fatal, we could still go on to give useful information here...
 203         opal_output(0, "%s", "Could not write node and PID to prefix");
 204         opal_output(0, "Node: %s", ompi_process_info.nodename);
 205         opal_output(0, "PID: %d", (int) ompi_process_info.pid);
 206     }
 207 
 208     if (NULL != error_code) {
 209         err_msg = ompi_mpi_errnum_get_string(*error_code);
 210         if (NULL == err_msg) {
 211             if (opal_asprintf(&err_msg, unknown_error_code,
 212                          *error_code) == -1) {
 213                 err_msg = NULL;
 214                 opal_output(0, "%s", "Could not write to err_msg");
 215                 opal_output(0, unknown_error_code, *error_code);
 216             } else {
 217                 generated = true;
 218             }
 219         }
 220     }
 221 
 222     usable_prefix  = (NULL == prefix)  ? unknown_prefix : prefix;
 223     usable_err_msg = (NULL == err_msg) ? unknown_error  : err_msg;
 224 
 225     if (NULL != name) {
 226         opal_show_help("help-mpi-errors.txt",
 227                        "mpi_errors_are_fatal",
 228                        false,
 229                        usable_prefix,
 230                        (NULL == arg) ? "" : "in",
 231                        (NULL == arg) ? "" : arg,
 232                        usable_prefix,
 233                        OMPI_PROC_MY_NAME->jobid,
 234                        OMPI_PROC_MY_NAME->vpid,
 235                        usable_prefix,
 236                        type,
 237                        name,
 238                        usable_prefix,
 239                        usable_err_msg,
 240                        usable_prefix,
 241                        type,
 242                        usable_prefix);
 243     } else {
 244         opal_show_help("help-mpi-errors.txt",
 245                        "mpi_errors_are_fatal unknown handle",
 246                        false,
 247                        usable_prefix,
 248                        (NULL == arg) ? "" : "in",
 249                        (NULL == arg) ? "" : arg,
 250                        usable_prefix,
 251                        OMPI_PROC_MY_NAME->jobid,
 252                        OMPI_PROC_MY_NAME->vpid,
 253                        usable_prefix,
 254                        type,
 255                        usable_prefix,
 256                        usable_err_msg,
 257                        usable_prefix,
 258                        type,
 259                        usable_prefix);
 260     }
 261 
 262     free(prefix);
 263     if (generated) {
 264         free(err_msg);
 265     }
 266 }
 267 
 268 /*
 269  * Note that this function has to handle pre-MPI_INIT and
 270  * post-MPI_FINALIZE errors, which backend_fatal_aggregate() does not
 271  * have to handle.
 272  *
 273  * This function also intentionally does not call malloc(), just in
 274  * case we're being called due to some kind of stack/memory error --
 275  * we *might* be able to get a message out if we're not further
 276  * corrupting the stack by calling malloc()...
 277  */
 278 static void backend_fatal_no_aggregate(char *type,
 279                                        struct ompi_communicator_t *comm,
 280                                        char *name, int *error_code,
 281                                        va_list arglist)
 282 {
 283     char *arg;
 284 
 285     int32_t state = ompi_mpi_state;
 286     assert(state < OMPI_MPI_STATE_INIT_COMPLETED ||
 287            state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
 288 
 289     fflush(stdout);
 290     fflush(stderr);
 291 
 292     arg = va_arg(arglist, char*);
 293 
 294     /* Per #2152, print out in plain english if something was invoked
 295        before MPI_INIT* or after MPI_FINALIZE */
 296     if (state < OMPI_MPI_STATE_INIT_STARTED) {
 297         if (NULL != arg) {
 298             out("*** The %s() function was called before MPI_INIT was invoked.\n"
 299                 "*** This is disallowed by the MPI standard.\n", arg);
 300         } else {
 301             out("*** An MPI function was called before MPI_INIT was invoked.\n"
 302                 "*** This is disallowed by the MPI standard.\n"
 303                 "*** Unfortunately, no further information is available on *which* MPI\n"
 304                 "*** function was invoked, sorry.  :-(\n", NULL);
 305         }
 306         out("*** Your MPI job will now abort.\n", NULL);
 307     } else if (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
 308         if (NULL != arg) {
 309             out("*** The %s() function was called after MPI_FINALIZE was invoked.\n"
 310                 "*** This is disallowed by the MPI standard.\n", arg);
 311         } else {
 312             out("*** An MPI function was called after MPI_FINALIZE was invoked.\n"
 313                 "*** This is disallowed by the MPI standard.\n"
 314                 "*** Unfortunately, no further information is available on *which* MPI\n"
 315                 "*** function was invoked, sorry.  :-(\n", NULL);
 316         }
 317         out("*** Your MPI job will now abort.\n", NULL);
 318     }
 319 
 320     else {
 321         int len;
 322         char str[MPI_MAX_PROCESSOR_NAME * 2];
 323 
 324         /* THESE MESSAGES ARE COORDINATED WITH FIXED STRINGS IN
 325            help-mpi-errors.txt!  Do not change these messages without
 326            also changing help-mpi-errors.txt! */
 327 
 328         /* This is after MPI_INIT* and before MPI_FINALIZE, so print
 329            the error message normally */
 330         if (NULL != arg) {
 331             out("*** An error occurred in %s\n", arg);
 332         } else {
 333             out("*** An error occurred\n", NULL);
 334         }
 335 
 336         if (NULL != name) {
 337             /* Don't use opal_asprintf() here because there may be stack /
 338                heap corruption by the time we're invoked, so just do
 339                it on the stack */
 340             str[0] = '\0';
 341             len = sizeof(str) - 1;
 342             strncat(str, type, len);
 343 
 344             len -= strlen(type);
 345             if (len > 0) {
 346                 strncat(str, " ", len);
 347 
 348                 --len;
 349                 if (len > 0) {
 350                     strncat(str, name, len);
 351                 }
 352             }
 353             out("*** on %s", str);
 354         } else if (NULL == name) {
 355             out("*** on a NULL %s\n", type);
 356         }
 357 
 358         if (NULL != error_code) {
 359             char *tmp = ompi_mpi_errnum_get_string(*error_code);
 360             if (NULL != tmp) {
 361                 out("*** %s\n", tmp);
 362             } else {
 363                 char intbuf[32];
 364                 snprintf(intbuf, 32, "%d", *error_code);
 365                 out("*** Error code: %d (no associated error message)\n", intbuf);
 366             }
 367         }
 368         /* out("*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort\n", NULL); */
 369         out("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n", type);
 370         out("***    and potentially your MPI job)\n", NULL);
 371 
 372     }
 373     va_end(arglist);
 374 }
 375 
 376 static void backend_fatal(char *type, struct ompi_communicator_t *comm,
 377                           char *name, int *error_code,
 378                           va_list arglist)
 379 {
 380     /* We only want aggregation while the rte is initialized */
 381     if (ompi_rte_initialized) {
 382         backend_fatal_aggregate(type, comm, name, error_code, arglist);
 383     } else {
 384         backend_fatal_no_aggregate(type, comm, name, error_code, arglist);
 385     }
 386 
 387     /* In most instances the communicator will be valid. If not, we are either early in
 388      * the initialization or we are dealing with a window. Thus, it is good enough to abort
 389      * on MPI_COMM_SELF, the error will propagate.
 390      */
 391     if (comm == NULL) {
 392         comm = &ompi_mpi_comm_self.comm;
 393     }
 394 
 395     if (NULL != error_code) {
 396         ompi_mpi_abort(comm, *error_code);
 397     } else {
 398         ompi_mpi_abort(comm, 1);
 399     }
 400 }

/* [<][>][^][v][top][bottom][index][help] */