root/orte/mca/errmgr/errmgr.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes following definitions.
  1. __opal_attribute_format_funcptr__

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2011 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
  14  * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
  15  * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
  16  *                         reserved.
  17  * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
  18  * Copyright (c) 2014      NVIDIA Corporation.  All rights reserved.
  19  * $COPYRIGHT$
  20  *
  21  * Additional copyrights may follow
  22  *
  23  * $HEADER$
  24  */
  25 /** @file:
  26  *
  27  * The Open RTE Error and Recovery Manager (ErrMgr)
  28  *
  29  * This framework is the logically central clearing house for process/daemon
  30  * state updates. In particular when a process fails and another process detects
  31  * it, then that information is reported through this framework. This framework
  32  * then (depending on the active component) decides how to handle the failure.
  33  *
  34  * For example, if a process fails this may activate an automatic recovery
  35  * of the process from a previous checkpoint, or initial state. Conversely,
  36  * the active component could decide not to continue the job, and request that
  37  * it be terminated. The error and recovery policy is determined by individual
  38  * components within this framework.
  39  *
  40  */
  41 
  42 #ifndef ORTE_MCA_ERRMGR_H
  43 #define ORTE_MCA_ERRMGR_H
  44 
  45 /*
  46  * includes
  47  */
  48 
  49 #include "orte_config.h"
  50 #include "orte/constants.h"
  51 #include "orte/types.h"
  52 
  53 #include "orte/mca/mca.h"
  54 #include "opal/mca/base/base.h"
  55 
  56 #include "opal/class/opal_object.h"
  57 #include "opal/class/opal_pointer_array.h"
  58 #include "opal/util/output.h"
  59 #include "opal/util/error.h"
  60 
  61 #include "orte/runtime/orte_globals.h"
  62 #include "orte/mca/plm/plm_types.h"
  63 
  64 BEGIN_C_DECLS
  65 
  66 /*
  67  * Macro definitions
  68  */
  69 /*
  70  * These macros and associated error name array are used to output intelligible error
  71  * messages. ORTE_ERROR_NAME(n) expands to opal_strerror(n); ORTE_ERROR_LOG(n) calls
  72  * orte_errmgr.logfn with n and the call site's __FILE__ and __LINE__. Neither expansion
  73  * ends in ';', so "if (c) ORTE_ERROR_LOG(rc); else ..." parses; callers add the ';'. */
  74 #define ORTE_ERROR_NAME(n)  opal_strerror(n)
  75 #define ORTE_ERROR_LOG(n)                       \
  76         orte_errmgr.logfn(n, __FILE__, __LINE__)
  77 
  78 /*
  79  * Framework Interfaces
  80  */
  81 /**
  82  * Module initialization function.
  83  * Pointer type of the init entry in the module structure; takes no arguments.
  84  * @retval ORTE_SUCCESS The operation completed successfully
  85  * @retval ORTE_ERROR   An unspecified error occurred
  86  */
  87 typedef int (*orte_errmgr_base_module_init_fn_t)(void);
  88 
  89 /**
  90  * Module finalization function.
  91  * Pointer type of the finalize entry in the module structure; takes no arguments.
  92  * @retval ORTE_SUCCESS The operation completed successfully
  93  * @retval ORTE_ERROR   An unspecified error occurred
  94  */
  95 typedef int (*orte_errmgr_base_module_finalize_fn_t)(void);
  96 
  97 /** Error logging function: takes an error code, a file name and a line number.
  98  * ORTE_ERROR_LOG passes __FILE__ and __LINE__ of the call site for the last two.
  99  * This is not part of any module so it can be used at any time! */
 100 typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code, char *filename, int line);
 101 
 102 /**
 103  * Alert - self aborting
 104  * This function is called when a process is aborting due to some internal error.
 105  * It will finalize the process itself, and then exit - it takes no other actions.
 106  * The intent here is to provide a last-ditch exit procedure that attempts to clean
 107  * up a little. Takes an error code, then fmt and variadic arguments; the attribute
 108  * (__printf__, 2, 3) presumably marks fmt as a printf-style format - TODO confirm. */
 109 typedef void (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
 110 __opal_attribute_format_funcptr__(__printf__, 2, 3);
 111 
 112 /**
 113  * Alert - abort peers
 114  *  Called when a process wants to abort one or more peer processes. For example,
 115  *  MPI_Abort(comm) will use this function to terminate peers in the communicator
 116  *  group before aborting itself. Takes procs and num_procs (presumably the peers'
 117  *  names and their count - TODO confirm) and an error_code; returns an int status. */
 118 typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs,
 119                                                         orte_std_cntr_t num_procs,
 120                                                         int error_code);
 121 
 122 /*
 123  * Module Structure: the table of function pointers that makes up an errmgr module.
 124  * orte_errmgr, declared after it, is the instance that ORTE_ERROR_LOG calls into. */
 125 struct orte_errmgr_base_module_2_3_0_t {
 126     /** Initialization Function */
 127     orte_errmgr_base_module_init_fn_t                       init;
 128     /** Finalization Function */
 129     orte_errmgr_base_module_finalize_fn_t                   finalize;
 130
 131     orte_errmgr_base_module_log_fn_t                        logfn;       /**< Error logging function */
 132     orte_errmgr_base_module_abort_fn_t                      abort;       /**< Self-abort function */
 133     orte_errmgr_base_module_abort_peers_fn_t                abort_peers; /**< Abort-peers function */
 134 };
 135 typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t; /* versioned name */
 136 typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t; /* unversioned alias */
 137 ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr; /* declared here, defined elsewhere */
 138 
 139 /*
 140  * ErrMgr Component: the MCA base component and base data descriptors, followed by the
 141  * component's verbosity level, opal_output handle and default priority. */
 142 struct orte_errmgr_base_component_3_0_0_t {
 143     /** MCA base component */
 144     mca_base_component_t base_version;
 145     /** MCA base data */
 146     mca_base_component_data_t base_data;
 147
 148     /** Verbosity Level */
 149     int verbose;
 150     /** Output Handle for opal_output */
 151     int output_handle;
 152     /** Default Priority */
 153     int priority;
 154 };
 155 typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t; /* versioned name */
 156 typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t; /* unversioned alias */
 157 
 158 /*
 159  * Macro for use in components that are of type errmgr. It expands to
 160  * ORTE_MCA_BASE_VERSION_2_1_0 with the framework name "errmgr" and the numbers 3, 0, 0. */
 161 #define ORTE_ERRMGR_BASE_VERSION_3_0_0 \
 162     ORTE_MCA_BASE_VERSION_2_1_0("errmgr", 3, 0, 0)
 163 
 164 END_C_DECLS
 165 
 166 #endif

/* [<][>][^][v][top][bottom][index][help] */