1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2011 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2005 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
14 * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
15 * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
16 * reserved.
17 * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
18 * Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
19 * $COPYRIGHT$
20 *
21 * Additional copyrights may follow
22 *
23 * $HEADER$
24 */
25 /** @file:
26 *
27 * The Open RTE Error and Recovery Manager (ErrMgr)
28 *
 * This framework is the logically central clearing house for process/daemon
 * state updates. In particular, when a process fails and another process detects
 * the failure, that information is reported through this framework. This framework
 * then (depending on the active component) decides how to handle the failure.
33 *
34 * For example, if a process fails this may activate an automatic recovery
35 * of the process from a previous checkpoint, or initial state. Conversely,
36 * the active component could decide not to continue the job, and request that
37 * it be terminated. The error and recovery policy is determined by individual
38 * components within this framework.
39 *
40 */
41
42 #ifndef ORTE_MCA_ERRMGR_H
43 #define ORTE_MCA_ERRMGR_H
44
45 /*
46 * includes
47 */
48
49 #include "orte_config.h"
50 #include "orte/constants.h"
51 #include "orte/types.h"
52
53 #include "orte/mca/mca.h"
54 #include "opal/mca/base/base.h"
55
56 #include "opal/class/opal_object.h"
57 #include "opal/class/opal_pointer_array.h"
58 #include "opal/util/output.h"
59 #include "opal/util/error.h"
60
61 #include "orte/runtime/orte_globals.h"
62 #include "orte/mca/plm/plm_types.h"
63
64 BEGIN_C_DECLS
65
66 /*
67 * Macro definitions
68 */
69 /*
 * These macros and the associated error name array are used to output intelligible
 * error messages.
72 */
73
/**
 * Expands to opal_strerror(n); used to obtain an intelligible name for
 * error code @a n when producing error messages.
 */
#define ORTE_ERROR_NAME(n) opal_strerror(n)

/**
 * Report error code @a n through orte_errmgr.logfn, tagging the report with
 * the file name and line number of the call site (__FILE__ / __LINE__).
 *
 * The expansion is a single expression and deliberately does NOT end in a
 * semicolon: the caller writes the terminating ';' as for any function call.
 * A semicolon inside the macro would leave a stray empty statement after every
 * use and would break un-braced if/else constructs such as
 *
 *     if (ORTE_SUCCESS != rc) ORTE_ERROR_LOG(rc); else do_something();
 *
 * The argument is parenthesized so it is always passed as one expression.
 */
#define ORTE_ERROR_LOG(n) \
    orte_errmgr.logfn((n), __FILE__, __LINE__)
77
78 /*
79 * Framework Interfaces
80 */
/**
 * Signature of a module's initialization entry point.
 *
 * Takes no arguments and reports its outcome as an int status code.
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int
(*orte_errmgr_base_module_init_fn_t)(void);
88
/**
 * Signature of a module's finalization entry point.
 *
 * Takes no arguments and reports its outcome as an int status code.
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int
(*orte_errmgr_base_module_finalize_fn_t)(void);
96
/**
 * Signature of the error-logging function.
 *
 * According to the original interface note, this function is not tied to any
 * particular module, so it can be used at any time. The ORTE_ERROR_LOG macro
 * invokes it with the error code plus __FILE__ and __LINE__ of the call site.
 *
 * @param error_code  Error code to report
 * @param filename    Source file name associated with the report
 * @param line        Source line number associated with the report
 *
 * NOTE(review): filename is not const-qualified even though ORTE_ERROR_LOG
 * passes a string literal; implementations presumably never write through
 * it -- confirm before tightening the type.
 */
typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code,
                                                 char *filename,
                                                 int line);
101
102 /**
103 * Alert - self aborting
104 * This function is called when a process is aborting due to some internal error.
105 * It will finalize the process
106 * itself, and then exit - it takes no other actions. The intent here is to provide
107 * a last-ditch exit procedure that attempts to clean up a little.
108 */
109 typedef void (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
110 __opal_attribute_format_funcptr__(__printf__, 2, 3);
111
112 /**
113 * Alert - abort peers
114 * This function is called when a process wants to abort one or more peer processes.
115 * For example, MPI_Abort(comm) will use this function to terminate peers in the
116 * communicator group before aborting itself.
117 */
118 typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs,
119 orte_std_cntr_t num_procs,
120 int error_code);
121
122 /*
123 * Module Structure
124 */
125 struct orte_errmgr_base_module_2_3_0_t {
126 /** Initialization Function */
127 orte_errmgr_base_module_init_fn_t init;
128 /** Finalization Function */
129 orte_errmgr_base_module_finalize_fn_t finalize;
130
131 orte_errmgr_base_module_log_fn_t logfn;
132 orte_errmgr_base_module_abort_fn_t abort;
133 orte_errmgr_base_module_abort_peers_fn_t abort_peers;
134 };
135 typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
136 typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
137 ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr;
138
139 /*
140 * ErrMgr Component
141 */
142 struct orte_errmgr_base_component_3_0_0_t {
143 /** MCA base component */
144 mca_base_component_t base_version;
145 /** MCA base data */
146 mca_base_component_data_t base_data;
147
148 /** Verbosity Level */
149 int verbose;
150 /** Output Handle for opal_output */
151 int output_handle;
152 /** Default Priority */
153 int priority;
154 };
155 typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t;
156 typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t;
157
/*
 * Version macro for components of type errmgr.
 *
 * Expands to ORTE_MCA_BASE_VERSION_2_1_0 with the framework name "errmgr"
 * and the numbers 3, 0, 0 (presumably major/minor/release of the errmgr
 * interface, matching the _3_0_0 suffix).
 */
#define ORTE_ERRMGR_BASE_VERSION_3_0_0 ORTE_MCA_BASE_VERSION_2_1_0("errmgr", 3, 0, 0)
163
164 END_C_DECLS
165
166 #endif