1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ 2 /* 3 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana 4 * University Research and Technology 5 * Corporation. All rights reserved. 6 * Copyright (c) 2004-2005 The University of Tennessee and The University 7 * of Tennessee Research Foundation. All rights 8 * reserved. 9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 10 * University of Stuttgart. All rights reserved. 11 * Copyright (c) 2004-2005 The Regents of the University of California. 12 * All rights reserved. 13 * Copyright (c) 2007 Evergrid, Inc. All rights reserved. 14 * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. 15 * Copyright (c) 2015 Los Alamos National Security, LLC. All rights 16 * reserved. 17 * 18 * $COPYRIGHT$ 19 * 20 * Additional copyrights may follow 21 * 22 * $HEADER$ 23 */ 24 /** 25 * @file 26 * 27 * Checkpoint and Restart Service (CRS) Interface 28 * 29 * General Description: 30 * 31 * The OPAL Checkpoint and Restart Service (CRS) has been created to create an 32 * abstract notion of a single process checkpointer for upper levels to 33 * incorporate checkpoint/restart calls genericly into their code. This keeps 34 * the upper levels from becoming too tied to a specfic checkpoint and restart 35 * implementation. 36 * 37 * This interface will change in the future to allow for some additional 38 * specialized functionality such as memory inclusion/exclusion, explicit 39 * restarting while running, and others. 40 * 41 * Words to the Wise: 42 * 43 * The CRS module must adhere to the API exactly inorder to be fully supported. 44 * How the module goes about conforming to the API is an internal module issue 45 * and in no cases should the module impose restrictions upon the upper layers 46 * as this is an API violation. 47 * 48 */ 49 50 #ifndef MCA_CRS_H 51 #define MCA_CRS_H 52 53 #include "opal_config.h" 54 #include "opal/mca/mca.h" 55 #include "opal/mca/base/base.h" 56 #include "opal/class/opal_object.h" 57 58 BEGIN_C_DECLS 59 60 /** 61 * States of the module 62 */ 63 enum opal_crs_state_type_t { 64 OPAL_CRS_NONE = 0, 65 OPAL_CRS_CHECKPOINT = 1, 66 OPAL_CRS_RESTART_PRE = 2, 67 OPAL_CRS_RESTART = 3, /* RESTART_POST */ 68 OPAL_CRS_CONTINUE = 4, 69 OPAL_CRS_TERM = 5, 70 OPAL_CRS_RUNNING = 6, 71 OPAL_CRS_ERROR = 7, 72 OPAL_CRS_STATE_MAX = 8 73 }; 74 typedef enum opal_crs_state_type_t opal_crs_state_type_t; 75 76 /* 77 * Possible checkpoint options 78 */ 79 struct opal_crs_base_ckpt_options_1_0_0_t { 80 /** Parent is an object type */ 81 opal_object_t super; 82 83 /** Terminate after checkpoint */ 84 bool term; 85 /** Send SIGSTOP after checkpoint */ 86 bool stop; 87 88 /** INC Prep Only */ 89 bool inc_prep_only; 90 91 /** INC Recover Only */ 92 bool inc_recover_only; 93 94 #if OPAL_ENABLE_CRDEBUG == 1 95 /** Wait for debugger to attach after checkpoint */ 96 bool attach_debugger; 97 /** Do not wait for debugger to reattach after checkpoint */ 98 bool detach_debugger; 99 #endif 100 }; 101 typedef struct opal_crs_base_ckpt_options_1_0_0_t opal_crs_base_ckpt_options_1_0_0_t; 102 typedef struct opal_crs_base_ckpt_options_1_0_0_t opal_crs_base_ckpt_options_t; 103 OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_crs_base_ckpt_options_t); 104 105 /** 106 * Structure for Single process snapshot 107 * Each component is assumed to have extened this definition 108 * in the same way they exten the opal_crs_base_compoinent_t below. 109 */ 110 struct opal_crs_base_snapshot_1_0_0_t { 111 /** This is an object, so must have super */ 112 opal_list_item_t super; 113 114 /** MCA Component name */ 115 char * component_name; 116 117 /** Metadata filename */ 118 char * metadata_filename; 119 120 /** Metadata fd */ 121 FILE * metadata; 122 123 /** Absolute path the the snapshot directory */ 124 char * snapshot_directory; 125 126 /** Cold Start: 127 * If we are restarting cold, then we need to recreate this structure 128 * opal_restart would set this, and let the component do the heavy lifting 129 * of recreating the structure, sicne it doesn't know exactly how to. 130 */ 131 bool cold_start; 132 }; 133 typedef struct opal_crs_base_snapshot_1_0_0_t opal_crs_base_snapshot_1_0_0_t; 134 typedef struct opal_crs_base_snapshot_1_0_0_t opal_crs_base_snapshot_t; 135 136 OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_crs_base_snapshot_t); 137 138 /** 139 * Module initialization function. 140 * Returns OPAL_SUCCESS 141 */ 142 typedef int (*opal_crs_base_module_init_fn_t) 143 (void); 144 145 /** 146 * Module finalization function. 147 * Returns OPAL_SUCCESS 148 */ 149 typedef int (*opal_crs_base_module_finalize_fn_t) 150 (void); 151 152 /** 153 * Call the underlying checkpointer. 154 * Returns OPAL_SUCCESS upon success, and OPAL_ERROR otherwise. 155 * 156 * Arguments: 157 * pid = PID of the process to checkpoint, or 0 if checkpointing self. 158 * fname = the filename where the checkpoint has been written. 159 * state = The state at which the checkpoint is exiting 160 * - OPAL_CRS_CONTINUE 161 * Continuing after a checkpoint has been taken 162 * - OPAL_CRS_RESTART 163 * Restarting from a checkpoint 164 * - OPAL_CRS_ERROR 165 * Checkpoint was not successful. 166 * 167 * The 'fname' string is owned by the caller: if appropriate, it must be eventually 168 * freed by the caller. 169 */ 170 typedef int (*opal_crs_base_module_checkpoint_fn_t) 171 (pid_t pid, 172 opal_crs_base_snapshot_t *snapshot, 173 opal_crs_base_ckpt_options_t *options, 174 opal_crs_state_type_t *state); 175 176 /** 177 * Call the underlying restart command for this process 178 * Returns OPAL_SUCCESS or OPAL_CRS_ERROR 179 * 180 * Arguments: 181 * fname = Checkpoint filename 182 * spawn_child = true if the restarted process should be forked as a new process, 183 * in which case 'child_pid' will be returned. 184 * false if the restarted process should overwrite the current 185 * process space. 186 * child_pid = PID of the child that was started, if applicable 187 * 188 */ 189 typedef int (*opal_crs_base_module_restart_fn_t) 190 (opal_crs_base_snapshot_t *snapshot, 191 bool spawn_child, 192 pid_t *child_pid); 193 194 /** 195 * Disable the checkpointer 196 * Returns OPAL_SUCCESS or OPAL_CRS_ERROR 197 * 198 * This should set a flag/mutex to disallow checkpoints to occur. 199 * If a checkpoint were to occur while checkpoints are disabled, 200 * they should block until reenabled. 201 * A quality module implementation would notify the user that the 202 * checkpoint has been delayed until the program is out of this critical 203 * section of code. 204 */ 205 typedef int (*opal_crs_base_module_disable_checkpoint_fn_t) 206 (void); 207 208 /** 209 * Enable the checkpointer 210 * Returns OPAL_SUCCESS or OPAL_CRS_ERROR 211 * 212 * This should set a flag/mutex to allow checkpoints to occur 213 */ 214 typedef int (*opal_crs_base_module_enable_checkpoint_fn_t) 215 (void); 216 217 /** 218 * Prepare the CRS component for process launch. 219 * Some CRS components need to take action before the 220 * process is ever launched to do such things as: 221 * - seed the process environment 222 * - LD_PRELOAD 223 * - Analyze the binary before launch 224 * 225 * @param rank Rank of the process to be started 226 * @param app Absolute pathname of argv[0] 227 * @param argv Standard argv-style array, including a final NULL pointer 228 * @param env Standard environ-style array, including a final NULL pointer 229 */ 230 typedef int (*opal_crs_base_module_prelaunch_fn_t) 231 (int32_t rank, 232 char *base_snapshot_dir, 233 char **app, 234 char **cwd, 235 char ***argv, 236 char ***env); 237 238 /** 239 * Register another thread that may call this library. 240 * Some CR systems require that each thread that will call into their library 241 * register individually before doing so. 242 * 243 * Returns OPAL_SUCCESS or OPAL_ERROR 244 */ 245 typedef int (*opal_crs_base_module_reg_thread_fn_t) 246 (void); 247 248 /** 249 * Structure for CRS components. 250 */ 251 struct opal_crs_base_component_2_0_0_t { 252 /** MCA base component */ 253 mca_base_component_t base_version; 254 /** MCA base data */ 255 mca_base_component_data_t base_data; 256 257 /** Verbosity Level */ 258 int verbose; 259 /** Output Handle for opal_output */ 260 int output_handle; 261 /** Default Priority */ 262 int priority; 263 }; 264 typedef struct opal_crs_base_component_2_0_0_t opal_crs_base_component_2_0_0_t; 265 typedef struct opal_crs_base_component_2_0_0_t opal_crs_base_component_t; 266 267 /** 268 * Structure for CRS modules 269 */ 270 struct opal_crs_base_module_1_0_0_t { 271 /** Initialization Function */ 272 opal_crs_base_module_init_fn_t crs_init; 273 /** Finalization Function */ 274 opal_crs_base_module_finalize_fn_t crs_finalize; 275 276 /** Checkpoint interface */ 277 opal_crs_base_module_checkpoint_fn_t crs_checkpoint; 278 279 /** Restart Interface */ 280 opal_crs_base_module_restart_fn_t crs_restart; 281 282 /** Disable checkpoints */ 283 opal_crs_base_module_disable_checkpoint_fn_t crs_disable_checkpoint; 284 /** Enable checkpoints */ 285 opal_crs_base_module_enable_checkpoint_fn_t crs_enable_checkpoint; 286 287 /** Pre Launch */ 288 opal_crs_base_module_prelaunch_fn_t crs_prelaunch; 289 290 /** Per thread registration */ 291 opal_crs_base_module_reg_thread_fn_t crs_reg_thread; 292 }; 293 typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_1_0_0_t; 294 typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_t; 295 296 OPAL_DECLSPEC extern opal_crs_base_module_t opal_crs; 297 298 /** 299 * Macro for use in components that are of type CRS 300 */ 301 #define OPAL_CRS_BASE_VERSION_2_0_0 \ 302 OPAL_MCA_BASE_VERSION_2_1_0("crs", 2, 0, 0) 303 304 END_C_DECLS 305 306 #endif /* OPAL_CRS_H */ 307