root/opal/mca/crs/crs.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2005 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2007      Evergrid, Inc. All rights reserved.
  14  * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
  15  * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
  16  *                         reserved.
  17  *
  18  * $COPYRIGHT$
  19  *
  20  * Additional copyrights may follow
  21  *
  22  * $HEADER$
  23  */
  24 /**
  25  * @file
  26  *
  27  * Checkpoint and Restart Service (CRS) Interface
  28  *
  29  * General Description:
  30  *
  31  * The OPAL Checkpoint and Restart Service (CRS) has been created to create an
  32  * abstract notion of a single process checkpointer for upper levels to
  33  * incorporate checkpoint/restart calls generically into their code. This keeps
  34  * the upper levels from becoming too tied to a specific checkpoint and restart
  35  * implementation.
  36  *
  37  * This interface will change in the future to allow for some additional
  38  * specialized functionality such as memory inclusion/exclusion, explicit
  39  * restarting while running, and others.
  40  *
  41  * Words to the Wise:
  42  *
  43  * The CRS module must adhere to the API exactly in order to be fully supported.
  44  * How the module goes about conforming to the API is an internal module issue
  45  * and in no cases should the module impose restrictions upon the upper layers
  46  * as this is an API violation.
  47  *
  48  */
  49 
  50 #ifndef MCA_CRS_H
  51 #define MCA_CRS_H
  52 
  53 #include "opal_config.h"
  54 #include "opal/mca/mca.h"
  55 #include "opal/mca/base/base.h"
  56 #include "opal/class/opal_object.h"
  57 
  58 BEGIN_C_DECLS
  59 
  60 /**
  61  * States of the module (e.g. the type pointed to by the 'state' argument of opal_crs_base_module_checkpoint_fn_t)
  62  */
  63 enum opal_crs_state_type_t {
  64     OPAL_CRS_NONE        = 0,
  65     OPAL_CRS_CHECKPOINT  = 1,
  66     OPAL_CRS_RESTART_PRE = 2,
  67     OPAL_CRS_RESTART     = 3, /* RESTART_POST: restarting from a checkpoint */
  68     OPAL_CRS_CONTINUE    = 4, /* Continuing after a checkpoint has been taken */
  69     OPAL_CRS_TERM        = 5,
  70     OPAL_CRS_RUNNING     = 6,
  71     OPAL_CRS_ERROR       = 7, /* Checkpoint was not successful */
  72     OPAL_CRS_STATE_MAX   = 8  /* NOTE(review): presumably an upper-bound sentinel rather than a real state -- confirm */
  73 };
  74 typedef enum opal_crs_state_type_t opal_crs_state_type_t;
  75 
  76 /*
  77  * Possible checkpoint options (the type of the 'options' argument of opal_crs_base_module_checkpoint_fn_t)
  78  */
  79 struct opal_crs_base_ckpt_options_1_0_0_t {
  80     /** Parent is an object type */
  81     opal_object_t super;
  82 
  83     /** Terminate after checkpoint */
  84     bool term;
  85     /** Send SIGSTOP after checkpoint */
  86     bool stop;
  87 
  88     /** INC Prep Only (NOTE(review): 'INC' is not expanded anywhere in this header) */
  89     bool inc_prep_only;
  90 
  91     /** INC Recover Only */
  92     bool inc_recover_only;
  93 
  94 #if OPAL_ENABLE_CRDEBUG == 1 /* the two members below exist only when OPAL_ENABLE_CRDEBUG is 1 */
  95     /** Wait for debugger to attach after checkpoint */
  96     bool attach_debugger;
  97     /** Do not wait for debugger to reattach after checkpoint */
  98     bool detach_debugger;
  99 #endif /* OPAL_ENABLE_CRDEBUG == 1 */
 100 };
 101 typedef struct opal_crs_base_ckpt_options_1_0_0_t opal_crs_base_ckpt_options_1_0_0_t; /* versioned name */
 102 typedef struct opal_crs_base_ckpt_options_1_0_0_t opal_crs_base_ckpt_options_t; /* unversioned alias of the same struct */
 103 OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_crs_base_ckpt_options_t);
 104 
 105 /**
 106  * Structure for Single process snapshot
 107  * Each component is assumed to have extended this definition
 108  * in the same way they extend the opal_crs_base_component_t below.
 109  */
 110 struct opal_crs_base_snapshot_1_0_0_t {
 111     /** This is an object, so must have super (the parent type here is opal_list_item_t) */
 112     opal_list_item_t super;
 113 
 114     /** MCA Component name */
 115     char * component_name;
 116 
 117     /** Metadata filename */
 118     char * metadata_filename;
 119 
 120     /** Metadata file handle (a stdio FILE stream, not a raw file descriptor) */
 121     FILE * metadata;
 122 
 123     /** Absolute path to the snapshot directory */
 124     char * snapshot_directory;
 125 
 126     /** Cold Start:
 127      * If we are restarting cold, then we need to recreate this structure
 128      *  opal_restart would set this, and let the component do the heavy lifting
 129      *  of recreating the structure, since it doesn't know exactly how to.
 130      */
 131     bool cold_start;
 132 };
 133 typedef struct opal_crs_base_snapshot_1_0_0_t opal_crs_base_snapshot_1_0_0_t; /* versioned name */
 134 typedef struct opal_crs_base_snapshot_1_0_0_t opal_crs_base_snapshot_t; /* unversioned alias of the same struct */
 135 
 136 OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_crs_base_snapshot_t);
 137 
 138 /**
 139  * Module initialization function (takes no arguments).
 140  * Returns OPAL_SUCCESS; no failure code is documented for this call.
 141  */
 142 typedef int (*opal_crs_base_module_init_fn_t)
 143      (void);
 144 
 145 /**
 146  * Module finalization function (takes no arguments).
 147  * Returns OPAL_SUCCESS; no failure code is documented for this call.
 148  */
 149 typedef int (*opal_crs_base_module_finalize_fn_t)
 150      (void);
 151 
 152 /**
 153  * Call the underlying checkpointer.
 154  * Returns OPAL_SUCCESS upon success, and OPAL_ERROR otherwise.
 155  *
 156  * Arguments:
 157  *   pid      = PID of the process to checkpoint, or 0 if checkpointing self.
 158  *   snapshot = snapshot object (opal_crs_base_snapshot_t) for this checkpoint.
 159  *   options  = checkpoint options (opal_crs_base_ckpt_options_t).
 160  *   state    = The state at which the checkpoint is exiting:
 161  *     - OPAL_CRS_CONTINUE: Continuing after a checkpoint has been taken
 162  *     - OPAL_CRS_RESTART:  Restarting from a checkpoint
 163  *     - OPAL_CRS_ERROR:    Checkpoint was not successful.
 164  *
 165  * NOTE(review): this comment used to describe a caller-owned 'fname' string (the
 166  * filename where the checkpoint was written); the signature has no such parameter.
 167  * Presumably the checkpoint location is now carried by 'snapshot', and the caller
 168  * still owns the objects it passes in -- confirm against the callers.
 169  */
 170 typedef int (*opal_crs_base_module_checkpoint_fn_t)
 171      (pid_t pid,
 172       opal_crs_base_snapshot_t *snapshot,
 173       opal_crs_base_ckpt_options_t *options,
 174       opal_crs_state_type_t *state);
 175 
 176 /**
 177  * Call the underlying restart command for this process
 178  * Returns OPAL_SUCCESS or OPAL_CRS_ERROR
 179  *
 180  * Arguments:
 181  *  snapshot = Snapshot (opal_crs_base_snapshot_t) to restart from
 182  *  spawn_child  = true if the restarted process should be forked as a new process,
 183  *                      in which case 'child_pid' will be returned.
 184  *                 false if the restarted process should overwrite the current
 185  *                       process space.
 186  *  child_pid = PID of the child that was started, if applicable
 187  * NOTE(review): OPAL_CRS_ERROR is an opal_crs_state_type_t value, not a return code; OPAL_ERROR may be intended -- confirm
 188  */
 189 typedef int (*opal_crs_base_module_restart_fn_t)
 190      (opal_crs_base_snapshot_t *snapshot,
 191       bool spawn_child,
 192       pid_t *child_pid);
 193 
 194 /**
 195  * Disable the checkpointer
 196  * Returns OPAL_SUCCESS or OPAL_CRS_ERROR
 197  * NOTE(review): OPAL_CRS_ERROR is an opal_crs_state_type_t value, not a return code; OPAL_ERROR may be intended -- confirm
 198  *
 199  * This should set a flag/mutex to disallow checkpoints to occur.
 200  * If a checkpoint were to occur while checkpoints are disabled,
 201  * it should block until checkpoints are reenabled.
 202  * A quality module implementation would notify the user that the checkpoint
 203  * has been delayed until the program is out of this critical section of code.
 204  */
 205 typedef int (*opal_crs_base_module_disable_checkpoint_fn_t)
 206      (void);
 207 
 208 /**
 209  * Enable the checkpointer
 210  * Returns OPAL_SUCCESS or OPAL_CRS_ERROR
 211  * NOTE(review): OPAL_CRS_ERROR is an opal_crs_state_type_t value, not a return code; OPAL_ERROR may be intended -- confirm
 212  * This should set a flag/mutex to allow checkpoints to occur
 213  */
 214 typedef int (*opal_crs_base_module_enable_checkpoint_fn_t)
 215      (void);
 216 
 217 /**
 218  * Prepare the CRS component for process launch.
 219  * Some CRS components need to take action before the process is ever launched,
 220  * to do such things as: seed the process environment, LD_PRELOAD, analyze the binary.
 221  * @param rank Rank of the process to be started
 222  * @param base_snapshot_dir NOTE(review): not documented; presumably the base directory for snapshots -- confirm
 223  * @param app  Pointer to the absolute pathname of argv[0]
 224  * @param cwd  NOTE(review): not documented; presumably a pointer to the working directory string -- confirm
 225  * @param argv Pointer to a standard argv-style array, including a final NULL pointer
 226  * @param env  Pointer to a standard environ-style array, including a final NULL pointer
 227  * app, cwd, argv and env are passed by address; NOTE(review): presumably so the component may replace them -- confirm
 228  * @return int status; NOTE(review): the return codes are not documented here -- confirm against implementations
 229  */
 230 typedef int (*opal_crs_base_module_prelaunch_fn_t)
 231          (int32_t rank,
 232           char *base_snapshot_dir,
 233           char **app,
 234           char **cwd,
 235           char ***argv,
 236           char ***env);
 237 
 238 /**
 239  * Register another thread that may call this library.
 240  * Some CR systems require that each thread that will call into their library
 241  * register individually before doing so.
 242  * Takes no arguments; NOTE(review): presumably invoked from the thread being registered -- confirm.
 243  * Returns OPAL_SUCCESS or OPAL_ERROR
 244  */
 245 typedef int (*opal_crs_base_module_reg_thread_fn_t)
 246      (void);
 247 
 248 /**
 249  * Structure for CRS components (the 2_0_0 revision of the component type; MCA base fields followed by CRS-specific settings).
 250  */
 251 struct opal_crs_base_component_2_0_0_t {
 252     /** MCA base component */
 253     mca_base_component_t base_version;
 254     /** MCA base data */
 255     mca_base_component_data_t base_data;
 256 
 257     /** Verbosity Level */
 258     int verbose;
 259     /** Output Handle for opal_output */
 260     int output_handle;
 261     /** Default Priority */
 262     int priority;
 263 };
 264 typedef struct opal_crs_base_component_2_0_0_t opal_crs_base_component_2_0_0_t; /* versioned name */
 265 typedef struct opal_crs_base_component_2_0_0_t opal_crs_base_component_t; /* unversioned alias of the same struct */
 266 
 267 /**
 268  * Structure for CRS modules: a table of function pointers, one per operation of the module API declared in this header.
 269  */
 270 struct opal_crs_base_module_1_0_0_t {
 271     /** Initialization Function */
 272     opal_crs_base_module_init_fn_t           crs_init;
 273     /** Finalization Function */
 274     opal_crs_base_module_finalize_fn_t       crs_finalize;
 275 
 276     /** Checkpoint interface */
 277     opal_crs_base_module_checkpoint_fn_t     crs_checkpoint;
 278 
 279     /** Restart Interface */
 280     opal_crs_base_module_restart_fn_t        crs_restart;
 281 
 282     /** Disable checkpoints */
 283     opal_crs_base_module_disable_checkpoint_fn_t crs_disable_checkpoint;
 284     /** Enable checkpoints */
 285     opal_crs_base_module_enable_checkpoint_fn_t  crs_enable_checkpoint;
 286 
 287     /** Pre Launch */
 288     opal_crs_base_module_prelaunch_fn_t      crs_prelaunch;
 289 
 290     /** Per thread registration */
 291     opal_crs_base_module_reg_thread_fn_t      crs_reg_thread;
 292 };
 293 typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_1_0_0_t; /* versioned name */
 294 typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_t; /* unversioned alias of the same struct */
 295 
 296 OPAL_DECLSPEC extern opal_crs_base_module_t opal_crs; /* global module table, defined elsewhere; NOTE(review): presumably holds the selected component's functions -- confirm */
 297 
 298 /**
 299  * Macro for use in components that are of type CRS: expands to OPAL_MCA_BASE_VERSION_2_1_0 with framework name "crs" and version 2, 0, 0.
 300  */
 301 #define OPAL_CRS_BASE_VERSION_2_0_0 \
 302     OPAL_MCA_BASE_VERSION_2_1_0("crs", 2, 0, 0)
 303 
 304 END_C_DECLS
 305 
 306 #endif /* MCA_CRS_H */
 307 

/* [<][>][^][v][top][bottom][index][help] */