root/opal/runtime/opal_cr.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


   1 /*
   2  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2005 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2008      Cisco Systems, Inc.  All rights reserved.
  13  * Copyright (c) 2017      IBM Corporation. All rights reserved.
  14  * $COPYRIGHT$
  15  *
  16  * Additional copyrights may follow
  17  *
  18  * $HEADER$
  19  */
  20 
  21 /**
  22  * @file
  23  *
  24  * Checkpoint functionality for Open MPI
  25  */
  26 
  27 #include "opal_config.h"
  28 #include "opal/mca/crs/crs.h"
  29 #include "opal/mca/event/event.h"
  30 #include "opal/util/output.h"
  31 #include "opal/prefetch.h"
  32 
  33 #ifndef OPAL_CR_H
  34 #define OPAL_CR_H
  35 
  36 
  37 BEGIN_C_DECLS
  38 
  39 /*
  40  * Constants shared with the opal-[checkpoint|restart] commands:
  41  * three char-sized codes followed by three string names. */
  42 #define OPAL_CR_DONE       ((char) 0)
  43 #define OPAL_CR_ACK        ((char) 1)
  44 #define OPAL_CR_CHECKPOINT ((char) 2)
  45 #define OPAL_CR_NAMED_PROG_R  ("opal_cr_prog_read")   /* NOTE(review): presumably a named-pipe file name under opal_cr_pipe_dir -- confirm */
  46 #define OPAL_CR_NAMED_PROG_W  ("opal_cr_prog_write")  /* parenthesized, so it cannot be concatenated with an adjacent string literal */
  47 #define OPAL_CR_BASE_ENV_NAME ("opal_cr_restart-env")
  48 
  49 /*
  50  * Possible responses to a checkpoint request from opal-checkpoint, followed by the status values of the checkpoint, continue and restart operations.
  51  * No enumerator has an explicit value: they are numbered from 0 in declaration order, so inserting one shifts every later value. */
  52 enum opal_cr_ckpt_cmd_state_t {
  53     OPAL_CHECKPOINT_CMD_START,       /* Checkpoint is starting on this request */
  54     OPAL_CHECKPOINT_CMD_IN_PROGRESS, /* Checkpoint is currently running */
  55     OPAL_CHECKPOINT_CMD_NULL,        /* Checkpoint cannot be started because it is not supported */
  56     OPAL_CHECKPOINT_CMD_ERROR,       /* An error occurred such that the checkpoint cannot be completed */
  57     /* State of the checkpoint operation */
  58     OPAL_CR_STATUS_NONE,       /* No checkpoint in progress */
  59     OPAL_CR_STATUS_REQUESTED,  /* Checkpoint has been requested */
  60     OPAL_CR_STATUS_RUNNING,    /* Checkpoint is currently running */
  61     OPAL_CR_STATUS_TERM,       /* Checkpoint is running and will terminate process upon completion */
  62     /* State of the continue operation */
  63     OPAL_CR_STATUS_CONTINUE,
  64     /* State of the restart operation (two values: PRE and POST) */
  65     OPAL_CR_STATUS_RESTART_PRE,
  66     OPAL_CR_STATUS_RESTART_POST
  67 };
  68 typedef enum opal_cr_ckpt_cmd_state_t opal_cr_ckpt_cmd_state_t;
  69 
  70     /* An output handle to be used by the cr runtime
  71      * functionality as an argument to opal_output() */
  72     OPAL_DECLSPEC extern int    opal_cr_output;
  73 
  74     /* Directory containing the named pipes for communication
  75      * with the opal-checkpoint tool */
  76     OPAL_DECLSPEC extern char * opal_cr_pipe_dir;
  77 
  78     /* Signal that opal-checkpoint uses to contact the
  79      * application process */
  80     OPAL_DECLSPEC extern int    opal_cr_entry_point_signal;
  81 
  82     /* Whether checkpointing is enabled in this application */
  83     OPAL_DECLSPEC extern bool   opal_cr_is_enabled;
  84 
  85     /* Whether the running application is a tool
  86      * (e.g., opal-checkpoint, orted, ...) */
  87     OPAL_DECLSPEC extern bool   opal_cr_is_tool;
  88 
  89     /* If a checkpoint has been requested (an int, not a bool; NOTE(review): presumably holds an OPAL_CR_STATUS_* value -- confirm) */
  90     OPAL_DECLSPEC extern int opal_cr_checkpoint_request;
  91 
  92     /* The current state of a checkpoint operation (NOTE(review): presumably an OPAL_CR_STATUS_* value -- confirm) */
  93     OPAL_DECLSPEC extern int opal_cr_checkpointing_state;
  94 
  95     /*
  96      * Set if one of the BTLs that shut down requires a full, clean rebuild of the
  97      * point-to-point stack on 'continue' as well as on 'restart'.
  98      */
  99     OPAL_DECLSPEC extern bool opal_cr_continue_like_restart;
 100 
 101 #if OPAL_ENABLE_CRDEBUG == 1
 102     /* Whether or not C/R debugging is enabled for this process */
 103     OPAL_DECLSPEC extern int MPIR_debug_with_checkpoint;
 104 
 105     /*
 106      * Set/clear the current thread id for the checkpointing thread
 107      */
 108     OPAL_DECLSPEC int opal_cr_debug_set_current_ckpt_thread_self(void);
 109     OPAL_DECLSPEC int opal_cr_debug_clear_current_ckpt_thread(void);
 110 
 111     /*
 112      * This MPI debugger function needs to be accessible here under this specific
 113      * name, so it deliberately breaks the usual naming conventions.
 114      */
 115     OPAL_DECLSPEC int MPIR_checkpoint_debugger_detach(void);
 116 
 117     /**
 118      * A tight loop to wait for the debugger to release this process from the
 119      * breakpoint.
 120      */
 121     OPAL_DECLSPEC void *MPIR_checkpoint_debugger_breakpoint(void);
 122 
 123     /**
 124      * A function for the debugger or CRS to force all threads into (NOTE(review): the original sentence stops here; presumably "into a wait state" -- confirm against the implementation)
 125      */
 126     OPAL_DECLSPEC void *MPIR_checkpoint_debugger_waitpoint(void);
 127 
 128     /**
 129      * A signal handler to force all threads to wait when the debugger detaches
 130      */
 131     OPAL_DECLSPEC void MPIR_checkpoint_debugger_signal_handler(int signo);
 132 #endif /* OPAL_ENABLE_CRDEBUG == 1 */
 133 
 134     /*
 135      * Refresh environment variables after a restart. NOTE(review): prev_pid is presumably the PID the process had before the restart -- confirm.
 136      */
 137     OPAL_DECLSPEC int opal_cr_refresh_environ(int prev_pid);
 138 
 139     /*
 140      * An application that doesn't want to have
 141      * a notification callback installed passes false here.
 142      * To take effect, this must be called before opal_cr_init().
 143      * Default: Enabled
 144      */
 145     OPAL_DECLSPEC int opal_cr_set_enabled(bool);
 146 
 147     /**
 148      * Initialize the notification and coordination
 149      * elements.
 150      */
 151     OPAL_DECLSPEC int opal_cr_init(void);
 152 
 153     /**
 154      * Finalize the notification and coordination
 155      * elements.
 156      */
 157     OPAL_DECLSPEC int opal_cr_finalize(void);
 158 
 159     /*************************************************
 160      * Check to see if a checkpoint has been requested
 161      *
 162      * When the checkpoint thread is disabled:
 163      *   This will be checked whenever the MPI Library
 164      *   is entered by the application. It will stop
 165      *   the application for the duration of the entire
 166      *   checkpoint.
 167      * When the checkpoint thread is enabled:
 168      *   The request is handled in the thread, in parallel
 169      *   with the execution of the program, regardless
 170      *   of where the program is in its execution.
 171      *   The problem with this method is that it
 172      *   requires the support of progress threads,
 173      *   which is currently not working properly.
 174      *
 175      *************************************************/
 176     OPAL_DECLSPEC void opal_cr_test_if_checkpoint_ready(void);
 177 
 178     /* If the checkpoint operation should be stalled to
 179      * wait for another service to complete before
 180      * continuing with the checkpoint (opal_cr_stall_check is also tested by OPAL_CR_TEST_CHECKPOINT_READY_STALL()) */
 181     OPAL_DECLSPEC extern bool opal_cr_stall_check;
 182     OPAL_DECLSPEC extern bool opal_cr_currently_stalled;
 183 
 184 #if OPAL_ENABLE_FT_THREAD == 1
 185     /* Thread functions; each is called by the matching OPAL_CR_*_LIBRARY() / OPAL_CR_NOOP_PROGRESS() macro defined in this header when OPAL_ENABLE_FT_THREAD == 1 */
 186     OPAL_DECLSPEC void opal_cr_thread_init_library(void);
 187     OPAL_DECLSPEC void opal_cr_thread_finalize_library(void);
 188     OPAL_DECLSPEC void opal_cr_thread_abort_library(void);
 189     OPAL_DECLSPEC void opal_cr_thread_enter_library(void);
 190     OPAL_DECLSPEC void opal_cr_thread_exit_library(void);
 191     OPAL_DECLSPEC void opal_cr_thread_noop_progress(void);
 192 #endif /* OPAL_ENABLE_FT_THREAD == 1 */
 193 
 194     /*
 195      * If not using FT then make the #defines noops. Each expands to a lone ';', so "if (c) OPAL_CR_ENTER_LIBRARY(); else ..." does not compile; brace such call sites.
 196      * NOTE(review): with OPAL_ENABLE_FT == 0 and OPAL_ENABLE_FT_CR == 1 both this block and the "If using FT" block in this header would define these macros -- presumably configure never produces that combination; confirm. */
 197 #if OPAL_ENABLE_FT == 0 || OPAL_ENABLE_FT_CR == 0
 198 #define OPAL_CR_TEST_CHECKPOINT_READY() ;
 199 #define OPAL_CR_TEST_CHECKPOINT_READY_STALL() ;
 200 #define OPAL_CR_INIT_LIBRARY() ;
 201 #define OPAL_CR_FINALIZE_LIBRARY() ;
 202 #define OPAL_CR_ABORT_LIBRARY() ;
 203 #define OPAL_CR_ENTER_LIBRARY() ;
 204 #define OPAL_CR_EXIT_LIBRARY() ;
 205 #define OPAL_CR_NOOP_PROGRESS() ;
 206 #endif /* #if OPAL_ENABLE_FT == 0 || OPAL_ENABLE_FT_CR == 0 */
 207 
 208     /*
 209      * If using FT: the checkpoint test calls opal_cr_test_if_checkpoint_ready() only when opal_cr_is_enabled is set (the _STALL variant additionally requires opal_cr_stall_check to be false).
 210      * The macros expand to a bare { } block, so "if (c) OPAL_CR_TEST_CHECKPOINT_READY(); else ..." does not compile; brace such call sites. */
 211 #if OPAL_ENABLE_FT_CR == 1
 212 #define OPAL_CR_TEST_CHECKPOINT_READY()      \
 213   {                                          \
 214     if(OPAL_UNLIKELY(opal_cr_is_enabled) ) { \
 215       opal_cr_test_if_checkpoint_ready();    \
 216     }                                        \
 217   }
 218 
 219 #define OPAL_CR_TEST_CHECKPOINT_READY_STALL()        \
 220   {                                                  \
 221     if(OPAL_UNLIKELY(opal_cr_is_enabled && !opal_cr_stall_check)) { \
 222       opal_cr_test_if_checkpoint_ready();            \
 223     }                                                \
 224   }
 225 
 226 /* If *not* using FT thread: every library hook is the plain checkpoint test (note that these definitions already end in ';') */
 227 #if OPAL_ENABLE_FT_THREAD == 0
 228 #define OPAL_CR_INIT_LIBRARY()     OPAL_CR_TEST_CHECKPOINT_READY();
 229 #define OPAL_CR_FINALIZE_LIBRARY() OPAL_CR_TEST_CHECKPOINT_READY();
 230 #define OPAL_CR_ABORT_LIBRARY()    OPAL_CR_TEST_CHECKPOINT_READY();
 231 #define OPAL_CR_ENTER_LIBRARY()    OPAL_CR_TEST_CHECKPOINT_READY();
 232 #define OPAL_CR_EXIT_LIBRARY()     OPAL_CR_TEST_CHECKPOINT_READY();
 233 #define OPAL_CR_NOOP_PROGRESS()    OPAL_CR_TEST_CHECKPOINT_READY();
 234 #endif /* OPAL_ENABLE_FT_THREAD == 0 */
 235 
 236 /* If using FT thread: each hook calls the matching opal_cr_thread_*() function */
 237 #if OPAL_ENABLE_FT_THREAD == 1
 238 #define OPAL_CR_INIT_LIBRARY()    \
 239  {                                \
 240    opal_cr_thread_init_library(); \
 241  }
 242 #define OPAL_CR_FINALIZE_LIBRARY()    \
 243  {                                    \
 244    opal_cr_thread_finalize_library(); \
 245  }
 246 #define OPAL_CR_ABORT_LIBRARY()    \
 247  {                                 \
 248    opal_cr_thread_abort_library(); \
 249  }
 250 #define OPAL_CR_ENTER_LIBRARY()    \
 251  {                                 \
 252    opal_cr_thread_enter_library(); \
 253  }
 254 #define OPAL_CR_EXIT_LIBRARY()    \
 255  {                                \
 256    opal_cr_thread_exit_library(); \
 257  }
 258 #define OPAL_CR_NOOP_PROGRESS()    \
 259  {                                 \
 260    opal_cr_thread_noop_progress(); \
 261  }
 262 #endif /* OPAL_ENABLE_FT_THREAD == 1 */
 263 
 264 #endif /* OPAL_ENABLE_FT_CR == 1 */
 265 
 266     /*******************************
 267      * Notification Routines
 268      *******************************/
 269     /*
 270      * The notification callback type below takes an opal_cr_ckpt_cmd_state_t value and returns an int.
 271      */
 272     /**
 273      * A function to respond to the async checkpoint request;
 274      * this is useful when figuring out who should respond
 275      * when stalling.
 276      */
 277     typedef int (*opal_cr_notify_callback_fn_t) (opal_cr_ckpt_cmd_state_t);
 278     /* NOTE(review): presumably installs new_func and hands back the previously registered callback through prev_func -- confirm */
 279     OPAL_DECLSPEC int opal_cr_reg_notify_callback
 280     (opal_cr_notify_callback_fn_t new_func,
 281      opal_cr_notify_callback_fn_t *prev_func);
 282 
 283     /**
 284      * Function to go through the INC
 285      * - Call Registered INC_Coord(CHECKPOINT)
 286      * - Call the CRS.checkpoint()
 287      * - Call Registered INC_Coord(state)
 288      */
 289     OPAL_DECLSPEC int opal_cr_inc_core(pid_t pid,
 290                                        opal_crs_base_snapshot_t *snapshot,
 291                                        opal_crs_base_ckpt_options_t *options,
 292                                        int *state);
 293     /* NOTE(review): prep/ckpt/recover presumably are the individual phases of opal_cr_inc_core() -- confirm against the implementation */
 294     OPAL_DECLSPEC int opal_cr_inc_core_prep(void);
 295     OPAL_DECLSPEC int opal_cr_inc_core_ckpt(pid_t pid,
 296                                             opal_crs_base_snapshot_t *snapshot,
 297                                             opal_crs_base_ckpt_options_t *options,
 298                                             int *state);
 299     OPAL_DECLSPEC int opal_cr_inc_core_recover(int state);
 300 
 301 
 302     /*******************************
 303      * User Coordination Routines
 304      *******************************/
 305     typedef enum {
 306         OPAL_CR_INC_PRE_CRS_PRE_MPI   = 0,
 307         OPAL_CR_INC_PRE_CRS_POST_MPI  = 1,
 308         OPAL_CR_INC_CRS_PRE_CKPT      = 2,
 309         OPAL_CR_INC_CRS_POST_CKPT     = 3,
 310         OPAL_CR_INC_POST_CRS_PRE_MPI  = 4,
 311         OPAL_CR_INC_POST_CRS_POST_MPI = 5,
 312         OPAL_CR_INC_MAX               = 6  /* one past the last event value, i.e. the number of events above */
 313     } opal_cr_user_inc_callback_event_t;
 314 
 315     typedef enum {
 316         OPAL_CR_INC_STATE_PREPARE  = 0,
 317         OPAL_CR_INC_STATE_CONTINUE = 1,
 318         OPAL_CR_INC_STATE_RESTART  = 2,
 319         OPAL_CR_INC_STATE_ERROR    = 3
 320     } opal_cr_user_inc_callback_state_t;
 321 
 322     /**
 323      * User coordination callback routine: receives the event and the state, returns an int
 324      */
 325     typedef int (*opal_cr_user_inc_callback_fn_t)(opal_cr_user_inc_callback_event_t event,
 326                                                   opal_cr_user_inc_callback_state_t state);
 327     /* NOTE(review): presumably registers function for the given event and hands back the previous one through prev_function -- confirm */
 328     OPAL_DECLSPEC int opal_cr_user_inc_register_callback
 329                       (opal_cr_user_inc_callback_event_t event,
 330                        opal_cr_user_inc_callback_fn_t  function,
 331                        opal_cr_user_inc_callback_fn_t  *prev_function);
 332     /* NOTE(review): the ompi_ prefix differs from the opal_cr_ prefix used by the other declarations in this header -- confirm it is intentional */
 333     OPAL_DECLSPEC int ompi_trigger_user_inc_callback(opal_cr_user_inc_callback_event_t event,
 334                                                 opal_cr_user_inc_callback_state_t state);
 335 
 336 
 337     /*******************************
 338      * Coordination Routines
 339      *******************************/
 340     /**
 341      * Coordination callback routine signature: takes an int (NOTE(review): presumably the checkpoint state -- confirm) and returns an int
 342      */
 343     typedef int (*opal_cr_coord_callback_fn_t) (int);
 344 
 345     /**
 346      * Register a checkpoint coordination routine
 347      * for a higher level.
 348      */
 349      OPAL_DECLSPEC int opal_cr_reg_coord_callback
 350      (opal_cr_coord_callback_fn_t  new_func,
 351       opal_cr_coord_callback_fn_t *prev_func);
 352 
 353     /**
 354      * OPAL Checkpoint Coordination Routine
 355      */
 356     OPAL_DECLSPEC int opal_cr_coord(int state);
 357 
 358     /**
 359      * Checkpoint life-cycle timing. The OPAL_CR_SET_TIMER / OPAL_CR_DISPLAY_ALL_TIMERS / OPAL_CR_CLEAR_TIMERS macros in this header wrap these calls behind a test of opal_cr_timing_enabled.
 360      * NOTE(review): idx is presumably one of the OPAL_CR_TIMER_* indices -- confirm. */
 361     OPAL_DECLSPEC void opal_cr_set_time(int idx);
 362     OPAL_DECLSPEC void opal_cr_display_all_timers(void);
 363     OPAL_DECLSPEC void opal_cr_clear_timers(void);
 364 
 365     OPAL_DECLSPEC extern bool opal_cr_timing_enabled;
 366     OPAL_DECLSPEC extern bool opal_cr_timing_barrier_enabled;
 367     OPAL_DECLSPEC extern int  opal_cr_timing_my_rank;
 368     OPAL_DECLSPEC extern int  opal_cr_timing_target_rank;
 369 
 370 
 371 #define OPAL_CR_TIMER_ENTRY0    0  /* timer indices: consecutive integers starting at 0 */
 372 #define OPAL_CR_TIMER_ENTRY1    1
 373 #define OPAL_CR_TIMER_ENTRY2    2
 374 #define OPAL_CR_TIMER_CRCPBR0   3
 375 #define OPAL_CR_TIMER_CRCP0     4
 376 #define OPAL_CR_TIMER_CRCPBR1   5
 377 #define OPAL_CR_TIMER_P2P0      6
 378 #define OPAL_CR_TIMER_P2P1      7
 379 #define OPAL_CR_TIMER_P2PBR0    8
 380 #define OPAL_CR_TIMER_CORE0     9
 381 #define OPAL_CR_TIMER_CORE1    10
 382 #define OPAL_CR_TIMER_COREBR0  11
 383 #define OPAL_CR_TIMER_P2P2     12
 384 #define OPAL_CR_TIMER_P2PBR1   13
 385 #define OPAL_CR_TIMER_P2P3     14
 386 #define OPAL_CR_TIMER_P2PBR2   15
 387 #define OPAL_CR_TIMER_CRCP1    16
 388 #define OPAL_CR_TIMER_COREBR1  17
 389 #define OPAL_CR_TIMER_CORE2    18
 390 #define OPAL_CR_TIMER_ENTRY3   19
 391 #define OPAL_CR_TIMER_ENTRY4   20
 392 #define OPAL_CR_TIMER_MAX      21  /* one past the last index, i.e. the number of timer indices above */
 393 
 394 /* Timer macros: each calls its opal_cr_*() function only when opal_cr_timing_enabled (a bool, tested with "> 0") is set. They expand to a bare { } block, so "if (c) OPAL_CR_SET_TIMER(i); else ..." does not compile -- brace such call sites. */
 395 #define OPAL_CR_CLEAR_TIMERS()                          \
 396     {                                                   \
 397         if(OPAL_UNLIKELY(opal_cr_timing_enabled > 0)) { \
 398             opal_cr_clear_timers();                     \
 399         }                                               \
 400     }
 401 
 402 #define OPAL_CR_SET_TIMER(idx)                          \
 403     {                                                   \
 404         if(OPAL_UNLIKELY(opal_cr_timing_enabled > 0)) { \
 405             opal_cr_set_time(idx);                      \
 406         }                                               \
 407     }
 408 
 409 #define OPAL_CR_DISPLAY_ALL_TIMERS()                    \
 410     {                                                   \
 411         if(OPAL_UNLIKELY(opal_cr_timing_enabled > 0)) { \
 412             opal_cr_display_all_timers();               \
 413         }                                               \
 414     }
 415 
 416 END_C_DECLS
 417 
 418 #endif /* OPAL_CR_H */
 419 

/* [<][>][^][v][top][bottom][index][help] */