root/orte/runtime/orte_cr.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. orte_cr_init
  2. orte_cr_finalize
  3. orte_cr_coord
  4. orte_cr_coord_pre_ckpt
  5. orte_cr_coord_pre_restart
  6. orte_cr_coord_pre_continue
  7. orte_cr_coord_post_ckpt
  8. orte_cr_coord_post_restart
  9. orte_cr_coord_post_continue
  10. orte_cr_entry_point_init
  11. orte_cr_entry_point_finalize

   1 /*
   2  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2005 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
  13  *                         reserved.
  14  * $COPYRIGHT$
  15  *
  16  * Additional copyrights may follow
  17  *
  18  * $HEADER$
  19  */
  20 
  21 /** @file
  22  *
  23  * ORTE Layer Checkpoint/Restart Runtime functions
  24  *
  25  */
  26 
  27 #include "orte_config.h"
  28 #include "orte/constants.h"
  29 
  30 #include <errno.h>
  31 #ifdef HAVE_UNISTD_H
  32 #include <unistd.h>
  33 #endif  /* HAVE_UNISTD_H */
  34 #ifdef HAVE_FCNTL_H
  35 #include <fcntl.h>
  36 #endif  /* HAVE_FCNTL_H */
  37 #ifdef HAVE_SYS_TYPES_H
  38 #include <sys/types.h>
  39 #endif  /* HAVE_SYS_TYPES_H */
  40 #ifdef HAVE_SYS_STAT_H
  41 #include <sys/stat.h>  /* for mkfifo */
  42 #endif  /* HAVE_SYS_STAT_H */
  43 
  44 #include "opal/util/opal_environ.h"
  45 #include "opal/util/output.h"
  46 #include "opal/util/basename.h"
  47 #include "opal/mca/event/event.h"
  48 #include "opal/mca/crs/crs.h"
  49 #include "opal/mca/crs/base/base.h"
  50 #include "opal/runtime/opal_cr.h"
  51 
  52 #include "orte/runtime/orte_cr.h"
  53 #include "orte/util/proc_info.h"
  54 #include "orte/runtime/orte_globals.h"
  55 
  56 #include "orte/mca/plm/base/base.h"
  57 #include "orte/mca/ess/ess.h"
  58 #include "orte/mca/ess/base/base.h"
  59 #include "orte/mca/routed/base/base.h"
  60 #include "orte/mca/routed/routed.h"
  61 #include "orte/mca/rml/base/base.h"
  62 #include "orte/mca/iof/base/base.h"
  63 #include "orte/mca/snapc/snapc.h"
  64 #include "orte/mca/snapc/base/base.h"
  65 #include "orte/mca/filem/base/base.h"
  66 
/*************
 * Local functions
 *************/
/* Phase handlers that orte_cr_coord() runs BEFORE calling the previous
 * (lower layer) coordination callback. */
static int orte_cr_coord_pre_ckpt(void);
static int orte_cr_coord_pre_restart(void);
static int orte_cr_coord_pre_continue(void);

/* Phase handlers that orte_cr_coord() runs AFTER the previous
 * (lower layer) coordination callback has returned successfully. */
static int orte_cr_coord_post_ckpt(void);
static int orte_cr_coord_post_restart(void);
static int orte_cr_coord_post_continue(void);

/* Not static, so this flag has external linkage despite sitting under the
 * "Local functions" banner. orte_cr_init() (re)sets it to true. */
bool orte_cr_flush_restart_files = true;

/*************
 * Local vars
 *************/
/* Coordination callback that was installed before orte_cr_coord; its address
 * is handed to opal_cr_reg_coord_callback() in orte_cr_init(), and
 * orte_cr_coord() calls through it. */
static opal_cr_coord_callback_fn_t  prev_coord_callback = NULL;

/* Output stream id used by every opal_output_verbose() call in this file;
 * stays -1 until orte_cr_init() assigns it. */
static int orte_cr_output = -1;
/* Storage for the "verbose" MCA variable registered in orte_cr_init(). */
static int orte_cr_verbose = 0;
  87 
  88 /*
  89  * CR Init
  90  */
  91 int orte_cr_init(void)
  92 {
  93     int ret, exit_status = ORTE_SUCCESS;
  94 
  95     /*
  96      * OPAL Frameworks
  97      */
  98     if (OPAL_SUCCESS != (ret = opal_cr_init() ) ) {
  99         exit_status = ret;
 100         goto cleanup;
 101     }
 102 
 103     /*
 104      * Register MCA Parameters
 105      */
 106     orte_cr_verbose = 0;
 107     (void) mca_base_var_register ("orte", "orte_cr", NULL, "verbose",
 108                                   "Verbose output for the ORTE Checkpoint/Restart functionality",
 109                                   MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 110                                   OPAL_INFO_LVL_8,
 111                                   MCA_BASE_VAR_SCOPE_READONLY,
 112                                   &orte_cr_verbose);
 113 
 114     /*** RHC: This is going to crash-and-burn when the output conversion is
 115      * completed as opal_output will have no idea what opal_cr_output stream means,
 116      * or even worse, will have assigned it to someone else!
 117      */
 118 
 119     if(0 != orte_cr_verbose) {
 120         orte_cr_output = opal_output_open(NULL);
 121         opal_output_set_verbosity(orte_cr_output, orte_cr_verbose);
 122     } else {
 123         orte_cr_output = opal_cr_output;
 124     }
 125 
 126     opal_output_verbose(10, orte_cr_output,
 127                         "orte_cr: init: orte_cr_init()\n");
 128 
 129     /* Init ORTE Entry Point Function */
 130     if( ORTE_SUCCESS != (ret = orte_cr_entry_point_init()) ) {
 131         exit_status = ret;
 132         goto cleanup;
 133     }
 134 
 135     /* Register the ORTE interlevel coordination callback */
 136     opal_cr_reg_coord_callback(orte_cr_coord, &prev_coord_callback);
 137 
 138     /* Typically this is not needed. Individual BTLs will set this as needed */
 139     opal_cr_continue_like_restart = false;
 140     orte_cr_flush_restart_files   = true;
 141 
 142  cleanup:
 143 
 144     return exit_status;
 145 }
 146 
 147 /*
 148  * Finalize
 149  */
 150 int orte_cr_finalize(void)
 151 {
 152     opal_output_verbose(10, orte_cr_output,
 153                         "orte_cr: finalize: orte_cr_finalize()");
 154 
 155     orte_cr_entry_point_finalize();
 156 
 157     /*
 158      * OPAL Frameworks...
 159      */
 160     opal_cr_finalize();
 161 
 162     return ORTE_SUCCESS;
 163 }
 164 
 165 /*
 166  * Interlayer coordination callback
 167  */
 168 int orte_cr_coord(int state)
 169 {
 170     int ret, exit_status = ORTE_SUCCESS;
 171 
 172     opal_output_verbose(10, orte_cr_output,
 173                         "orte_cr: coord: orte_cr_coord(%s)",
 174                         opal_crs_base_state_str((opal_crs_state_type_t)state));
 175 
 176     /*
 177      * Before calling the previous callback, we have the opportunity to
 178      * take action given the state.
 179      */
 180     if(OPAL_CRS_CHECKPOINT == state) {
 181         /* Do Checkpoint Phase work */
 182         orte_cr_coord_pre_ckpt();
 183     }
 184     else if (OPAL_CRS_CONTINUE == state ) {
 185         /* Do Continue Phase work */
 186         orte_cr_coord_pre_continue();
 187     }
 188     else if (OPAL_CRS_RESTART == state ) {
 189         /* Do Restart Phase work */
 190         orte_cr_coord_pre_restart();
 191     }
 192     else if (OPAL_CRS_TERM == state ) {
 193         /* Do Continue Phase work in prep to terminate the application */
 194     }
 195     else {
 196         /* We must have been in an error state from the checkpoint
 197          * recreate everything, as in the Continue Phase
 198          */
 199     }
 200 
 201     /*
 202      * Call the previous callback, which should be OPAL
 203      */
 204     if(OPAL_SUCCESS != (ret = prev_coord_callback(state)) ) {
 205         exit_status = ret;
 206         goto cleanup;
 207     }
 208 
 209 
 210     /*
 211      * After calling the previous callback, we have the opportunity to
 212      * take action given the state to tidy up.
 213      */
 214     if(OPAL_CRS_CHECKPOINT == state) {
 215         /* Do Checkpoint Phase work */
 216         orte_cr_coord_post_ckpt();
 217     }
 218     else if (OPAL_CRS_CONTINUE == state ) {
 219         /* Do Continue Phase work */
 220         orte_cr_coord_post_continue();
 221     }
 222     else if (OPAL_CRS_RESTART == state ) {
 223         /* Do Restart Phase work */
 224         orte_cr_coord_post_restart();
 225     }
 226     else if (OPAL_CRS_TERM == state ) {
 227         /* Do Continue Phase work in prep to terminate the application */
 228     }
 229     else {
 230         /* We must have been in an error state from the checkpoint
 231          * recreate everything, as in the Continue Phase
 232          */
 233     }
 234 
 235  cleanup:
 236     return exit_status;
 237 }
 238 
 239 /*************
 240  * Pre Lower Layer
 241  *************/
 242 static int orte_cr_coord_pre_ckpt(void) {
 243     int ret, exit_status = ORTE_SUCCESS;
 244 
 245     /*
 246      * All the checkpoint heavey lifting in here...
 247      */
 248     opal_output_verbose(10, orte_cr_output,
 249                         "orte_cr: coord_pre_ckpt: orte_cr_coord_pre_ckpt()");
 250 
 251     /*
 252      * Notify the ESS
 253      */
 254     if( NULL != orte_ess.ft_event ) {
 255         if( ORTE_SUCCESS != (ret = orte_ess.ft_event(OPAL_CRS_CHECKPOINT))) {
 256             exit_status = ret;
 257             goto cleanup;
 258         }
 259     }
 260 
 261  cleanup:
 262     return exit_status;
 263 }
 264 
 265 static int orte_cr_coord_pre_restart(void) {
 266     /*
 267      * Can not really do much until OPAL is up and running,
 268      * so defer action until the post_restart function.
 269      */
 270     opal_output_verbose(10, orte_cr_output,
 271                         "orte_cr: coord_pre_restart: orte_cr_coord_pre_restart()");
 272 
 273     return ORTE_SUCCESS;
 274 }
 275 
 276 static int orte_cr_coord_pre_continue(void) {
 277     /*
 278      * Can not really do much until OPAL is up and running,
 279      * so defer action until the post_continue function.
 280      */
 281     opal_output_verbose(10, orte_cr_output,
 282                         "orte_cr: coord_pre_continue: orte_cr_coord_pre_continue()");
 283 
 284     return ORTE_SUCCESS;
 285 }
 286 
 287 /*************
 288  * Post Lower Layer
 289  *************/
 290 static int orte_cr_coord_post_ckpt(void) {
 291     /*
 292      * Now that OPAL is shutdown, we really can't do much
 293      * so assume pre_ckpt took care of everything.
 294      */
 295     opal_output_verbose(10, orte_cr_output,
 296                         "orte_cr: coord_post_ckpt: orte_cr_coord_post_ckpt()");
 297 
 298     return ORTE_SUCCESS;
 299 }
 300 
 301 static int orte_cr_coord_post_restart(void) {
 302     int ret, exit_status = ORTE_SUCCESS;
 303     orte_proc_type_t prev_type = ORTE_PROC_TYPE_NONE;
 304     char * tmp_dir = NULL;
 305 
 306     opal_output_verbose(10, orte_cr_output,
 307                         "orte_cr: coord_post_restart: orte_cr_coord_post_restart()");
 308 
 309     /*
 310      * Add the previous session directory for cleanup
 311      */
 312     opal_crs_base_cleanup_append(orte_process_info.job_session_dir, true);
 313     tmp_dir = orte_process_info.jobfam_session_dir;
 314     if( NULL != tmp_dir ) {
 315         opal_crs_base_cleanup_append(tmp_dir, true);
 316         free(tmp_dir);
 317         tmp_dir = NULL;
 318     }
 319 
 320     /*
 321      * Refresh System information
 322      */
 323     prev_type = orte_process_info.proc_type;
 324     if( ORTE_SUCCESS != (ret = orte_proc_info_finalize()) ) {
 325         exit_status = ret;
 326     }
 327 
 328     if( NULL != orte_process_info.my_hnp_uri ) {
 329         free(orte_process_info.my_hnp_uri);
 330         orte_process_info.my_hnp_uri = NULL;
 331     }
 332 
 333     if( NULL != orte_process_info.my_daemon_uri ) {
 334         free(orte_process_info.my_daemon_uri);
 335         orte_process_info.my_daemon_uri = NULL;
 336     }
 337 
 338     if( ORTE_SUCCESS != (ret = orte_proc_info()) ) {
 339         exit_status = ret;
 340     }
 341 
 342     orte_process_info.proc_type = prev_type;
 343     orte_process_info.my_name = *ORTE_NAME_INVALID;
 344 
 345     /*
 346      * Notify the ESS
 347      */
 348     if( NULL != orte_ess.ft_event ) {
 349         if( ORTE_SUCCESS != (ret = orte_ess.ft_event(OPAL_CRS_RESTART))) {
 350             exit_status = ret;
 351             goto cleanup;
 352         }
 353     }
 354 
 355  cleanup:
 356     return exit_status;
 357 }
 358 
 359 static int orte_cr_coord_post_continue(void) {
 360     int ret, exit_status = ORTE_SUCCESS;
 361 
 362     opal_output_verbose(10, orte_cr_output,
 363                         "orte_cr: coord_post_continue: orte_cr_coord_post_continue()\n");
 364 
 365     /*
 366      * Notify the ESS
 367      */
 368     if( NULL != orte_ess.ft_event ) {
 369         if( ORTE_SUCCESS != (ret = orte_ess.ft_event(OPAL_CRS_CONTINUE))) {
 370             exit_status = ret;
 371             goto cleanup;
 372         }
 373     }
 374 
 375  cleanup:
 376 
 377     return exit_status;
 378 }
 379 
 380 /*************************************************
 381  * ORTE Entry Point functionality
 382  *************************************************/
 383 int orte_cr_entry_point_init(void)
 384 {
 385 #if 0
 386     /* JJH XXX
 387      * Make sure to finalize the OPAL Entry Point function if it is active.
 388      */
 389     opal_cr_entry_point_finalize();
 390 #endif
 391 
 392     return ORTE_SUCCESS;
 393 }
 394 
 395 int orte_cr_entry_point_finalize(void)
 396 {
 397     /* Nothing to do here... */
 398     return ORTE_SUCCESS;
 399 }
 400 

/* [<][>][^][v][top][bottom][index][help] */