This source file includes the following definitions.
- orte_cr_init
- orte_cr_finalize
- orte_cr_coord
- orte_cr_coord_pre_ckpt
- orte_cr_coord_pre_restart
- orte_cr_coord_pre_continue
- orte_cr_coord_post_ckpt
- orte_cr_coord_post_restart
- orte_cr_coord_post_continue
- orte_cr_entry_point_init
- orte_cr_entry_point_finalize
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 #include "orte_config.h"
  28 #include "orte/constants.h"
  29 
  30 #include <errno.h>
  31 #ifdef HAVE_UNISTD_H
  32 #include <unistd.h>
  33 #endif  
  34 #ifdef HAVE_FCNTL_H
  35 #include <fcntl.h>
  36 #endif  
  37 #ifdef HAVE_SYS_TYPES_H
  38 #include <sys/types.h>
  39 #endif  
  40 #ifdef HAVE_SYS_STAT_H
  41 #include <sys/stat.h>  
  42 #endif  
  43 
  44 #include "opal/util/opal_environ.h"
  45 #include "opal/util/output.h"
  46 #include "opal/util/basename.h"
  47 #include "opal/mca/event/event.h"
  48 #include "opal/mca/crs/crs.h"
  49 #include "opal/mca/crs/base/base.h"
  50 #include "opal/runtime/opal_cr.h"
  51 
  52 #include "orte/runtime/orte_cr.h"
  53 #include "orte/util/proc_info.h"
  54 #include "orte/runtime/orte_globals.h"
  55 
  56 #include "orte/mca/plm/base/base.h"
  57 #include "orte/mca/ess/ess.h"
  58 #include "orte/mca/ess/base/base.h"
  59 #include "orte/mca/routed/base/base.h"
  60 #include "orte/mca/routed/routed.h"
  61 #include "orte/mca/rml/base/base.h"
  62 #include "orte/mca/iof/base/base.h"
  63 #include "orte/mca/snapc/snapc.h"
  64 #include "orte/mca/snapc/base/base.h"
  65 #include "orte/mca/filem/base/base.h"
  66 
  67 
  68 
  69 
  70 static int orte_cr_coord_pre_ckpt(void);
  71 static int orte_cr_coord_pre_restart(void);
  72 static int orte_cr_coord_pre_continue(void);
  73 
  74 static int orte_cr_coord_post_ckpt(void);
  75 static int orte_cr_coord_post_restart(void);
  76 static int orte_cr_coord_post_continue(void);
  77 
  78 bool orte_cr_flush_restart_files = true;
  79 
  80 
  81 
  82 
  83 static opal_cr_coord_callback_fn_t  prev_coord_callback = NULL;
  84 
  85 static int orte_cr_output = -1;
  86 static int orte_cr_verbose = 0;
  87 
  88 
  89 
  90 
  91 int orte_cr_init(void)
  92 {
  93     int ret, exit_status = ORTE_SUCCESS;
  94 
  95     
  96 
  97 
  98     if (OPAL_SUCCESS != (ret = opal_cr_init() ) ) {
  99         exit_status = ret;
 100         goto cleanup;
 101     }
 102 
 103     
 104 
 105 
 106     orte_cr_verbose = 0;
 107     (void) mca_base_var_register ("orte", "orte_cr", NULL, "verbose",
 108                                   "Verbose output for the ORTE Checkpoint/Restart functionality",
 109                                   MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 110                                   OPAL_INFO_LVL_8,
 111                                   MCA_BASE_VAR_SCOPE_READONLY,
 112                                   &orte_cr_verbose);
 113 
 114     
 115 
 116 
 117 
 118 
 119     if(0 != orte_cr_verbose) {
 120         orte_cr_output = opal_output_open(NULL);
 121         opal_output_set_verbosity(orte_cr_output, orte_cr_verbose);
 122     } else {
 123         orte_cr_output = opal_cr_output;
 124     }
 125 
 126     opal_output_verbose(10, orte_cr_output,
 127                         "orte_cr: init: orte_cr_init()\n");
 128 
 129     
 130     if( ORTE_SUCCESS != (ret = orte_cr_entry_point_init()) ) {
 131         exit_status = ret;
 132         goto cleanup;
 133     }
 134 
 135     
 136     opal_cr_reg_coord_callback(orte_cr_coord, &prev_coord_callback);
 137 
 138     
 139     opal_cr_continue_like_restart = false;
 140     orte_cr_flush_restart_files   = true;
 141 
 142  cleanup:
 143 
 144     return exit_status;
 145 }
 146 
 147 
 148 
 149 
 150 int orte_cr_finalize(void)
 151 {
 152     opal_output_verbose(10, orte_cr_output,
 153                         "orte_cr: finalize: orte_cr_finalize()");
 154 
 155     orte_cr_entry_point_finalize();
 156 
 157     
 158 
 159 
 160     opal_cr_finalize();
 161 
 162     return ORTE_SUCCESS;
 163 }
 164 
 165 
 166 
 167 
 168 int orte_cr_coord(int state)
 169 {
 170     int ret, exit_status = ORTE_SUCCESS;
 171 
 172     opal_output_verbose(10, orte_cr_output,
 173                         "orte_cr: coord: orte_cr_coord(%s)",
 174                         opal_crs_base_state_str((opal_crs_state_type_t)state));
 175 
 176     
 177 
 178 
 179 
 180     if(OPAL_CRS_CHECKPOINT == state) {
 181         
 182         orte_cr_coord_pre_ckpt();
 183     }
 184     else if (OPAL_CRS_CONTINUE == state ) {
 185         
 186         orte_cr_coord_pre_continue();
 187     }
 188     else if (OPAL_CRS_RESTART == state ) {
 189         
 190         orte_cr_coord_pre_restart();
 191     }
 192     else if (OPAL_CRS_TERM == state ) {
 193         
 194     }
 195     else {
 196         
 197 
 198 
 199     }
 200 
 201     
 202 
 203 
 204     if(OPAL_SUCCESS != (ret = prev_coord_callback(state)) ) {
 205         exit_status = ret;
 206         goto cleanup;
 207     }
 208 
 209 
 210     
 211 
 212 
 213 
 214     if(OPAL_CRS_CHECKPOINT == state) {
 215         
 216         orte_cr_coord_post_ckpt();
 217     }
 218     else if (OPAL_CRS_CONTINUE == state ) {
 219         
 220         orte_cr_coord_post_continue();
 221     }
 222     else if (OPAL_CRS_RESTART == state ) {
 223         
 224         orte_cr_coord_post_restart();
 225     }
 226     else if (OPAL_CRS_TERM == state ) {
 227         
 228     }
 229     else {
 230         
 231 
 232 
 233     }
 234 
 235  cleanup:
 236     return exit_status;
 237 }
 238 
 239 
 240 
 241 
 242 static int orte_cr_coord_pre_ckpt(void) {
 243     int ret, exit_status = ORTE_SUCCESS;
 244 
 245     
 246 
 247 
 248     opal_output_verbose(10, orte_cr_output,
 249                         "orte_cr: coord_pre_ckpt: orte_cr_coord_pre_ckpt()");
 250 
 251     
 252 
 253 
 254     if( NULL != orte_ess.ft_event ) {
 255         if( ORTE_SUCCESS != (ret = orte_ess.ft_event(OPAL_CRS_CHECKPOINT))) {
 256             exit_status = ret;
 257             goto cleanup;
 258         }
 259     }
 260 
 261  cleanup:
 262     return exit_status;
 263 }
 264 
 265 static int orte_cr_coord_pre_restart(void) {
 266     
 267 
 268 
 269 
 270     opal_output_verbose(10, orte_cr_output,
 271                         "orte_cr: coord_pre_restart: orte_cr_coord_pre_restart()");
 272 
 273     return ORTE_SUCCESS;
 274 }
 275 
 276 static int orte_cr_coord_pre_continue(void) {
 277     
 278 
 279 
 280 
 281     opal_output_verbose(10, orte_cr_output,
 282                         "orte_cr: coord_pre_continue: orte_cr_coord_pre_continue()");
 283 
 284     return ORTE_SUCCESS;
 285 }
 286 
 287 
 288 
 289 
 290 static int orte_cr_coord_post_ckpt(void) {
 291     
 292 
 293 
 294 
 295     opal_output_verbose(10, orte_cr_output,
 296                         "orte_cr: coord_post_ckpt: orte_cr_coord_post_ckpt()");
 297 
 298     return ORTE_SUCCESS;
 299 }
 300 
 301 static int orte_cr_coord_post_restart(void) {
 302     int ret, exit_status = ORTE_SUCCESS;
 303     orte_proc_type_t prev_type = ORTE_PROC_TYPE_NONE;
 304     char * tmp_dir = NULL;
 305 
 306     opal_output_verbose(10, orte_cr_output,
 307                         "orte_cr: coord_post_restart: orte_cr_coord_post_restart()");
 308 
 309     
 310 
 311 
 312     opal_crs_base_cleanup_append(orte_process_info.job_session_dir, true);
 313     tmp_dir = orte_process_info.jobfam_session_dir;
 314     if( NULL != tmp_dir ) {
 315         opal_crs_base_cleanup_append(tmp_dir, true);
 316         free(tmp_dir);
 317         tmp_dir = NULL;
 318     }
 319 
 320     
 321 
 322 
 323     prev_type = orte_process_info.proc_type;
 324     if( ORTE_SUCCESS != (ret = orte_proc_info_finalize()) ) {
 325         exit_status = ret;
 326     }
 327 
 328     if( NULL != orte_process_info.my_hnp_uri ) {
 329         free(orte_process_info.my_hnp_uri);
 330         orte_process_info.my_hnp_uri = NULL;
 331     }
 332 
 333     if( NULL != orte_process_info.my_daemon_uri ) {
 334         free(orte_process_info.my_daemon_uri);
 335         orte_process_info.my_daemon_uri = NULL;
 336     }
 337 
 338     if( ORTE_SUCCESS != (ret = orte_proc_info()) ) {
 339         exit_status = ret;
 340     }
 341 
 342     orte_process_info.proc_type = prev_type;
 343     orte_process_info.my_name = *ORTE_NAME_INVALID;
 344 
 345     
 346 
 347 
 348     if( NULL != orte_ess.ft_event ) {
 349         if( ORTE_SUCCESS != (ret = orte_ess.ft_event(OPAL_CRS_RESTART))) {
 350             exit_status = ret;
 351             goto cleanup;
 352         }
 353     }
 354 
 355  cleanup:
 356     return exit_status;
 357 }
 358 
 359 static int orte_cr_coord_post_continue(void) {
 360     int ret, exit_status = ORTE_SUCCESS;
 361 
 362     opal_output_verbose(10, orte_cr_output,
 363                         "orte_cr: coord_post_continue: orte_cr_coord_post_continue()\n");
 364 
 365     
 366 
 367 
 368     if( NULL != orte_ess.ft_event ) {
 369         if( ORTE_SUCCESS != (ret = orte_ess.ft_event(OPAL_CRS_CONTINUE))) {
 370             exit_status = ret;
 371             goto cleanup;
 372         }
 373     }
 374 
 375  cleanup:
 376 
 377     return exit_status;
 378 }
 379 
 380 
 381 
 382 
 383 int orte_cr_entry_point_init(void)
 384 {
 385 #if 0
 386     
 387 
 388 
 389     opal_cr_entry_point_finalize();
 390 #endif
 391 
 392     return ORTE_SUCCESS;
 393 }
 394 
 395 int orte_cr_entry_point_finalize(void)
 396 {
 397     
 398     return ORTE_SUCCESS;
 399 }
 400