root/orte/mca/snapc/base/snapc_base_fns.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. orte_snapc_base_local_snapshot_construct
  2. orte_snapc_base_local_snapshot_destruct
  3. orte_snapc_base_global_snapshot_construct
  4. orte_snapc_base_global_snapshot_destruct
  5. orte_snapc_base_quiesce_construct
  6. orte_snapc_base_quiesce_destruct
  7. orte_snapc_base_request_op_construct
  8. orte_snapc_base_request_op_destruct
  9. orte_snapc_base_none_open
  10. orte_snapc_base_none_close
  11. orte_snapc_base_none_query
  12. orte_snapc_base_module_init
  13. orte_snapc_base_module_finalize
  14. orte_snapc_base_none_setup_job
  15. orte_snapc_base_none_release_job
  16. orte_snapc_base_none_ft_event
  17. orte_snapc_base_none_start_ckpt
  18. orte_snapc_base_none_end_ckpt
  19. snapc_none_global_cmdline_request
  20. orte_snapc_ckpt_state_notify
  21. orte_snapc_base_global_coord_ckpt_init_cmd
  22. orte_snapc_base_unpack_options
  23. orte_snapc_base_pack_options
  24. orte_snapc_base_global_coord_ckpt_update_cmd
  25. orte_snapc_ckpt_state_str

   1 /*
   2  * Copyright (c) 2004-2010 The Trustees of Indiana University.
   3  *                         All rights reserved.
   4  * Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
   5  *                         All rights reserved.
   6  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   7  *                         University of Stuttgart.  All rights reserved.
   8  * Copyright (c) 2004-2005 The Regents of the University of California.
   9  *                         All rights reserved.
  10  * Copyright (c) 2007      Evergrid, Inc. All rights reserved.
  11  * Copyright (c) 2013      Cisco Systems, Inc.  All rights reserved.
  12  * Copyright (c) 2014-2019 Intel, Inc.  All rights reserved.
  13  * $COPYRIGHT$
  14  *
  15  * Additional copyrights may follow
  16  *
  17  * $HEADER$
  18  */
  19 
  20 #include "orte_config.h"
  21 
  22 #include <string.h>
  23 #ifdef HAVE_SYS_TYPES_H
  24 #include <sys/types.h>
  25 #endif  /* HAVE_SYS_TYPES_H */
  26 #ifdef HAVE_UNISTD_H
  27 #include <unistd.h>
  28 #endif  /* HAVE_UNISTD_H */
  29 #ifdef HAVE_SYS_TYPES_H
  30 #include <sys/types.h>
  31 #endif /* HAVE_SYS_TYPES_H */
  32 #ifdef HAVE_SYS_STAT_H
  33 #include <sys/stat.h>
  34 #endif /* HAVE_SYS_STAT_H */
  35 #ifdef HAVE_DIRENT_H
  36 #include <dirent.h>
  37 #endif /* HAVE_DIRENT_H */
  38 #include <time.h>
  39 
  40 #include "orte/mca/mca.h"
  41 #include "opal/mca/base/base.h"
  42 
  43 #include "opal/util/os_dirpath.h"
  44 #include "opal/util/output.h"
  45 #include "opal/util/show_help.h"
  46 #include "opal/util/basename.h"
  47 #include "opal/util/argv.h"
  48 #include "opal/mca/crs/crs.h"
  49 #include "opal/mca/crs/base/base.h"
  50 #include "opal/dss/dss.h"
  51 
  52 #include "orte/mca/rml/rml.h"
  53 #include "orte/mca/rml/rml_types.h"
  54 #include "orte/mca/errmgr/errmgr.h"
  55 #include "orte/runtime/orte_globals.h"
  56 #include "orte/util/name_fns.h"
  57 
  58 #include "orte/mca/sstore/sstore.h"
  59 #include "orte/mca/sstore/base/base.h"
  60 
  61 #include "orte/mca/snapc/snapc.h"
  62 #include "orte/mca/snapc/base/base.h"
  63 
/******************
 * Global state
 ******************/
/* Presumably the running snapshot sequence counter (name-based guess --
 * TODO confirm against its users).  It starts at 0 here;
 * orte_snapc_base_none_setup_job() in this file assigns -1 to it, which
 * wraps to the largest size_t value because the type is unsigned. */
size_t orte_snapc_base_snapshot_seq_number = 0;
  68 
  69 /******************
  70  * Object stuff
  71  ******************/
  72 OBJ_CLASS_INSTANCE(orte_snapc_base_local_snapshot_t,
  73                    opal_list_item_t,
  74                    orte_snapc_base_local_snapshot_construct,
  75                    orte_snapc_base_local_snapshot_destruct);
  76 
  77 void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *snapshot)
  78 {
  79     snapshot->process_name.jobid  = 0;
  80     snapshot->process_name.vpid   = 0;
  81 
  82     snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
  83 
  84     snapshot->ss_handle  = ORTE_SSTORE_HANDLE_INVALID;
  85 }
  86 
  87 void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *snapshot)
  88 {
  89     snapshot->process_name.jobid  = 0;
  90     snapshot->process_name.vpid   = 0;
  91 
  92     snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
  93 
  94     snapshot->ss_handle  = ORTE_SSTORE_HANDLE_INVALID;
  95 }
  96 
  97 /****/
  98 OBJ_CLASS_INSTANCE(orte_snapc_base_global_snapshot_t,
  99                    opal_list_item_t,
 100                    orte_snapc_base_global_snapshot_construct,
 101                    orte_snapc_base_global_snapshot_destruct);
 102 
 103 void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *snapshot)
 104 {
 105     OBJ_CONSTRUCT(&(snapshot->local_snapshots), opal_list_t);
 106 
 107     snapshot->options = OBJ_NEW(opal_crs_base_ckpt_options_t);
 108 
 109     snapshot->ss_handle  = ORTE_SSTORE_HANDLE_INVALID;
 110 }
 111 
 112 void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *snapshot)
 113 {
 114     opal_list_item_t* item = NULL;
 115 
 116     while (NULL != (item = opal_list_remove_first(&snapshot->local_snapshots))) {
 117         OBJ_RELEASE(item);
 118     }
 119     OBJ_DESTRUCT(&(snapshot->local_snapshots));
 120 
 121     if( NULL != snapshot->options ) {
 122         OBJ_RELEASE(snapshot->options);
 123         snapshot->options = NULL;
 124     }
 125 
 126     snapshot->ss_handle  = ORTE_SSTORE_HANDLE_INVALID;
 127 }
 128 
 129 OBJ_CLASS_INSTANCE(orte_snapc_base_quiesce_t,
 130                    opal_object_t,
 131                    orte_snapc_base_quiesce_construct,
 132                    orte_snapc_base_quiesce_destruct);
 133 
 134 void orte_snapc_base_quiesce_construct(orte_snapc_base_quiesce_t *quiesce)
 135 {
 136     quiesce->epoch         = -1;
 137     quiesce->snapshot      = NULL;
 138     quiesce->ss_handle     = ORTE_SSTORE_HANDLE_INVALID;
 139     quiesce->ss_snapshot   = NULL;
 140     quiesce->handle        = NULL;
 141     quiesce->target_dir    = NULL;
 142     quiesce->crs_name      = NULL;
 143     quiesce->cmdline       = NULL;
 144     quiesce->cr_state      = OPAL_CRS_NONE;
 145     quiesce->checkpointing = false;
 146     quiesce->restarting    = false;
 147 
 148     quiesce->migrating     = false;
 149     quiesce->num_migrating = 0;
 150     OBJ_CONSTRUCT(&(quiesce->migrating_procs), opal_pointer_array_t);
 151     opal_pointer_array_init(&(quiesce->migrating_procs), 8, INT32_MAX, 8);
 152 }
 153 
 154 void orte_snapc_base_quiesce_destruct( orte_snapc_base_quiesce_t *quiesce)
 155 {
 156     int i;
 157     void *item = NULL;
 158 
 159     quiesce->epoch = -1;
 160 
 161     if( NULL != quiesce->snapshot ) {
 162         OBJ_RELEASE(quiesce->snapshot);
 163         quiesce->snapshot      = NULL;
 164     }
 165 
 166     quiesce->ss_handle     = ORTE_SSTORE_HANDLE_INVALID;
 167     if( NULL != quiesce->ss_snapshot ) {
 168         OBJ_RELEASE(quiesce->ss_snapshot);
 169         quiesce->ss_snapshot   = NULL;
 170     }
 171 
 172     if( NULL != quiesce->handle ) {
 173         free(quiesce->handle);
 174         quiesce->handle = NULL;
 175     }
 176     if( NULL != quiesce->target_dir ) {
 177         free(quiesce->target_dir);
 178         quiesce->target_dir = NULL;
 179     }
 180     if( NULL != quiesce->crs_name ) {
 181         free(quiesce->crs_name);
 182         quiesce->crs_name = NULL;
 183     }
 184     if( NULL != quiesce->cmdline ) {
 185         free(quiesce->cmdline);
 186         quiesce->cmdline = NULL;
 187     }
 188 
 189     quiesce->cr_state      = OPAL_CRS_NONE;
 190     quiesce->checkpointing = false;
 191     quiesce->restarting    = false;
 192 
 193     quiesce->migrating     = false;
 194     quiesce->num_migrating = 0;
 195     for( i = 0; i < quiesce->migrating_procs.size; ++i) {
 196         item = opal_pointer_array_get_item(&(quiesce->migrating_procs), i);
 197         if( NULL != item ) {
 198             OBJ_RELEASE(item);
 199         }
 200     }
 201     OBJ_DESTRUCT(&(quiesce->migrating_procs));
 202 }
 203 
 204 OBJ_CLASS_INSTANCE(orte_snapc_base_request_op_t,
 205                    opal_object_t,
 206                    orte_snapc_base_request_op_construct,
 207                    orte_snapc_base_request_op_destruct);
 208 
 209 void orte_snapc_base_request_op_construct(orte_snapc_base_request_op_t *op)
 210 {
 211     op->event     = ORTE_SNAPC_OP_NONE;
 212     op->is_active = false;
 213     op->leader    = -1;
 214 
 215     op->seq_num       = -1;
 216     op->global_handle = NULL;
 217     op->ss_handle     = ORTE_SSTORE_HANDLE_INVALID;
 218 
 219     op->mig_num       = -1;
 220     op->mig_vpids     = NULL;
 221     /*op->mig_host_pref = NULL;*/
 222     op->mig_vpid_pref = NULL;
 223     op->mig_off_node  = NULL;
 224 }
 225 
 226 void orte_snapc_base_request_op_destruct( orte_snapc_base_request_op_t *op)
 227 {
 228     op->event     = ORTE_SNAPC_OP_NONE;
 229     op->is_active = false;
 230     op->leader    = -1;
 231 
 232     op->seq_num       = -1;
 233     if(NULL != op->global_handle ) {
 234         free(op->global_handle);
 235         op->global_handle = NULL;
 236     }
 237 
 238     op->ss_handle     = ORTE_SSTORE_HANDLE_INVALID;
 239 
 240     op->mig_num       = -1;
 241     /*
 242     if( NULL != op->mig_vpids ) {
 243         free( op->mig_vpids );
 244         op->mig_vpids = NULL;
 245     }
 246 
 247     if( NULL != op->mig_host_pref ) {
 248         free( op->mig_host_pref );
 249         op->mig_host_pref = NULL;
 250     }
 251 
 252     if( NULL != op->mig_vpid_pref ) {
 253         free( op->mig_vpid_pref );
 254         op->mig_vpid_pref = NULL;
 255     }
 256 
 257     if( NULL != op->mig_off_node ) {
 258         free( op->mig_off_node );
 259         op->mig_off_node = NULL;
 260     }
 261     */
 262 }
 263 
 264 
 265 /***********************
 266  * None component stuff
 267  ************************/
 268 int orte_snapc_base_none_open(void)
 269 {
 270     return ORTE_SUCCESS;
 271 }
 272 
 273 int orte_snapc_base_none_close(void)
 274 {
 275     return ORTE_SUCCESS;
 276 }
 277 
 278 int orte_snapc_base_none_query(mca_base_module_t **module, int *priority)
 279 {
 280     *module = NULL;
 281     *priority = 0;
 282 
 283     return OPAL_SUCCESS;
 284 }
 285 
 286 int orte_snapc_base_module_init(bool seed, bool app)
 287 {
 288     return ORTE_SUCCESS;
 289 }
 290 
 291 int orte_snapc_base_module_finalize(void)
 292 {
 293     return ORTE_SUCCESS;
 294 }
 295 
 296 /* None RML command line response callback */
 297 static void snapc_none_global_cmdline_request(int status,
 298                                               orte_process_name_t* sender,
 299                                               opal_buffer_t *buffer,
 300                                               orte_rml_tag_t tag,
 301                                               void* cbdata);
 302 int orte_snapc_base_none_setup_job(orte_jobid_t jobid)
 303 {
 304 
 305     /*
 306      * Coordinator command listener
 307      */
 308     orte_snapc_base_snapshot_seq_number = -1;
 309     orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
 310                             ORTE_RML_TAG_CKPT,
 311                             ORTE_RML_PERSISTENT,
 312                             snapc_none_global_cmdline_request,
 313                             NULL);
 314 
 315     return ORTE_SUCCESS;
 316 }
 317 
 318 int orte_snapc_base_none_release_job(orte_jobid_t jobid)
 319 {
 320     /*
 321      * Remove the checkpoint request callback
 322      */
 323 
 324     return ORTE_SUCCESS;
 325 }
 326 
 327 int orte_snapc_base_none_ft_event(int state)
 328 {
 329     return ORTE_SUCCESS;
 330 }
 331 
 332 int orte_snapc_base_none_start_ckpt(orte_snapc_base_quiesce_t *datum)
 333 {
 334     return ORTE_SUCCESS;
 335 }
 336 
 337 int orte_snapc_base_none_end_ckpt(orte_snapc_base_quiesce_t *datum)
 338 {
 339     return ORTE_SUCCESS;
 340 }
 341 
 342 
 343 /********************
 344  * Local Functions
 345  ********************/
 346 /* None RML response callback */
 347 static void snapc_none_global_cmdline_request(int status,
 348                                               orte_process_name_t* sender,
 349                                               opal_buffer_t *buffer,
 350                                               orte_rml_tag_t tag,
 351                                               void* cbdata)
 352 {
 353     int ret;
 354     orte_snapc_cmd_flag_t command;
 355     orte_std_cntr_t n = 1;
 356     opal_crs_base_ckpt_options_t *options = NULL;
 357     orte_jobid_t jobid;
 358 
 359     options = OBJ_NEW(opal_crs_base_ckpt_options_t);
 360 
 361     n = 1;
 362     if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_SNAPC_CMD))) {
 363         ORTE_ERROR_LOG(ret);
 364         goto cleanup;
 365     }
 366 
 367     /*
 368      * orte_checkpoint has requested that a checkpoint be taken
 369      * Respond that a checkpoint cannot be taken at this time
 370      */
 371     if (ORTE_SNAPC_GLOBAL_INIT_CMD == command) {
 372         /*
 373          * Do the basic handshake with the orte_checkpoint command
 374          */
 375         if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_init_cmd(sender, buffer, options, &jobid)) ) {
 376             ORTE_ERROR_LOG(ret);
 377             goto cleanup;
 378         }
 379 
 380         /*
 381          * Respond with an invalid response
 382          */
 383         if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, 0, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) {
 384             ORTE_ERROR_LOG(ret);
 385             goto cleanup;
 386         }
 387     }
 388     /*
 389      * Unknown command
 390      */
 391     else {
 392         ORTE_ERROR_LOG(ret);
 393         goto cleanup;
 394     }
 395 
 396  cleanup:
 397     if( NULL != options ) {
 398         OBJ_RELEASE(options);
 399         options = NULL;
 400     }
 401 
 402     return;
 403 }
 404 
 405 /********************
 406  * Utility functions
 407  ********************/
 408 
 409 /* Report the checkpoint status */
 410 void orte_snapc_ckpt_state_notify(int state)
 411 {
 412     switch(state) {
 413     case ORTE_SNAPC_CKPT_STATE_ESTABLISHED:
 414         opal_output(0, "%d: Checkpoint established for process %s.",
 415                     orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
 416         break;
 417     case ORTE_SNAPC_CKPT_STATE_NO_CKPT:
 418         opal_output(0, "%d: Process %s is not checkpointable.",
 419                     orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
 420         break;
 421     case ORTE_SNAPC_CKPT_STATE_ERROR:
 422         opal_output(0, "%d: Failed to checkpoint process %s.",
 423                     orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
 424         break;
 425     case ORTE_SNAPC_CKPT_STATE_RECOVERED:
 426         opal_output(0, "%d: Successfully restarted process %s.",
 427                     orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
 428         break;
 429     case ORTE_SNAPC_CKPT_STATE_NO_RESTART:
 430         opal_output(0, "%d: Failed to restart process %s.",
 431                     orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
 432         break;
 433     /* ADK: We currently do not notify for these states, but good to
 434      * have them around anyways. */
 435     case ORTE_SNAPC_CKPT_STATE_NONE:
 436     case ORTE_SNAPC_CKPT_STATE_REQUEST:
 437     case ORTE_SNAPC_CKPT_STATE_PENDING:
 438     case ORTE_SNAPC_CKPT_STATE_RUNNING:
 439     case ORTE_SNAPC_CKPT_STATE_STOPPED:
 440     case ORTE_SNAPC_CKPT_STATE_MIGRATING:
 441     case ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL:
 442     default:
 443         break;
 444     }
 445 }
 446 
 447 int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
 448                                                opal_buffer_t* buffer,
 449                                                opal_crs_base_ckpt_options_t *options,
 450                                                orte_jobid_t *jobid)
 451 {
 452     int ret, exit_status = ORTE_SUCCESS;
 453     orte_std_cntr_t count = 1;
 454     orte_ns_cmp_bitmask_t mask;
 455 
 456     mask = ORTE_NS_CMP_ALL;
 457 
 458     /*
 459      * Do not send to self, as that is silly.
 460      */
 461     if (OPAL_EQUAL ==
 462             orte_util_compare_name_fields(mask, peer, ORTE_PROC_MY_HNP)) {
 463         OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
 464                              "%s) base:ckpt_init_cmd: Error: Do not send to self!\n",
 465                              ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
 466         return ORTE_SUCCESS;
 467     }
 468 
 469     OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
 470                          "%s) base:ckpt_init_cmd: Receiving commands\n",
 471                          ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
 472 
 473     /********************
 474      * Receive command line checkpoint request:
 475      * - Command (already received)
 476      * - options
 477      * - jobid
 478      ********************/
 479     if( ORTE_SUCCESS != (ret = orte_snapc_base_unpack_options(buffer, options)) ) {
 480         opal_output(orte_snapc_base_framework.framework_output,
 481                     "%s) base:ckpt_init_cmd: Error: Unpack (options) Failure (ret = %d)\n",
 482                     ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
 483         ORTE_ERROR_LOG(ret);
 484         exit_status = ret;
 485         goto cleanup;
 486     }
 487 
 488     count = 1;
 489     if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, jobid, &count, ORTE_JOBID)) ) {
 490         opal_output(orte_snapc_base_framework.framework_output,
 491                     "%s) base:ckpt_init_cmd: Error: DSS Unpack (jobid) Failure (ret = %d) (LINE = %d)\n",
 492                     ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
 493                     ret, __LINE__);
 494         ORTE_ERROR_LOG(ret);
 495         exit_status = ret;
 496         goto cleanup;
 497     }
 498 
 499     OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
 500                          "%s) base:ckpt_init_cmd: Received [%d, %d, %s]\n",
 501                          ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
 502                          (int)(options->term),
 503                          (int)(options->stop),
 504                          ORTE_JOBID_PRINT(*jobid)));
 505 
 506  cleanup:
 507     return exit_status;
 508 }
 509 
 510 int orte_snapc_base_unpack_options(opal_buffer_t* buffer,
 511                                    opal_crs_base_ckpt_options_t *options)
 512 {
 513     int ret, exit_status = ORTE_SUCCESS;
 514     orte_std_cntr_t count = 1;
 515 
 516     count = 1;
 517     if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(options->term), &count, OPAL_BOOL)) ) {
 518         opal_output(orte_snapc_base_framework.framework_output,
 519                     "snapc:base:unpack_options: Error: Unpack (term) Failure (ret = %d)\n",
 520                     ret);
 521         ORTE_ERROR_LOG(ret);
 522         exit_status = ret;
 523         goto cleanup;
 524     }
 525 
 526     count = 1;
 527     if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(options->stop), &count, OPAL_BOOL)) ) {
 528         opal_output(orte_snapc_base_framework.framework_output,
 529                     "snapc:base:unpack_options: Error: Unpack (stop) Failure (ret = %d)\n",
 530                     ret);
 531         ORTE_ERROR_LOG(ret);
 532         exit_status = ret;
 533         goto cleanup;
 534     }
 535 
 536     count = 1;
 537     if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(options->inc_prep_only), &count, OPAL_BOOL)) ) {
 538         opal_output(orte_snapc_base_framework.framework_output,
 539                     "snapc:base:unpack_options: Error: Unpack (inc_prep_only) Failure (ret = %d)\n",
 540                     ret);
 541         ORTE_ERROR_LOG(ret);
 542         exit_status = ret;
 543         goto cleanup;
 544     }
 545 
 546     count = 1;
 547     if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(options->inc_recover_only), &count, OPAL_BOOL)) ) {
 548         opal_output(orte_snapc_base_framework.framework_output,
 549                     "snapc:base:unpack_options: Error: Unpack (inc_recover_only) Failure (ret = %d)\n",
 550                     ret);
 551         ORTE_ERROR_LOG(ret);
 552         exit_status = ret;
 553         goto cleanup;
 554     }
 555 
 556 #if OPAL_ENABLE_CRDEBUG == 1
 557     count = 1;
 558     if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(options->attach_debugger), &count, OPAL_BOOL)) ) {
 559         opal_output(orte_snapc_base_framework.framework_output,
 560                     "snapc:base:unpack_options: Error: Unpack (attach_debugger) Failure (ret = %d)\n",
 561                     ret);
 562         ORTE_ERROR_LOG(ret);
 563         exit_status = ret;
 564         goto cleanup;
 565     }
 566 
 567     count = 1;
 568     if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(options->detach_debugger), &count, OPAL_BOOL)) ) {
 569         opal_output(orte_snapc_base_framework.framework_output,
 570                     "snapc:base:unpack_options: Error: Unpack (detach_debugger) Failure (ret = %d)\n",
 571                     ret);
 572         ORTE_ERROR_LOG(ret);
 573         exit_status = ret;
 574         goto cleanup;
 575     }
 576 #endif
 577 
 578  cleanup:
 579     return exit_status;
 580 }
 581 
 582 int orte_snapc_base_pack_options(opal_buffer_t* buffer,
 583                                  opal_crs_base_ckpt_options_t *options)
 584 {
 585     int ret, exit_status = ORTE_SUCCESS;
 586 
 587     if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(options->term), 1, OPAL_BOOL))) {
 588         ORTE_ERROR_LOG(ret);
 589         exit_status = ret;
 590         goto cleanup;
 591     }
 592 
 593     if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(options->stop), 1, OPAL_BOOL))) {
 594         ORTE_ERROR_LOG(ret);
 595         exit_status = ret;
 596         goto cleanup;
 597     }
 598 
 599     if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(options->inc_prep_only), 1, OPAL_BOOL))) {
 600         ORTE_ERROR_LOG(ret);
 601         exit_status = ret;
 602         goto cleanup;
 603     }
 604 
 605     if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(options->inc_recover_only), 1, OPAL_BOOL))) {
 606         ORTE_ERROR_LOG(ret);
 607         exit_status = ret;
 608         goto cleanup;
 609     }
 610 
 611 #if OPAL_ENABLE_CRDEBUG == 1
 612     if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(options->attach_debugger), 1, OPAL_BOOL))) {
 613         ORTE_ERROR_LOG(ret);
 614         exit_status = ret;
 615         goto cleanup;
 616     }
 617 
 618     if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(options->detach_debugger), 1, OPAL_BOOL))) {
 619         ORTE_ERROR_LOG(ret);
 620         exit_status = ret;
 621         goto cleanup;
 622     }
 623 #endif
 624 
 625  cleanup:
 626     return exit_status;
 627 }
 628 
 629 int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
 630                                                  orte_sstore_base_handle_t ss_handle,
 631                                                  int ckpt_status)
 632 {
 633     int ret, exit_status = ORTE_SUCCESS;
 634     opal_buffer_t *loc_buffer = NULL;
 635     orte_snapc_cmd_flag_t command = ORTE_SNAPC_GLOBAL_UPDATE_CMD;
 636     char *global_snapshot_handle = NULL;
 637     char *tmp_str = NULL;
 638     int seq_num;
 639     orte_ns_cmp_bitmask_t mask;
 640 
 641     /*
 642      * Noop if invalid peer, or peer not specified (JJH Double check this)
 643      */
 644     if( NULL == peer ||
 645         OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer) ) {
 646         /*return ORTE_ERR_BAD_PARAM;*/
 647         return ORTE_SUCCESS;
 648     }
 649 
 650     mask = ORTE_NS_CMP_ALL;
 651 
 652     /*
 653      * Do not send to self, as that is silly.
 654      */
 655     if (OPAL_EQUAL == orte_util_compare_name_fields(mask, peer, ORTE_PROC_MY_HNP)) {
 656         OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
 657                              "%s) base:ckpt_update_cmd: Error: Do not send to self!\n",
 658                              ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
 659         return ORTE_SUCCESS;
 660     }
 661 
 662     /*
 663      * Pass on the checkpoint state.
 664      */
 665     orte_snapc_ckpt_state_notify(ckpt_status);
 666 
 667     OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
 668                          "%s) base:ckpt_update_cmd: Sending update command <status %d>\n",
 669                          ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
 670                          ckpt_status));
 671 
 672     /********************
 673      * Send over the status of the checkpoint
 674      * - ckpt_state
 675      * - global snapshot handle (upon finish only)
 676      * - sequence number        (upon finish only)
 677      ********************/
 678     if (NULL == (loc_buffer = OBJ_NEW(opal_buffer_t))) {
 679         exit_status = ORTE_ERROR;
 680         goto cleanup;
 681     }
 682 
 683     if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
 684         ORTE_ERROR_LOG(ret);
 685         exit_status = ret;
 686         OBJ_RELEASE(loc_buffer);
 687         goto cleanup;
 688     }
 689 
 690     if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &ckpt_status, 1, OPAL_INT))) {
 691         opal_output(orte_snapc_base_framework.framework_output,
 692                     "%s) base:ckpt_update_cmd: Error: DSS Pack (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
 693                     ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
 694                     ret, __LINE__);
 695         ORTE_ERROR_LOG(ret);
 696         exit_status = ret;
 697         OBJ_RELEASE(loc_buffer);
 698         goto cleanup;
 699     }
 700 
 701     if( ORTE_SNAPC_CKPT_STATE_RECOVERED == ckpt_status ||
 702         ORTE_SNAPC_CKPT_STATE_ESTABLISHED  == ckpt_status ||
 703         ORTE_SNAPC_CKPT_STATE_STOPPED   == ckpt_status ||
 704         ORTE_SNAPC_CKPT_STATE_ERROR     == ckpt_status ) {
 705 
 706         if( ORTE_SNAPC_CKPT_STATE_ERROR != ckpt_status ) {
 707             if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
 708                                                             SSTORE_METADATA_GLOBAL_SNAP_REF,
 709                                                             &global_snapshot_handle)) ) {
 710                 opal_output(orte_snapc_base_framework.framework_output,
 711                             "%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
 712                             ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
 713                 ORTE_ERROR_LOG(ret);
 714                 /* Do not exit here, continue so that we can inform the tool
 715                  * that the checkpoint has failed
 716                  */
 717             }
 718 
 719             if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
 720                                                             SSTORE_METADATA_GLOBAL_SNAP_SEQ,
 721                                                             &tmp_str)) ) {
 722                 opal_output(orte_snapc_base_framework.framework_output,
 723                             "%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
 724                             ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
 725                 ORTE_ERROR_LOG(ret);
 726                 /* Do not exit here, continue so that we can inform the tool
 727                  * that the checkpoint has failed
 728                  */
 729             }
 730 
 731             if( NULL != tmp_str ) {
 732                 seq_num = atoi(tmp_str);
 733             } else {
 734                 seq_num = -1;
 735             }
 736         } else {
 737             /* Checkpoint Error Case */
 738             global_snapshot_handle = NULL;
 739             seq_num = -1;
 740         }
 741 
 742         OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
 743                              "%s) base:ckpt_update_cmd: Sending update command <status %d> + <ref %s> <seq %d>\n",
 744                              ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
 745                              ckpt_status, global_snapshot_handle, seq_num));
 746 
 747         if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &global_snapshot_handle, 1, OPAL_STRING))) {
 748             opal_output(orte_snapc_base_framework.framework_output,
 749                         "%s) base:ckpt_update_cmd: Error: DSS Pack (snapshot handle) Failure (ret = %d) (LINE = %d)\n",
 750                         ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
 751                         ret, __LINE__);
 752             ORTE_ERROR_LOG(ret);
 753             exit_status = ret;
 754             OBJ_RELEASE(loc_buffer);
 755             goto cleanup;
 756         }
 757 
 758         if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &seq_num, 1, OPAL_INT))) {
 759             opal_output(orte_snapc_base_framework.framework_output,
 760                         "%s) base:ckpt_update_cmd: Error: DSS Pack (seq number) Failure (ret = %d) (LINE = %d)\n",
 761                         ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
 762                         ret, __LINE__);
 763             ORTE_ERROR_LOG(ret);
 764             exit_status = ret;
 765             OBJ_RELEASE(loc_buffer);
 766             goto cleanup;
 767         }
 768     }
 769 
 770     if (0 > (ret = orte_rml.send_buffer_nb(peer, loc_buffer,
 771                                            ORTE_RML_TAG_CKPT,
 772                                            orte_rml_send_callback, NULL))) {
 773         opal_output(orte_snapc_base_framework.framework_output,
 774                     "%s) base:ckpt_update_cmd: Error: Send (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
 775                     ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
 776                     ret, __LINE__);
 777         ORTE_ERROR_LOG(ret);
 778         exit_status = ret;
 779         OBJ_RELEASE(loc_buffer);
 780         goto cleanup;
 781     }
 782 
 783  cleanup:
 784     if( NULL != global_snapshot_handle ){
 785         free(global_snapshot_handle);
 786         global_snapshot_handle = NULL;
 787     }
 788     if( NULL != tmp_str ) {
 789         free(tmp_str);
 790         tmp_str = NULL;
 791     }
 792 
 793     return exit_status;
 794 }
 795 
 796 /****************************
 797  * Command line tool request functions
 798  ****************************/
 799 /* JJH TODO - Move the command line functions here ? */
 800 
 801 /*****************************
 802  * Snapshot metadata functions
 803  *****************************/
 804 int orte_snapc_ckpt_state_str(char ** state_str, int state)
 805 {
 806     switch(state) {
 807     case ORTE_SNAPC_CKPT_STATE_NONE:
 808         *state_str = strdup(" -- ");
 809         break;
 810     case ORTE_SNAPC_CKPT_STATE_REQUEST:
 811         *state_str = strdup("Requested");
 812         break;
 813     case ORTE_SNAPC_CKPT_STATE_PENDING:
 814         *state_str = strdup("Pending");
 815         break;
 816     case ORTE_SNAPC_CKPT_STATE_RUNNING:
 817         *state_str = strdup("Running");
 818         break;
 819     case ORTE_SNAPC_CKPT_STATE_STOPPED:
 820         *state_str = strdup("Stopped");
 821         break;
 822     case ORTE_SNAPC_CKPT_STATE_MIGRATING:
 823         *state_str = strdup("Migrating");
 824         break;
 825     case ORTE_SNAPC_CKPT_STATE_ESTABLISHED:
 826         *state_str = strdup("Checkpoint Established");
 827         break;
 828     case ORTE_SNAPC_CKPT_STATE_RECOVERED:
 829         *state_str = strdup("Continuing/Recovered");
 830         break;
 831     case ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL:
 832         *state_str = strdup("Locally Finished");
 833         break;
 834     case ORTE_SNAPC_CKPT_STATE_ERROR:
 835         *state_str = strdup("Error");
 836         break;
 837     default:
 838         opal_asprintf(state_str, "Unknown %d", state);
 839         break;
 840     }
 841 
 842     return ORTE_SUCCESS;
 843 }

/* [<][>][^][v][top][bottom][index][help] */