root/opal/mca/crs/self/crs_self_module.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. opal_crs_self_construct
  2. opal_crs_self_destruct
  3. opal_crs_self_component_query
  4. opal_crs_self_extract_callbacks
  5. opal_crs_self_module_init
  6. opal_crs_self_module_finalize
  7. opal_crs_self_checkpoint
  8. opal_crs_self_restart
  9. opal_crs_self_disable_checkpoint
  10. opal_crs_self_enable_checkpoint
  11. opal_crs_self_prelaunch
  12. opal_crs_self_reg_thread
  13. crs_self_find_function
  14. opal_crs_self_restart_cmd
  15. self_cold_start
  16. self_update_snapshot_metadata

   1 /*
   2  * Copyright (c) 2004-2010 The Trustees of Indiana University.
   3  *                         All rights reserved.
   4  * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
   5  *                         All rights reserved.
   6  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   7  *                         University of Stuttgart.  All rights reserved.
   8  * Copyright (c) 2004-2005 The Regents of the University of California.
   9  *                         All rights reserved.
  10  * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
  11  *                         reserved.
  12  * Copyright (c) 2007      Evergrid, Inc. All rights reserved.
  13  *
  14  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
  15  * $COPYRIGHT$
  16  *
  17  * Additional copyrights may follow
  18  *
  19  * $HEADER$
  20  */
  21 
  22 #include "opal_config.h"
  23 
  24 #include <sys/types.h>
  25 #ifdef HAVE_UNISTD_H
  26 #include <unistd.h>
  27 #endif  /* HAVE_UNISTD_H */
  28 #include <string.h>
  29 #ifdef HAVE_DLFCN_H
  30 #include <dlfcn.h>
  31 #endif
  32 
  33 #include "opal/util/opal_environ.h"
  34 #include "opal/util/output.h"
  35 #include "opal/util/show_help.h"
  36 #include "opal/util/argv.h"
  37 #include "opal/util/opal_environ.h"
  38 #include "opal/util/printf.h"
  39 
  40 #include "opal/constants.h"
  41 #include "opal/mca/base/mca_base_var.h"
  42 
  43 #include "opal/mca/crs/crs.h"
  44 #include "opal/mca/crs/base/base.h"
  45 #include "opal/runtime/opal_cr.h"
  46 
  47 #include "crs_self.h"
  48 
  49 /*
  50  * Self module
  51  */
  52 static opal_crs_base_module_t loc_module = {
  53     /** Initialization Function */
  54     opal_crs_self_module_init,
  55     /** Finalization Function */
  56     opal_crs_self_module_finalize,
  57 
  58     /** Checkpoint interface */
  59     opal_crs_self_checkpoint,
  60 
  61     /** Restart Command Access */
  62     opal_crs_self_restart,
  63 
  64     /** Disable checkpoints */
  65     opal_crs_self_disable_checkpoint,
  66     /** Enable checkpoints */
  67     opal_crs_self_enable_checkpoint,
  68 
  69     /** Prelaunch */
  70     opal_crs_self_prelaunch,
  71 
  72     /** Register Thread */
  73     opal_crs_self_reg_thread
  74 };
  75 
  76 /*
  77  * Snapshot structure
  78  */
  79 OBJ_CLASS_DECLARATION(opal_crs_self_snapshot_t);
  80 
  81 struct opal_crs_self_snapshot_t {
  82     /** Base CRS snapshot type */
  83     opal_crs_base_snapshot_t super;
  84     /** Command Line used to restart the app */
  85     char * cmd_line;
  86 };
  87 typedef struct opal_crs_self_snapshot_t opal_crs_self_snapshot_t;
  88 
  89 static void opal_crs_self_construct(opal_crs_self_snapshot_t *obj);
  90 static void opal_crs_self_destruct( opal_crs_self_snapshot_t *obj);
  91 
  92 OBJ_CLASS_INSTANCE(opal_crs_self_snapshot_t,
  93                    opal_crs_base_snapshot_t,
  94                    opal_crs_self_construct,
  95                    opal_crs_self_destruct);
  96 
  97 
  98 typedef void (*opal_crs_self_dlsym_dummy_fn_t)(void);
  99 
 100 /************************************
 101  * Locally Global vars & functions :)
 102  ************************************/
 103 static int crs_self_find_function(char *prefix, char *suffix,
 104                                   opal_crs_self_dlsym_dummy_fn_t *fn_ptr);
 105 
 106 static int self_update_snapshot_metadata(opal_crs_self_snapshot_t *snapshot);
 107 
 108 static int opal_crs_self_restart_cmd(opal_crs_self_snapshot_t *snapshot, char **cmd);
 109 static int self_cold_start(opal_crs_self_snapshot_t *snapshot);
 110 
 111 void opal_crs_self_construct(opal_crs_self_snapshot_t *snapshot)
 112 {
 113     snapshot->cmd_line = NULL;
 114 }
 115 
 116 void opal_crs_self_destruct( opal_crs_self_snapshot_t *snapshot)
 117 {
 118     if(NULL != snapshot->cmd_line)
 119         free(snapshot->cmd_line);
 120 }
 121 
 122 static int opal_crs_self_extract_callbacks(void);
 123 
 124 /*
 125  * MCA Functions
 126  */
 127 int opal_crs_self_component_query(mca_base_module_t **module, int *priority)
 128 {
 129     int ret;
 130 
 131     opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 132                         "crs:self: component_query()");
 133 
 134     /*
 135      * If this is a tool, then return a module with the lowest priority.
 136      * This allows 'mpirun' to select the 'none' component since it has
 137      * a priority higher than 0.
 138      * But also allows 'opal-restart' to select this component if needed
 139      * since it only ever requests that a specific component be opened
 140      * that is defined in the snapshot metadata file.
 141      */
 142     if( opal_cr_is_tool ) {
 143         *priority = 0;
 144         *module = (mca_base_module_t *)&loc_module;
 145         return OPAL_SUCCESS;
 146     }
 147 
 148     /*
 149      * Extract the user level callbacks if they exist
 150      */
 151     ret = opal_crs_self_extract_callbacks();
 152 
 153     if( OPAL_SUCCESS != ret ||
 154         !mca_crs_self_component.can_checkpoint ) {
 155         *priority = -1;
 156         *module = NULL;
 157         return OPAL_ERROR;
 158     }
 159     else {
 160         *priority = mca_crs_self_component.super.priority;
 161         *module = (mca_base_module_t *)&loc_module;
 162         return OPAL_SUCCESS;
 163     }
 164 }
 165 
 166 static int opal_crs_self_extract_callbacks(void)
 167 {
 168     opal_crs_self_dlsym_dummy_fn_t loc_fn;
 169 
 170     /*
 171      * Find the function names
 172      */
 173     crs_self_find_function(mca_crs_self_component.prefix,
 174                            SUFFIX_CHECKPOINT,
 175                            &loc_fn);
 176     mca_crs_self_component.ucb_checkpoint_fn = (opal_crs_self_checkpoint_callback_fn_t)loc_fn;
 177 
 178     crs_self_find_function(mca_crs_self_component.prefix,
 179                            SUFFIX_CONTINUE,
 180                            &loc_fn);
 181     mca_crs_self_component.ucb_continue_fn = (opal_crs_self_continue_callback_fn_t)loc_fn;
 182 
 183     crs_self_find_function(mca_crs_self_component.prefix,
 184                            SUFFIX_RESTART,
 185                            &loc_fn);
 186     mca_crs_self_component.ucb_restart_fn = (opal_crs_self_restart_callback_fn_t)loc_fn;
 187 
 188     /*
 189      * Sanity check
 190      */
 191     mca_crs_self_component.can_checkpoint = true;
 192 
 193     if(NULL == mca_crs_self_component.ucb_checkpoint_fn) {
 194         mca_crs_self_component.can_checkpoint = false;
 195     }
 196     if(NULL == mca_crs_self_component.ucb_continue_fn) {
 197     }
 198     if(NULL == mca_crs_self_component.ucb_restart_fn) {
 199     }
 200 
 201     return OPAL_SUCCESS;
 202 }
 203 
 204 int opal_crs_self_module_init(void)
 205 {
 206     bool callback_matched = true;
 207 
 208     opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 209                         "crs:self: module_init()");
 210 
 211     if( opal_cr_is_tool ) {
 212         return OPAL_SUCCESS;
 213     }
 214 
 215     /*
 216      * Sanity check
 217      */
 218     if(NULL == mca_crs_self_component.ucb_checkpoint_fn) {
 219         callback_matched = false;
 220         mca_crs_self_component.can_checkpoint = false;
 221     }
 222     if(NULL == mca_crs_self_component.ucb_continue_fn) {
 223         callback_matched = false;
 224     }
 225     if(NULL == mca_crs_self_component.ucb_restart_fn) {
 226         callback_matched = false;
 227     }
 228     if( !callback_matched ) {
 229         if( 1 <= mca_crs_self_component.super.verbose ) {
 230             opal_show_help("help-opal-crs-self.txt", "self:no_callback", false,
 231                            "checkpoint", mca_crs_self_component.prefix, SUFFIX_CHECKPOINT,
 232                            "continue  ", mca_crs_self_component.prefix, SUFFIX_CONTINUE,
 233                            "restart   ", mca_crs_self_component.prefix, SUFFIX_RESTART,
 234                            PREFIX_DEFAULT);
 235         }
 236     }
 237 
 238     /*
 239      * If the user requested that we do_restart, then call their callback
 240      */
 241     if(mca_crs_self_component.do_restart) {
 242         opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 243                             "crs:self: module_init: Call their restart function");
 244         if( NULL != mca_crs_self_component.ucb_restart_fn)
 245             mca_crs_self_component.ucb_restart_fn();
 246     }
 247 
 248     return OPAL_SUCCESS;
 249 }
 250 
 251 int opal_crs_self_module_finalize(void)
 252 {
 253     opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 254                         "crs:self: module_finalize()");
 255 
 256     return OPAL_SUCCESS;
 257 }
 258 
 259 
 260 int opal_crs_self_checkpoint(pid_t pid,
 261                              opal_crs_base_snapshot_t *base_snapshot,
 262                              opal_crs_base_ckpt_options_t *options,
 263                              opal_crs_state_type_t *state)
 264 {
 265     opal_crs_self_snapshot_t *snapshot = OBJ_NEW(opal_crs_self_snapshot_t);
 266     int ret, exit_status = OPAL_SUCCESS;
 267     char * restart_cmd = NULL;
 268 
 269     /*
 270      * This function should never be called by a tool
 271      */
 272     if( opal_cr_is_tool ) {
 273         return OPAL_ERR_NOT_SUPPORTED;
 274     }
 275 
 276     if( options->stop ) {
 277         opal_output(0,
 278                     "crs:self: checkpoint(): Error: SIGSTOP Not currently supported!");
 279     }
 280 
 281     /*
 282      * Setup for snapshot directory creation
 283      */
 284     snapshot->super = *base_snapshot;
 285 #if 0
 286     snapshot->super.snapshot_directory = strdup(base_snapshot->snapshot_directory);
 287     snapshot->super.metadata_filename  = strdup(base_snapshot->metadata_filename);
 288 #endif
 289 
 290     opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 291                         "crs:self: checkpoint(%d, ---)", pid);
 292 
 293     if(!mca_crs_self_component.can_checkpoint) {
 294         opal_show_help("help-opal-crs-self.txt", "self:ckpt_disabled", false);
 295         exit_status = OPAL_ERROR;
 296         goto cleanup;
 297     }
 298 
 299     /*
 300      * Update the snapshot metadata
 301      */
 302     snapshot->super.component_name = strdup(mca_crs_self_component.super.base_version.mca_component_name);
 303     if( NULL == snapshot->super.metadata ) {
 304         if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
 305             opal_output(mca_crs_self_component.super.output_handle,
 306                         "crs:self: checkpoint(): Error: Unable to open the file (%s)",
 307                         snapshot->super.metadata_filename);
 308             exit_status = OPAL_ERROR;
 309             goto cleanup;
 310         }
 311     }
 312     fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name);
 313 
 314     /*
 315      * Call the user callback function
 316      */
 317     if(NULL != mca_crs_self_component.ucb_checkpoint_fn) {
 318         mca_crs_self_component.ucb_checkpoint_fn(&restart_cmd);
 319     }
 320 
 321     /*
 322      * Save the restart command
 323      */
 324     if( NULL == restart_cmd) {
 325         *state = OPAL_CRS_ERROR;
 326         opal_show_help("help-opal-crs-self.txt", "self:no-restart-cmd",
 327                        true);
 328         exit_status = OPAL_ERROR;
 329         goto cleanup;
 330     }
 331     else {
 332         snapshot->cmd_line = strdup(restart_cmd);
 333 
 334         opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 335                             "crs:self: checkpoint: Restart Command (%s)", snapshot->cmd_line);
 336     }
 337 
 338     /*
 339      * The best we can do is update the metadata file with the
 340      * application argv and argc we started with.
 341      */
 342     if( OPAL_SUCCESS != (ret = self_update_snapshot_metadata(snapshot)) ) {
 343         *state = OPAL_CRS_ERROR;
 344         opal_output(mca_crs_self_component.super.output_handle,
 345                     "crs:self: checkpoint(): Error: Unable to update metadata for snapshot (%s).",
 346                     snapshot->super.metadata_filename);
 347         exit_status = ret;
 348         goto cleanup;
 349     }
 350 
 351 
 352     *state = OPAL_CRS_CONTINUE;
 353 
 354     /*
 355      * Call their continue routine for completeness
 356      */
 357     if(NULL != mca_crs_self_component.ucb_continue_fn) {
 358         mca_crs_self_component.ucb_continue_fn();
 359     }
 360 
 361     base_snapshot = &(snapshot->super);
 362 
 363  cleanup:
 364     if( NULL != restart_cmd) {
 365         free(restart_cmd);
 366         restart_cmd = NULL;
 367     }
 368 
 369     return exit_status;
 370 }
 371 
 372 /*
 373  * Notice that the user restart callback is not called here, but always from
 374  *  opal_init for the self module.
 375  */
 376 int opal_crs_self_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
 377 {
 378     opal_crs_self_snapshot_t *snapshot = OBJ_NEW(opal_crs_self_snapshot_t);
 379     char **cr_argv = NULL;
 380     char * cr_cmd = NULL;
 381     int ret;
 382     int exit_status = OPAL_SUCCESS;
 383     int status;
 384 
 385     snapshot->super = *base_snapshot;
 386 
 387     opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 388                         "crs:self: restart(%d)", spawn_child);
 389 
 390     /*
 391      * If we need to reconstruct the snapshot
 392      */
 393     if(snapshot->super.cold_start) {
 394         if( OPAL_SUCCESS != (ret = self_cold_start(snapshot)) ){
 395             exit_status = ret;
 396             opal_output(mca_crs_self_component.super.output_handle,
 397                         "crs:blcr: blcr_restart: Unable to reconstruct the snapshot.");
 398             goto cleanup;
 399         }
 400     }
 401 
 402     /*
 403      * JJH: Check to make sure the application exists?
 404      */
 405 
 406     /*
 407      * Get the restart command
 408      */
 409     if ( OPAL_SUCCESS != (ret = opal_crs_self_restart_cmd(snapshot, &cr_cmd)) ) {
 410         exit_status = ret;
 411         goto cleanup;
 412     }
 413     if ( NULL == (cr_argv = opal_argv_split(cr_cmd, ' ')) ) {
 414         exit_status = OPAL_ERROR;
 415         goto cleanup;
 416     }
 417 
 418 
 419     if (!spawn_child) {
 420         opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 421                             "crs:self: self_restart: SELF: exec :(%s, %s):",
 422                             strdup(cr_argv[0]),
 423                             opal_argv_join(cr_argv, ' '));
 424 
 425         status = execvp(strdup(cr_argv[0]), cr_argv);
 426 
 427         if(status < 0) {
 428             opal_output(mca_crs_self_component.super.output_handle,
 429                         "crs:self: self_restart: SELF: Child failed to execute :(%d):", status);
 430         }
 431         opal_output(mca_crs_self_component.super.output_handle,
 432                     "crs:self: self_restart: SELF: execvp returned %d", status);
 433         exit_status = status;
 434         goto cleanup;
 435     }
 436     else {
 437         *child_pid = fork();
 438         if( *child_pid == 0) {
 439             /* Child Process */
 440             opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 441                                 "crs:self: self_restart: CHILD: exec :(%s, %s):",
 442                                 strdup(cr_argv[0]),
 443                                 opal_argv_join(cr_argv, ' '));
 444 
 445             status = execvp(strdup(cr_argv[0]), cr_argv);
 446 
 447             if(status < 0) {
 448                 opal_output(mca_crs_self_component.super.output_handle,
 449                             "crs:self: self_restart: CHILD: Child failed to execute :(%d):", status);
 450             }
 451             opal_output(mca_crs_self_component.super.output_handle,
 452                         "crs:self: self_restart: CHILD: execvp returned %d", status);
 453             exit_status = status;
 454             goto cleanup;
 455         }
 456         else if(*child_pid > 0) {
 457             /* Parent is done once it is started. */
 458             ;
 459         }
 460         else {
 461             opal_output(mca_crs_self_component.super.output_handle,
 462                         "crs:self: self_restart: CHILD: fork failed :(%d):", *child_pid);
 463         }
 464     }
 465 
 466  cleanup:
 467     if( NULL != cr_cmd)
 468         free(cr_cmd);
 469     if( NULL != cr_argv)
 470         opal_argv_free(cr_argv);
 471 
 472     return exit_status;
 473 }
 474 
 475 int opal_crs_self_disable_checkpoint(void)
 476 {
 477     /*
 478      * This function should never be called by a tool
 479      */
 480     if( opal_cr_is_tool ) {
 481         return OPAL_ERR_NOT_SUPPORTED;
 482     }
 483 
 484     opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 485                         "crs:self: disable_checkpoint()");
 486 
 487     mca_crs_self_component.can_checkpoint = false;
 488 
 489     return OPAL_SUCCESS;
 490 }
 491 
 492 int opal_crs_self_enable_checkpoint(void)
 493 {
 494     /*
 495      * This function should never be called by a tool
 496      */
 497     if( opal_cr_is_tool ) {
 498         return OPAL_ERR_NOT_SUPPORTED;
 499     }
 500 
 501     opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 502                         "crs:self: enable_checkpoint()");
 503 
 504     mca_crs_self_component.can_checkpoint = true;
 505 
 506     return OPAL_SUCCESS;
 507 }
 508 
 509 int opal_crs_self_prelaunch(int32_t rank,
 510                             char *base_snapshot_dir,
 511                             char **app,
 512                             char **cwd,
 513                             char ***argv,
 514                             char ***env)
 515 {
 516     char * tmp_env_var = NULL;
 517 
 518     /*
 519      * This function should never be called by a tool
 520      */
 521     if( opal_cr_is_tool ) {
 522         return OPAL_ERR_NOT_SUPPORTED;
 523     }
 524 
 525     (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
 526     opal_setenv(tmp_env_var,
 527                 "0", true, env);
 528     free(tmp_env_var);
 529     tmp_env_var = NULL;
 530 
 531     return OPAL_SUCCESS;
 532 }
 533 
 534 int opal_crs_self_reg_thread(void)
 535 {
 536     /*
 537      * This function should never be called by a tool
 538      */
 539     if( opal_cr_is_tool ) {
 540         return OPAL_ERR_NOT_SUPPORTED;
 541     }
 542 
 543     return OPAL_SUCCESS;
 544 }
 545 
 546 /******************
 547  * Local functions
 548  ******************/
 549 static int crs_self_find_function(char *prefix, char *suffix,
 550                                   opal_crs_self_dlsym_dummy_fn_t *fn_ptr) {
 551     char *func_to_find = NULL;
 552 
 553     if( NULL == prefix || 0 >= strlen(prefix) ) {
 554         opal_output(mca_crs_self_component.super.output_handle,
 555                     "crs:self: crs_self_find_function: Error: prefix is NULL or empty string!");
 556         *fn_ptr = NULL;
 557         return OPAL_ERROR;
 558     }
 559     if( NULL == suffix || 0 >= strlen(suffix) ) {
 560         opal_output(mca_crs_self_component.super.output_handle,
 561                     "crs:self: crs_self_find_function: Error: suffix is NULL or empty string!");
 562         *fn_ptr = NULL;
 563         return OPAL_ERROR;
 564     }
 565 
 566     opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 567                         "crs:self: crs_self_find_function(--, %s, %s)",
 568                         prefix, suffix);
 569 
 570     opal_asprintf(&func_to_find, "%s_%s", prefix, suffix);
 571 
 572     /* The RTLD_DEFAULT is a special handle that searches the default libraries
 573      * including the current application for the indicated symbol. This allows
 574      * us to not have to dlopen/dlclose the executable. A bit of short hand
 575      * really.
 576      */
 577     *((void**) fn_ptr) = dlsym(RTLD_DEFAULT, func_to_find);
 578     if( NULL == fn_ptr) {
 579         opal_output_verbose(12, mca_crs_self_component.super.output_handle,
 580                             "crs:self: crs_self_find_function: WARNING: Function \"%s\" not found",
 581                             func_to_find);
 582     }
 583     else {
 584         opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 585                             "crs:self: crs_self_find_function: Found function \"%s\"",
 586                             func_to_find);
 587     }
 588 
 589     if( NULL == func_to_find) {
 590         free(func_to_find);
 591     }
 592 
 593     return OPAL_SUCCESS;
 594 }
 595 
 596 /*
 597  * Self is a special case. The 'fname' here is the command line that the user
 598  * wishes to execute. This function takes this command line and adds
 599  *   -mca crs_self_do_restart 1
 600  * Which will trigger the restart callback once the program has been run.
 601  *
 602  * For example, The user starts their program with:
 603  *   $ my_prog arg1 arg2
 604  *
 605  * They checkpoint it:
 606  *   $ opal_checkpoint -mca crs self 1234
 607  *
 608  * They restart it:
 609  *   $ opal_restart -mca crs self my_prog arg1 arg2
 610  *
 611  * fname is then:
 612  *   fname = "my_prog arg1 arg2"
 613  *
 614  * This funciton translates that to the command:
 615  *   cmd = "my_prog arg1 arg2 -mca crs self -mca crs_self_do_restart 1"
 616  *
 617  * Which will cause the program "my_prog" to call their restart function
 618  * upon opal_init time.
 619  *
 620  * Note: The user could bypass the opal_restart routine safely by simply calling
 621  *   $ my_prog arg1 arg2 -mca crs self -mca crs_self_do_restart 1
 622  * However, for consistency sake, we should not encourage this as it won't work for
 623  * all of the other checkpointers.
 624  */
 625 static int opal_crs_self_restart_cmd(opal_crs_self_snapshot_t *snapshot, char **cmd)
 626 {
 627     char * tmp_env_var = NULL;
 628 
 629     opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 630                         "crs:self: restart_cmd(%s, ---)", snapshot->cmd_line);
 631 
 632     (void) mca_base_var_env_name("crs", &tmp_env_var);
 633     opal_setenv(tmp_env_var,
 634                 "self",
 635                 true, &environ);
 636     free(tmp_env_var);
 637     tmp_env_var = NULL;
 638 
 639     (void) mca_base_var_env_name("crs_self_do_restart", &tmp_env_var);
 640     opal_setenv(tmp_env_var,
 641                 "1",
 642                 true, &environ);
 643     free(tmp_env_var);
 644     tmp_env_var = NULL;
 645 
 646     (void) mca_base_var_env_name("crs_self_prefix", &tmp_env_var);
 647     opal_setenv(tmp_env_var,
 648                 mca_crs_self_component.prefix,
 649                 true, &environ);
 650     free(tmp_env_var);
 651     tmp_env_var = NULL;
 652 
 653     /* Instead of adding it to the command line, we should use the environment
 654      * to pass the values. This allow sthe OPAL application to be braindead
 655      * WRT MCA parameters
 656      *   add_args = strdup("-mca crs self -mca crs_self_do_restart 1");
 657      */
 658 
 659     opal_asprintf(cmd, "%s", snapshot->cmd_line);
 660 
 661     return OPAL_SUCCESS;
 662 }
 663 
 664 static int self_cold_start(opal_crs_self_snapshot_t *snapshot) {
 665     int ret, exit_status = OPAL_SUCCESS;
 666     char **tmp_argv = NULL;
 667     char * component_name = NULL;
 668     int prev_pid;
 669 
 670     opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 671                         "crs:self: cold_start()");
 672 
 673     /*
 674      * Find the snapshot directory, read the metadata file
 675      */
 676     if( NULL == snapshot->super.metadata ) {
 677         if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
 678             opal_output(mca_crs_self_component.super.output_handle,
 679                         "crs:self: checkpoint(): Error: Unable to open the file (%s)",
 680                         snapshot->super.metadata_filename);
 681             exit_status = OPAL_ERROR;
 682             goto cleanup;
 683         }
 684     }
 685     if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.metadata,
 686                                                                         &component_name, &prev_pid) ) ) {
 687         opal_output(mca_crs_self_component.super.output_handle,
 688                     "crs:self: self_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
 689                     snapshot->super.metadata_filename, ret);
 690         exit_status = ret;
 691         goto cleanup;
 692     }
 693 
 694     snapshot->super.component_name = strdup(component_name);
 695 
 696     /* Compare the strings to make sure this is our snapshot before going further */
 697     if ( 0 != strncmp(mca_crs_self_component.super.base_version.mca_component_name,
 698                       component_name, strlen(component_name)) ) {
 699         exit_status = OPAL_ERROR;
 700         opal_output(mca_crs_self_component.super.output_handle,
 701                     "crs:self: self_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
 702                     component_name, mca_crs_self_component.super.base_version.mca_component_name);
 703         goto cleanup;
 704     }
 705 
 706     /*
 707      * Restart command
 708      * JJH: Command lines limited to 256 chars.
 709      */
 710     opal_crs_base_metadata_read_token(snapshot->super.metadata, CRS_METADATA_CONTEXT, &tmp_argv);
 711     if( NULL == tmp_argv ) {
 712         opal_output(mca_crs_self_component.super.output_handle,
 713                     "crs:self: self_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
 714                     CRS_METADATA_CONTEXT, snapshot->super.snapshot_directory);
 715         exit_status = OPAL_ERROR;
 716         goto cleanup;
 717     }
 718     opal_asprintf(&snapshot->cmd_line, "%s", tmp_argv[0]);
 719 
 720     /*
 721      * Reset the cold_start flag
 722      */
 723     snapshot->super.cold_start = false;
 724 
 725  cleanup:
 726     if(NULL != tmp_argv) {
 727         opal_argv_free(tmp_argv);
 728         tmp_argv = NULL;
 729     }
 730 
 731     return exit_status;
 732 
 733 }
 734 
 735 static int self_update_snapshot_metadata(opal_crs_self_snapshot_t *snapshot) {
 736     int exit_status = OPAL_SUCCESS;
 737 
 738     if(NULL == snapshot->cmd_line) {
 739         opal_show_help("help-opal-crs-self.txt", "self:no-restart-cmd",
 740                        true);
 741         exit_status = OPAL_ERROR;
 742         goto cleanup;
 743     }
 744 
 745     opal_output_verbose(10, mca_crs_self_component.super.output_handle,
 746                         "crs:self: update_snapshot_metadata(%s)",
 747                         snapshot->super.metadata_filename);
 748 
 749     /*
 750      * Append to the metadata file the command line to restart with
 751      *  - How user wants us to restart
 752      */
 753     fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_CONTEXT, snapshot->cmd_line);
 754 
 755  cleanup:
 756     return exit_status;
 757 }

/* [<][>][^][v][top][bottom][index][help] */