This source file includes the following definitions.
- main
- initialize
- finalize
- parse_args
- check_file
- post_env_vars
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 
  28 
  29 
  30 
  31 
  32 
  33 
  34 
  35 #include "opal_config.h"
  36 
  37 #include <stdio.h>
  38 #include <errno.h>
  39 #ifdef HAVE_UNISTD_H
  40 #include <unistd.h>
  41 #endif  
  42 #include <stdlib.h>
  43 #ifdef HAVE_SYS_STAT_H
  44 #include <sys/stat.h>
  45 #endif
  46 #ifdef HAVE_FCNTL_H
  47 #include <fcntl.h>
  48 #endif  
  49 #ifdef HAVE_SYS_TYPES_H
  50 #include <sys/types.h>
  51 #endif
  52 #ifdef HAVE_SYS_WAIT_H
  53 #include <sys/wait.h>
  54 #endif
  55 #include <string.h>
  56 
  57 #include "opal/constants.h"
  58 
  59 #include "opal/util/cmd_line.h"
  60 #include "opal/util/argv.h"
  61 #include "opal/util/show_help.h"
  62 #include "opal/util/output.h"
  63 #include "opal/util/opal_environ.h"
  64 #include "opal/util/error.h"
  65 #include "opal/util/basename.h"
  66 #include "opal/util/printf.h"
  67 #include "opal/mca/base/base.h"
  68 
  69 #include "opal/runtime/opal.h"
  70 #include "opal/runtime/opal_cr.h"
  71 
  72 #include "opal/mca/crs/crs.h"
  73 #include "opal/mca/crs/base/base.h"
  74 
  75 #include "opal/mca/compress/compress.h"
  76 #include "opal/mca/compress/base/base.h"
  77 
  78 
  79 
  80 
/* Bring up OPAL, parse the command line and prepare the snapshot location. */
static int initialize(int argc, char *argv[]);
/* Tear-down counterpart of initialize(); currently always succeeds. */
static int finalize(void);
/* Fill opal_restart_globals from the command line. */
static int parse_args(int argc, char *argv[]);
/* Verify that <snapshot_loc>/<snapshot_ref> exists on disk. */
static int check_file(void);
/* Dump the OMPI_* environment and recreate paths listed in the metadata. */
static int post_env_vars(int prev_pid, opal_crs_base_snapshot_t *snapshot);
  86 
  87 
  88 
  89 
/* Name of the CRS component that is expected to restart the snapshot.
 * main() fills it in from the snapshot metadata file when it is still NULL. */
static char *expected_crs_comp = NULL;

/* Settings of the tool; the string/bool members are the destinations of the
 * options declared in cmd_line_opts. */
typedef struct {
    bool help;                        /* -h / --help */
    bool verbose;                     /* -v / --verbose */
    char *snapshot_ref;               /* -r / --reference: local snapshot reference */
    char *snapshot_loc;               /* -l / --location: path containing the snapshot */
    char *snapshot_metadata;          /* -m / --metadata: metadata file, relative to location */
    char *snapshot_cache;             /* -c / --cache: possible local cache of the snapshot */
    char *snapshot_compress;          /* -d / --decompress: decompression component name */
    char *snapshot_compress_postfix;  /* -p / --decompress_postfix: suffix of the compressed file */
    int  output;                      /* opal_output stream id used for diagnostics */
} opal_restart_globals_t;

/* The single instance of the tool settings. */
opal_restart_globals_t opal_restart_globals;
 105 
 106 opal_cmd_line_init_t cmd_line_opts[] = {
 107     { NULL,
 108       'h', NULL, "help",
 109       0,
 110       &opal_restart_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
 111       "This help message" },
 112 
 113     { NULL,
 114       'v', NULL, "verbose",
 115       0,
 116       &opal_restart_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
 117       "Be Verbose" },
 118 
 119     { NULL,
 120       'l', NULL, "location",
 121       1,
 122       &opal_restart_globals.snapshot_loc, OPAL_CMD_LINE_TYPE_STRING,
 123       "Full path to the location of the local snapshot."},
 124 
 125     { NULL,
 126       'm', NULL, "metadata",
 127       1,
 128       &opal_restart_globals.snapshot_metadata, OPAL_CMD_LINE_TYPE_STRING,
 129       "Relative path (with respect to --location) to the metadata file."},
 130 
 131     { NULL,
 132       'r', NULL, "reference",
 133       1,
 134       &opal_restart_globals.snapshot_ref, OPAL_CMD_LINE_TYPE_STRING,
 135       "Local snapshot reference."},
 136 
 137     { NULL,
 138       'c', NULL, "cache",
 139       1,
 140       &opal_restart_globals.snapshot_cache, OPAL_CMD_LINE_TYPE_STRING,
 141       "Possible local cache of the snapshot reference."},
 142 
 143     { NULL,
 144       'd', NULL, "decompress",
 145       1,
 146       &opal_restart_globals.snapshot_compress, OPAL_CMD_LINE_TYPE_STRING,
 147       "Decompression component to use."},
 148 
 149     { NULL,
 150       'p', NULL, "decompress_postfix",
 151       1,
 152       &opal_restart_globals.snapshot_compress_postfix, OPAL_CMD_LINE_TYPE_STRING,
 153       "Decompression component postfix."},
 154 
 155     
 156     { NULL,
 157       '\0', NULL, NULL,
 158       0,
 159       NULL, OPAL_CMD_LINE_TYPE_NULL,
 160       NULL }
 161 };
 162 
 163 int
 164 main(int argc, char *argv[])
 165 {
 166     int ret, exit_status = OPAL_SUCCESS;
 167     int child_pid;
 168     int prev_pid = 0;
 169     int idx;
 170     opal_crs_base_snapshot_t *snapshot = NULL;
 171     char * tmp_env_var = NULL;
 172     bool select = false;
 173 
 174     
 175 
 176 
 177     if (OPAL_SUCCESS != (ret = initialize(argc, argv))) {
 178         exit_status = ret;
 179         goto cleanup;
 180     }
 181 
 182     
 183 
 184 
 185     if( OPAL_SUCCESS != (ret = check_file() )) {
 186         opal_show_help("help-opal-restart.txt", "invalid_filename", true,
 187                        opal_restart_globals.snapshot_ref);
 188         exit_status = ret;
 189         goto cleanup;
 190     }
 191 
 192     
 193     idx = mca_base_var_find(NULL, "crs", "base", "do_not_select");
 194 
 195     if (0 > idx) {
 196         opal_output(opal_restart_globals.output,
 197                     "MCA variable opal_crs_base_do_not_select not found\n");
 198         exit_status = OPAL_ERROR;
 199         goto cleanup;
 200     }
 201 
 202     ret = mca_base_var_set_value(idx, &select, 0, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
 203     if (OPAL_SUCCESS != ret) {
 204         exit_status = ret;
 205         goto cleanup;
 206     }
 207 
 208     
 209 
 210 
 211     if(NULL == expected_crs_comp) {
 212         char * full_metadata_path = NULL;
 213         FILE * metadata = NULL;
 214 
 215         opal_asprintf(&full_metadata_path, "%s/%s/%s",
 216                  opal_restart_globals.snapshot_loc,
 217                  opal_restart_globals.snapshot_ref,
 218                  opal_restart_globals.snapshot_metadata);
 219         if( NULL == (metadata = fopen(full_metadata_path, "r")) ) {
 220             opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
 221                            opal_restart_globals.snapshot_metadata,
 222                            full_metadata_path);
 223             exit_status = OPAL_ERROR;
 224             goto cleanup;
 225         }
 226         if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(metadata,
 227                                                                             &expected_crs_comp,
 228                                                                             &prev_pid)) ) {
 229             opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
 230                            opal_restart_globals.snapshot_metadata,
 231                            full_metadata_path);
 232             exit_status = ret;
 233             goto cleanup;
 234         }
 235 
 236         free(full_metadata_path);
 237         full_metadata_path = NULL;
 238 
 239         fclose(metadata);
 240         metadata = NULL;
 241     }
 242 
 243     opal_output_verbose(10, opal_restart_globals.output,
 244                         "Restart Expects checkpointer: (%s)",
 245                         expected_crs_comp);
 246 
 247     (void) mca_base_var_env_name("crs", &tmp_env_var);
 248     opal_setenv(tmp_env_var,
 249                 expected_crs_comp,
 250                 true, &environ);
 251     free(tmp_env_var);
 252     tmp_env_var = NULL;
 253 
 254     
 255 
 256 
 257 
 258 
 259     if( OPAL_SUCCESS != (ret = opal_crs_base_open(MCA_BASE_OPEN_DEFAULT)) ) {
 260         opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
 261                        "crs", ret);
 262         exit_status = ret;
 263         goto cleanup;
 264     }
 265 
 266     if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) {
 267         opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
 268                        expected_crs_comp, ret);
 269         exit_status = ret;
 270         goto cleanup;
 271     }
 272 
 273     
 274 
 275 
 276     if(NULL == expected_crs_comp ||
 277        0 != strncmp(expected_crs_comp,
 278                     opal_crs_base_selected_component.base_version.mca_component_name,
 279                     strlen(expected_crs_comp)) ) {
 280         opal_show_help("help-opal-restart.txt", "comp_select_mismatch",
 281                        true,
 282                        expected_crs_comp,
 283                        opal_crs_base_selected_component.base_version.mca_component_name,
 284                        ret);
 285         exit_status = ret;
 286         goto cleanup;
 287     }
 288 
 289     
 290 
 291 
 292     opal_output_verbose(10, opal_restart_globals.output,
 293                         "Restarting from file (%s)\n",
 294                         opal_restart_globals.snapshot_ref);
 295 
 296     snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
 297     snapshot->cold_start         = true;
 298     opal_asprintf(&(snapshot->snapshot_directory), "%s/%s",
 299              opal_restart_globals.snapshot_loc,
 300              opal_restart_globals.snapshot_ref);
 301     opal_asprintf(&(snapshot->metadata_filename), "%s/%s",
 302              snapshot->snapshot_directory,
 303              opal_restart_globals.snapshot_metadata);
 304 
 305     
 306 
 307 
 308 
 309 
 310 
 311     if(OPAL_SUCCESS != (ret = post_env_vars(prev_pid, snapshot) ) ) {
 312         exit_status = ret;
 313         goto cleanup;
 314     }
 315 
 316     
 317 
 318 
 319     ret = opal_crs.crs_restart(snapshot,
 320                                false,
 321                                &child_pid);
 322 
 323     if (OPAL_SUCCESS != ret) {
 324         opal_show_help("help-opal-restart.txt", "restart_cmd_failure", true,
 325                        opal_restart_globals.snapshot_ref,
 326                        ret,
 327                        opal_crs_base_selected_component.base_version.mca_component_name);
 328         exit_status = ret;
 329         goto cleanup;
 330     }
 331     
 332 
 333     
 334 
 335 
 336  cleanup:
 337     if (OPAL_SUCCESS != (ret = finalize())) {
 338         return ret;
 339     }
 340 
 341     if(NULL != snapshot )
 342         OBJ_DESTRUCT(snapshot);
 343 
 344     return exit_status;
 345 }
 346 
 347 static int initialize(int argc, char *argv[])
 348 {
 349     int ret, exit_status = OPAL_SUCCESS;
 350     char * tmp_env_var = NULL;
 351 
 352     
 353 
 354 
 355 
 356 
 357     if( OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
 358         return ret;
 359     }
 360 
 361     
 362 
 363 
 364     if (OPAL_SUCCESS != (ret = parse_args(argc, argv))) {
 365         exit_status = ret;
 366         goto cleanup;
 367     }
 368 
 369     
 370 
 371 
 372     if( opal_restart_globals.verbose ) {
 373         opal_restart_globals.output = opal_output_open(NULL);
 374         opal_output_set_verbosity(opal_restart_globals.output, 10);
 375     } else {
 376         opal_restart_globals.output = 0; 
 377     }
 378 
 379     
 380 
 381 
 382 
 383     (void) mca_base_var_env_name("crs_base_do_not_select", &tmp_env_var);
 384     opal_setenv(tmp_env_var,
 385                 "1", 
 386                 true, &environ);
 387     free(tmp_env_var);
 388     tmp_env_var = NULL;
 389 
 390     
 391 
 392 
 393     if( NULL != opal_restart_globals.snapshot_compress ) {
 394         (void) mca_base_var_env_name("compress", &tmp_env_var);
 395         opal_setenv(tmp_env_var,
 396                     opal_restart_globals.snapshot_compress,
 397                     true, &environ);
 398         free(tmp_env_var);
 399         tmp_env_var = NULL;
 400     }
 401 
 402     
 403 
 404 
 405     if (OPAL_SUCCESS != (ret = opal_init(&argc, &argv))) {
 406         exit_status = ret;
 407         goto cleanup;
 408     }
 409 
 410     
 411 
 412 
 413     if( NULL != opal_restart_globals.snapshot_compress ) {
 414         char * zip_dir = NULL;
 415         char * tmp_str = NULL;
 416 
 417         
 418 
 419 
 420 
 421         (void) mca_base_var_env_name("compress", &tmp_env_var);
 422         opal_unsetenv(tmp_env_var, &environ);
 423         free(tmp_env_var);
 424         tmp_env_var = NULL;
 425 
 426         opal_asprintf(&zip_dir, "%s/%s%s",
 427                  opal_restart_globals.snapshot_loc,
 428                  opal_restart_globals.snapshot_ref,
 429                  opal_restart_globals.snapshot_compress_postfix);
 430 
 431         if (0 >  (ret = access(zip_dir, F_OK)) ) {
 432             opal_output(opal_restart_globals.output,
 433                         "Error: Unable to access the file [%s]!",
 434                         zip_dir);
 435             exit_status = OPAL_ERROR;
 436             goto cleanup;
 437         }
 438 
 439         opal_output_verbose(10, opal_restart_globals.output,
 440                             "Decompressing (%s)",
 441                             zip_dir);
 442 
 443         opal_compress.decompress(zip_dir, &tmp_str);
 444 
 445         if( NULL != zip_dir ) {
 446             free(zip_dir);
 447             zip_dir = NULL;
 448         }
 449         if( NULL != tmp_str ) {
 450             free(tmp_str);
 451             tmp_str = NULL;
 452         }
 453     }
 454 
 455     
 456 
 457 
 458     if( NULL != opal_restart_globals.snapshot_cache ) {
 459         if(0 == (ret = access(opal_restart_globals.snapshot_cache, F_OK)) ) {
 460             opal_output_verbose(10, opal_restart_globals.output,
 461                                 "Using the cached snapshot (%s) instead of (%s)",
 462                                 opal_restart_globals.snapshot_cache,
 463                                 opal_restart_globals.snapshot_loc);
 464             if( NULL != opal_restart_globals.snapshot_loc ) {
 465                 free(opal_restart_globals.snapshot_loc);
 466                 opal_restart_globals.snapshot_loc = NULL;
 467             }
 468             opal_restart_globals.snapshot_loc = opal_dirname(opal_restart_globals.snapshot_cache);
 469         } else {
 470             opal_show_help("help-opal-restart.txt", "cache_not_avail", true,
 471                            opal_restart_globals.snapshot_cache,
 472                            opal_restart_globals.snapshot_loc);
 473         }
 474     }
 475 
 476     
 477 
 478 
 479     opal_cr_is_tool = true;
 480 
 481  cleanup:
 482     return exit_status;
 483 }
 484 
 485 static int finalize(void)
 486 {
 487 #if 0
 488     int ret;
 489 
 490     
 491 
 492 
 493 
 494 
 495 
 496 
 497 
 498     if (OPAL_SUCCESS != (ret = opal_finalize())) {
 499         return ret;
 500     }
 501 #endif
 502 
 503     return OPAL_SUCCESS;
 504 }
 505 
 506 static int parse_args(int argc, char *argv[])
 507 {
 508     int i, ret, len;
 509     opal_cmd_line_t cmd_line;
 510     char **app_env = NULL, **global_env = NULL;
 511 
 512     opal_restart_globals.help = false;
 513     opal_restart_globals.verbose = false;
 514     opal_restart_globals.snapshot_ref = NULL;
 515     opal_restart_globals.snapshot_loc = NULL;
 516     opal_restart_globals.snapshot_metadata = NULL;
 517     opal_restart_globals.snapshot_cache = NULL;
 518     opal_restart_globals.snapshot_compress = NULL;
 519     opal_restart_globals.snapshot_compress_postfix = NULL;
 520     opal_restart_globals.output = 0;
 521 
 522     
 523     opal_cmd_line_create(&cmd_line, cmd_line_opts);
 524 
 525     mca_base_open();
 526     mca_base_cmd_line_setup(&cmd_line);
 527     ret = opal_cmd_line_parse(&cmd_line, false, false, argc, argv);
 528     if (OPAL_SUCCESS != ret) {
 529         if (OPAL_ERR_SILENT != ret) {
 530             fprintf(stderr, "%s: command line error (%s)\n", argv[0],
 531                     opal_strerror(ret));
 532         }
 533         return 1;
 534     }
 535     if (opal_restart_globals.help ) {
 536         char *str, *args = NULL;
 537         args = opal_cmd_line_get_usage_msg(&cmd_line);
 538         str = opal_show_help_string("help-opal-restart.txt", "usage", true,
 539                                     args);
 540         if (NULL != str) {
 541             printf("%s", str);
 542             free(str);
 543         }
 544         free(args);
 545         
 546         exit(0);
 547     }
 548 
 549     
 550 
 551 
 552     mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
 553 
 554     len = opal_argv_count(app_env);
 555     for(i = 0; i < len; ++i) {
 556         putenv(app_env[i]);
 557     }
 558 
 559     len = opal_argv_count(global_env);
 560     for(i = 0; i < len; ++i) {
 561         putenv(global_env[i]);
 562     }
 563 
 564     
 565 
 566 
 567     
 568     opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
 569 
 570     if ( NULL == opal_restart_globals.snapshot_ref ||
 571          0 >= strlen(opal_restart_globals.snapshot_ref) ) {
 572         opal_show_help("help-opal-restart.txt", "invalid_filename", true,
 573                        "<none provided>");
 574         return OPAL_ERROR;
 575     }
 576 
 577     
 578 
 579 
 580 
 581     if(argc > 0) {
 582         opal_restart_globals.snapshot_ref = strdup(opal_argv_join(argv, ' '));
 583     }
 584 
 585     return OPAL_SUCCESS;
 586 }
 587 
 588 static int check_file(void)
 589 {
 590     int exit_status = OPAL_SUCCESS;
 591     int ret;
 592     char * path_to_check = NULL;
 593 
 594     if(NULL == opal_restart_globals.snapshot_ref) {
 595         opal_output(opal_restart_globals.output,
 596                     "Error: No filename provided!");
 597         exit_status = OPAL_ERROR;
 598         goto cleanup;
 599     }
 600 
 601     
 602 
 603 
 604     opal_asprintf(&path_to_check, "%s/%s",
 605              opal_restart_globals.snapshot_loc,
 606              opal_restart_globals.snapshot_ref);
 607 
 608     opal_output_verbose(10, opal_restart_globals.output,
 609                         "Checking for the existence of (%s)",
 610                         path_to_check);
 611 
 612     if (0 >  (ret = access(path_to_check, F_OK)) ) {
 613         exit_status = OPAL_ERROR;
 614         goto cleanup;
 615     }
 616 
 617  cleanup:
 618     if( NULL != path_to_check) {
 619         free(path_to_check);
 620         path_to_check = NULL;
 621     }
 622 
 623     return exit_status;
 624 }
 625 
 626 static int post_env_vars(int prev_pid, opal_crs_base_snapshot_t *snapshot)
 627 {
 628     int ret, exit_status = OPAL_SUCCESS;
 629     char *command = NULL;
 630     char *proc_file = NULL;
 631     char **loc_touch = NULL;
 632     char **loc_mkdir = NULL;
 633     int argc, i;
 634 
 635     if( 0 > prev_pid ) {
 636         opal_output(opal_restart_globals.output,
 637                     "Invalid PID (%d)\n",
 638                     prev_pid);
 639         exit_status = OPAL_ERROR;
 640         goto cleanup;
 641     }
 642 
 643     
 644 
 645 
 646 
 647     opal_asprintf(&proc_file, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid);
 648     opal_asprintf(&command, "env | grep OMPI_ > %s", proc_file);
 649 
 650     opal_output_verbose(5, opal_restart_globals.output,
 651                         "post_env_vars: Execute: <%s>", command);
 652 
 653     ret = system(command);
 654     if( 0 > ret) {
 655         exit_status = ret;
 656         goto cleanup;
 657     }
 658 
 659     
 660 
 661 
 662     if( NULL == (snapshot->metadata = fopen(snapshot->metadata_filename, "r")) ) {
 663         opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
 664                        opal_restart_globals.snapshot_metadata,
 665                        snapshot->metadata_filename);
 666         exit_status = OPAL_ERROR;
 667         goto cleanup;
 668     }
 669     opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_MKDIR, &loc_mkdir);
 670     argc = opal_argv_count(loc_mkdir);
 671     for( i = 0; i < argc; ++i ) {
 672         if( NULL != command ) {
 673             free(command);
 674             command = NULL;
 675         }
 676         opal_asprintf(&command, "mkdir -p %s", loc_mkdir[i]);
 677 
 678         opal_output_verbose(5, opal_restart_globals.output,
 679                             "post_env_vars: Execute: <%s>", command);
 680 
 681         ret = system(command);
 682         if( 0 > ret) {
 683             exit_status = ret;
 684             goto cleanup;
 685         }
 686     }
 687     if( 0 < argc ) {
 688         system("sync ; sync");
 689     }
 690 
 691     
 692 
 693 
 694     opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_TOUCH, &loc_touch);
 695     argc = opal_argv_count(loc_touch);
 696     for( i = 0; i < argc; ++i ) {
 697         if( NULL != command ) {
 698             free(command);
 699             command = NULL;
 700         }
 701         opal_asprintf(&command, "touch %s", loc_touch[i]);
 702 
 703         opal_output_verbose(5, opal_restart_globals.output,
 704                             "post_env_vars: Execute: <%s>", command);
 705 
 706         ret = system(command);
 707         if( 0 > ret) {
 708             exit_status = ret;
 709             goto cleanup;
 710         }
 711     }
 712     if( 0 < argc ) {
 713         system("sync ; sync");
 714     }
 715 
 716  cleanup:
 717     if( NULL != command) {
 718         free(command);
 719         command = NULL;
 720     }
 721     if( NULL != proc_file) {
 722         free(proc_file);
 723         proc_file = NULL;
 724     }
 725     if( NULL != loc_mkdir ) {
 726         opal_argv_free(loc_mkdir);
 727         loc_mkdir = NULL;
 728     }
 729     if( NULL != loc_touch ) {
 730         opal_argv_free(loc_touch);
 731         loc_touch = NULL;
 732     }
 733 
 734     if( NULL != snapshot->metadata ) {
 735         fclose(snapshot->metadata);
 736         snapshot->metadata = NULL;
 737     }
 738 
 739     return exit_status;
 740 }