This source file includes the following definitions.
- main
- initialize
- finalize
- parse_args
- notify_process_for_checkpoint
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 
  28 
  29 
  30 
  31 #include "opal_config.h"
  32 
#include <stdio.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#include <string.h>
#include <signal.h>
  53 
  54 #include "opal/constants.h"
  55 
  56 #include "opal/util/cmd_line.h"
  57 #include "opal/util/argv.h"
  58 #include "opal/util/show_help.h"
  59 #include "opal/util/opal_environ.h"
  60 #include "opal/util/error.h"
  61 #include "opal/util/output.h"
  62 #include "opal/util/printf.h"
  63 #include "opal/mca/base/base.h"
  64 
  65 #include "opal/runtime/opal.h"
  66 #include "opal/runtime/opal_cr.h"
  67 
  68 #include "opal/mca/crs/crs.h"
  69 #include "opal/mca/crs/base/base.h"
  70 
  71 
  72 
  73 
  74 
  75 
  76 
  77 
  78 static int initialize(int argc, char *argv[]);
  79 static int finalize(void);
  80 static int parse_args(int argc, char *argv[]);
  81 static int notify_process_for_checkpoint(pid_t pid, char **fname, int term,
  82                                          opal_crs_state_type_t *state);
  83 
  84 
  85 
  86 
  87 typedef struct {
  88     bool help;
  89     int pid;
  90     bool term;
  91     bool verbose;
  92     bool quiet;
  93     char *snapshot_name;
  94     char *snapshot_loc;
  95     int output;
  96 } opal_checkpoint_globals_t;
  97 
  98 opal_checkpoint_globals_t opal_checkpoint_globals;
  99 
 100 opal_cmd_line_init_t cmd_line_opts[] = {
 101     { NULL,
 102       'h', NULL, "help",
 103       0,
 104       &opal_checkpoint_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
 105       "This help message" },
 106 
 107     { NULL,
 108       'v', NULL, "verbose",
 109       0,
 110       &opal_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
 111       "Be Verbose" },
 112 
 113     { NULL,
 114       'q', NULL, "quiet",
 115       0,
 116       &opal_checkpoint_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
 117       "Be Super Quiet" },
 118 
 119     { NULL,
 120       '\0', NULL, "term",
 121       0,
 122       &opal_checkpoint_globals.term, OPAL_CMD_LINE_TYPE_BOOL,
 123       "Terminate the application after checkpoint" },
 124 
 125     { NULL,
 126       'n', NULL, "name",
 127       1,
 128       &opal_checkpoint_globals.snapshot_name, OPAL_CMD_LINE_TYPE_STRING,
 129       "Request a specific snapshot reference." },
 130 
 131     { "crs_base_snapshot_dir",
 132       'w', NULL, "where",
 133       1,
 134       &opal_checkpoint_globals.snapshot_loc, OPAL_CMD_LINE_TYPE_STRING,
 135       "Where to place the checkpoint files. Note: You must remember this "
 136       "location to pass into opal-restart, as it may not be able to find "
 137       "the desired directory." },
 138 
 139     
 140     { NULL, '\0', NULL, NULL, 0,
 141       NULL, OPAL_CMD_LINE_TYPE_NULL,
 142       NULL }
 143 };
 144 
 145 int
 146 main(int argc, char *argv[])
 147 {
 148     int ret, exit_status = OPAL_SUCCESS;
 149     char *fname = NULL;
 150     opal_crs_state_type_t cr_state;
 151 
 152     
 153 
 154 
 155     if (OPAL_SUCCESS != (ret = initialize(argc, argv))) {
 156         exit_status = ret;
 157         goto cleanup;
 158     }
 159 
 160     
 161 
 162 
 163     opal_output_verbose(10, opal_checkpoint_globals.output,
 164                         "opal_checkpoint: Checkpointing PID %d",
 165                         opal_checkpoint_globals.pid);
 166     if( opal_checkpoint_globals.term ) {
 167         opal_output_verbose(10, opal_checkpoint_globals.output,
 168                             "\tTerminating application after checkpoint");
 169     }
 170 
 171     ret = notify_process_for_checkpoint(opal_checkpoint_globals.pid,
 172                                         &fname,
 173                                         opal_checkpoint_globals.term,
 174                                         &cr_state);
 175     if (OPAL_SUCCESS != ret ||
 176         cr_state == OPAL_CRS_ERROR) {
 177         opal_show_help("help-opal-checkpoint.txt", "ckpt_failure", true,
 178                        opal_checkpoint_globals.pid, ret, cr_state);
 179         exit_status = ret;
 180         goto cleanup;
 181     }
 182 
 183     if( !opal_checkpoint_globals.quiet ) {
 184         opal_output(opal_checkpoint_globals.output,
 185                     "Local Snapshot Reference = %s\n",
 186                     fname);
 187     }
 188 
 189  cleanup:
 190     
 191 
 192 
 193     if (OPAL_SUCCESS != (ret = finalize())) {
 194         return ret;
 195     }
 196 
 197     return exit_status;
 198 }
 199 
 200 static int initialize(int argc, char *argv[]) {
 201     int ret, exit_status = OPAL_SUCCESS;
 202     char * tmp_env_var = NULL;
 203 
 204     
 205 
 206 
 207 
 208 
 209     if( OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
 210         return ret;
 211     }
 212 
 213     
 214 
 215 
 216     if (OPAL_SUCCESS != (ret = parse_args(argc, argv))) {
 217         exit_status = ret;
 218         goto cleanup;
 219     }
 220 
 221     
 222 
 223 
 224     if( opal_checkpoint_globals.verbose ) {
 225         opal_checkpoint_globals.quiet = false; 
 226         opal_checkpoint_globals.output = opal_output_open(NULL);
 227         opal_output_set_verbosity(opal_checkpoint_globals.output, 10);
 228     } else {
 229         opal_checkpoint_globals.output = 0; 
 230     }
 231 
 232     
 233 
 234 
 235 
 236 
 237     opal_cr_set_enabled(false);
 238 
 239     
 240 
 241 
 242 
 243     (void) mca_base_var_env_name("crs", &tmp_env_var);
 244     opal_setenv(tmp_env_var,
 245                 "none",
 246                 true, &environ);
 247     free(tmp_env_var);
 248     tmp_env_var = NULL;
 249 
 250     
 251 
 252 
 253     if (OPAL_SUCCESS != (ret = opal_init(&argc, &argv))) {
 254         exit_status = ret;
 255         goto cleanup;
 256     }
 257 
 258  cleanup:
 259     return exit_status;
 260 }
 261 
 262 static int finalize(void) {
 263     int ret = OPAL_SUCCESS;
 264 
 265     if (OPAL_SUCCESS != (ret = opal_finalize())) {
 266         return ret;
 267     }
 268 
 269     return OPAL_SUCCESS;
 270 }
 271 
 272 static int parse_args(int argc, char *argv[]) {
 273     int i, ret, len;
 274     opal_cmd_line_t cmd_line;
 275     char **app_env = NULL, **global_env = NULL;
 276     char * tmp_env_var = NULL;
 277     char *argv0 = NULL;
 278 
 279     memset(&opal_checkpoint_globals, 0, sizeof(opal_checkpoint_globals_t));
 280 
 281     opal_checkpoint_globals.snapshot_name = NULL;
 282     opal_checkpoint_globals.snapshot_loc  = NULL;
 283 
 284     
 285     opal_cmd_line_create(&cmd_line, cmd_line_opts);
 286     mca_base_open();
 287     mca_base_cmd_line_setup(&cmd_line);
 288     ret = opal_cmd_line_parse(&cmd_line, true, false, argc, argv);
 289 
 290     if (OPAL_SUCCESS != ret) {
 291         if (OPAL_ERR_SILENT != ret) {
 292             fprintf(stderr, "%s: command line error (%s)\n", argv[0],
 293                     opal_strerror(ret));
 294         }
 295         return 1;
 296     }
 297     if (opal_checkpoint_globals.help) {
 298         char *str, *args = NULL;
 299         args = opal_cmd_line_get_usage_msg(&cmd_line);
 300         str = opal_show_help_string("help-opal-checkpoint.txt", "usage", true,
 301                                     args);
 302         if (NULL != str) {
 303             printf("%s", str);
 304             free(str);
 305         }
 306         free(args);
 307         
 308         exit(0);
 309     }
 310 
 311     
 312 
 313 
 314     mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
 315 
 316     len = opal_argv_count(app_env);
 317     for(i = 0; i < len; ++i) {
 318         putenv(app_env[i]);
 319     }
 320 
 321     len = opal_argv_count(global_env);
 322     for(i = 0; i < len; ++i) {
 323         putenv(global_env[i]);
 324     }
 325 
 326     (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
 327     opal_setenv(tmp_env_var,
 328                 "1",
 329                 true, &environ);
 330     free(tmp_env_var);
 331     tmp_env_var = NULL;
 332 
 333     
 334 
 335 
 336 
 337     if( NULL == opal_checkpoint_globals.snapshot_name )
 338         opal_checkpoint_globals.snapshot_name = strdup("");
 339     if( NULL == opal_checkpoint_globals.snapshot_loc ) {
 340         opal_checkpoint_globals.snapshot_loc = strdup("");
 341     }
 342 
 343     
 344     argv0 = strdup(argv[0]);
 345     opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
 346 
 347     if (0 == argc) {
 348         fprintf(stderr, "%s: Nothing to do\n", argv0);
 349         fprintf(stderr, "Type '%s --help' for usage.\n", argv0);
 350         free(argv0);
 351         return OPAL_ERROR;
 352     }
 353     free(argv0);
 354 
 355     opal_checkpoint_globals.pid = atoi(argv[0]);
 356     if ( 0 >= opal_checkpoint_globals.pid ) {
 357         opal_show_help("help-opal-checkpoint.txt", "invalid_pid", true,
 358                        opal_checkpoint_globals.pid);
 359         return OPAL_ERROR;
 360     }
 361 
 362     return OPAL_SUCCESS;
 363 }
 364 
 365 static int
 366 notify_process_for_checkpoint(pid_t pid, char **fname, int term, opal_crs_state_type_t *cr_state)
 367 {
 368     char *prog_named_pipe_r = NULL, *prog_named_pipe_w = NULL;
 369     int   prog_named_read_pipe_fd = -1, prog_named_write_pipe_fd = -1;
 370     char *loc_fname = NULL, *tmp_pid = NULL;
 371     unsigned char cmd;
 372     int len, ret;
 373     int exit_status = OPAL_SUCCESS;
 374     int s, max_wait_time = 20; 
 375     ssize_t tmp_size = 0;
 376     int value;
 377 
 378     
 379     opal_asprintf(&tmp_pid, "%d", pid);
 380 
 381     
 382     opal_asprintf(&prog_named_pipe_w, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_R, tmp_pid);
 383     opal_asprintf(&prog_named_pipe_r, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_W, tmp_pid);
 384 
 385     
 386 
 387 
 388     if( 0 != (ret = kill(pid, opal_cr_entry_point_signal) ) ) {
 389         exit_status = ret;
 390         goto cleanup;
 391     }
 392 
 393     opal_output_verbose(10, opal_checkpoint_globals.output,
 394                         "opal_checkpoint: Looking for Named Pipes (%s) (%s)\n",
 395                         prog_named_pipe_r, prog_named_pipe_w);
 396 
 397     for( s = 0; s < max_wait_time; ++s) {
 398         
 399 
 400 
 401         if( 0 > (ret = access(prog_named_pipe_r, F_OK) )) {
 402             
 403             if( !opal_checkpoint_globals.quiet &&
 404                 s >= max_wait_time - 5 ) {
 405                 opal_output(0, "opal-checkpoint: File does not exist yet: <%s> rtn = %d (waited %d/%d sec)\n",
 406                             prog_named_pipe_r, ret, s, max_wait_time);
 407             }
 408             sleep(1);
 409             continue;
 410         }
 411         else if( 0 > (ret = access(prog_named_pipe_w, F_OK) )) {
 412             
 413             if( !opal_checkpoint_globals.quiet &&
 414                 s >= max_wait_time - 5 ) {
 415                 opal_output(0, "opal-checkpoint: File does not exist yet: <%s> rtn = %d (waited %d/%d sec)\n",
 416                             prog_named_pipe_w, ret, s, max_wait_time);
 417             }
 418             sleep(1);
 419             continue;
 420         }
 421         else {
 422             break;
 423         }
 424     }
 425     if( s == max_wait_time ) {
 426         
 427 
 428 
 429 
 430 
 431 
 432 
 433 
 434         opal_show_help("help-opal-checkpoint.txt", "pid_does_not_exist", true,
 435                        opal_checkpoint_globals.pid, prog_named_pipe_r, prog_named_pipe_w);
 436 
 437         *cr_state = OPAL_CRS_ERROR;
 438 
 439         exit_status = OPAL_ERROR;
 440         goto cleanup;
 441     }
 442 
 443     
 444 
 445     
 446 
 447 
 448 
 449 
 450 
 451 
 452 
 453 
 454     prog_named_write_pipe_fd = open(prog_named_pipe_w, O_WRONLY);
 455     if(prog_named_write_pipe_fd < 0) {
 456         opal_output(opal_checkpoint_globals.output,
 457                     "opal_checkpoint: Error: Unable to open name pipe (%s). %d\n",
 458                     prog_named_pipe_w, prog_named_write_pipe_fd);
 459         exit_status = OPAL_ERROR;
 460         goto cleanup;
 461     }
 462 
 463     prog_named_read_pipe_fd = open(prog_named_pipe_r, O_RDWR);
 464     if(prog_named_read_pipe_fd < 0) {
 465         opal_output(opal_checkpoint_globals.output,
 466                     "opal_checkpoint: Error: Unable to open name pipe (%s). %d\n",
 467                     prog_named_pipe_r, prog_named_read_pipe_fd);
 468         exit_status = OPAL_ERROR;
 469         goto cleanup;
 470     }
 471 
 472     
 473 
 474 
 475     len = 0;
 476     if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &len, sizeof(int))) ) {
 477         opal_output(opal_checkpoint_globals.output,
 478                     "opal_checkpoint: Error: Unable to write handshake to named pipe (%s). %d\n",
 479                     prog_named_pipe_w, ret);
 480         exit_status = OPAL_ERROR;
 481         goto cleanup;
 482     }
 483 
 484     if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &value, sizeof(int))) ) {
 485         opal_output(opal_checkpoint_globals.output,
 486                     "opal_checkpoint: Error: Unable to read length from named pipe (%s). %d\n",
 487                     prog_named_pipe_r, ret);
 488         exit_status = OPAL_ERROR;
 489         goto cleanup;
 490     }
 491 
 492     
 493     if( OPAL_CHECKPOINT_CMD_IN_PROGRESS == value ) {
 494         opal_show_help("help-opal-checkpoint.txt",
 495                        "ckpt:in_progress",
 496                        true,
 497                        opal_checkpoint_globals.pid);
 498         exit_status = OPAL_ERROR;
 499         goto cleanup;
 500     }
 501     else if( OPAL_CHECKPOINT_CMD_NULL == value ) {
 502         opal_show_help("help-opal-checkpoint.txt",
 503                        "ckpt:req_null",
 504                        true,
 505                        opal_checkpoint_globals.pid);
 506         exit_status = OPAL_ERROR;
 507         goto cleanup;
 508     }
 509     else if ( OPAL_CHECKPOINT_CMD_ERROR == value ) {
 510         opal_show_help("help-opal-checkpoint.txt",
 511                        "ckpt:req_error",
 512                        true,
 513                        opal_checkpoint_globals.pid);
 514         exit_status = OPAL_ERROR;
 515         goto cleanup;
 516     }
 517 
 518     
 519 
 520 
 521 
 522     cmd = OPAL_CR_CHECKPOINT;
 523     
 524     if( sizeof(cmd) != (ret = write(prog_named_write_pipe_fd, &cmd, sizeof(cmd))) ) {
 525         opal_output(opal_checkpoint_globals.output,
 526                     "opal_checkpoint: Error: Unable to write CHECKPOINT Command to named pipe (%s). %d\n",
 527                     prog_named_pipe_w, ret);
 528         exit_status = OPAL_ERROR;
 529         goto cleanup;
 530     }
 531 
 532     
 533     if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &pid, sizeof(int))) ) {
 534         opal_output(opal_checkpoint_globals.output,
 535                     "opal_checkpoint: Error: Unable to write pid (%d) to named pipe (%s). %d\n",
 536                     pid, prog_named_pipe_w, ret);
 537         exit_status = OPAL_ERROR;
 538         goto cleanup;
 539     }
 540 
 541     if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &term, sizeof(int))) ) {
 542         opal_output(opal_checkpoint_globals.output,
 543                     "opal_checkpoint: Error: Unable to write term (%d) to named pipe (%s), %d\n",
 544                     term, prog_named_pipe_w, ret);
 545         exit_status = OPAL_ERROR;
 546         goto cleanup;
 547     }
 548 
 549     
 550     len = strlen(opal_checkpoint_globals.snapshot_name) + 1;
 551     if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &len, sizeof(int))) ) {
 552         opal_output(opal_checkpoint_globals.output,
 553                     "opal_checkpoint: Error: Unable to write snapshot name len (%d) to named pipe (%s). %d\n",
 554                     len, prog_named_pipe_w, ret);
 555         exit_status = OPAL_ERROR;
 556         goto cleanup;
 557     }
 558 
 559     tmp_size = sizeof(char) * len;
 560     if( tmp_size != (ret = write(prog_named_write_pipe_fd, (opal_checkpoint_globals.snapshot_name), (sizeof(char) * len))) ) {
 561         opal_output(opal_checkpoint_globals.output,
 562                     "opal_checkpoint: Error: Unable to write snapshot name (%s) to named pipe (%s). %d\n",
 563                     opal_checkpoint_globals.snapshot_name, prog_named_pipe_w, ret);
 564         exit_status = OPAL_ERROR;
 565         goto cleanup;
 566     }
 567 
 568     
 569     len = strlen(opal_checkpoint_globals.snapshot_loc) + 1;
 570     if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &len, sizeof(int))) ) {
 571         opal_output(opal_checkpoint_globals.output,
 572                     "opal_checkpoint: Error: Unable to write snapshot location len (%d) to named pipe (%s). %d\n",
 573                     len, prog_named_pipe_w, ret);
 574         exit_status = OPAL_ERROR;
 575         goto cleanup;
 576     }
 577 
 578     tmp_size = sizeof(char) * len;
 579     if( tmp_size != (ret = write(prog_named_write_pipe_fd, (opal_checkpoint_globals.snapshot_loc), (sizeof(char) * len))) ) {
 580         opal_output(opal_checkpoint_globals.output,
 581                     "opal_checkpoint: Error: Unable to write snapshot location (%s) to named pipe (%s). %d\n",
 582                     opal_checkpoint_globals.snapshot_loc, prog_named_pipe_w, ret);
 583         exit_status = OPAL_ERROR;
 584         goto cleanup;
 585     }
 586 
 587     
 588 
 589 
 590 
 591     if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &len, sizeof(int))) ) {
 592         opal_output(opal_checkpoint_globals.output,
 593                     "opal_checkpoint: Error: Unable to read length from named pipe (%s). %d\n",
 594                     prog_named_pipe_r, ret);
 595         exit_status = OPAL_ERROR;
 596         goto cleanup;
 597     }
 598 
 599     if(len > 0) {
 600         loc_fname = (char *) malloc(sizeof(char) * len);
 601         if( (ssize_t)(sizeof(char) * len) != (ret = read(prog_named_read_pipe_fd, loc_fname, (sizeof(char) * len))) ) {
 602             opal_output(opal_checkpoint_globals.output,
 603                         "opal_checkpoint: Error: Unable to read filename from named pipe (%s). %d\n",
 604                         prog_named_pipe_w, ret);
 605             exit_status = OPAL_ERROR;
 606             goto cleanup;
 607         }
 608     }
 609 
 610     *fname = strdup(loc_fname);
 611     if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &cr_state, sizeof(int))) ) {
 612         opal_output(opal_checkpoint_globals.output,
 613                     "opal_checkpoint: Error: Unable to read state from named pipe (%s). %d\n",
 614                     prog_named_pipe_r, ret);
 615         exit_status = OPAL_ERROR;
 616         goto cleanup;
 617     }
 618 
 619  cleanup:
 620     
 621 
 622 
 623     close(prog_named_write_pipe_fd);
 624     close(prog_named_read_pipe_fd);
 625 
 626     if( NULL != tmp_pid)
 627         free(tmp_pid);
 628     if( NULL != prog_named_pipe_r)
 629         free(prog_named_pipe_r);
 630     if( NULL != prog_named_pipe_w)
 631         free(prog_named_pipe_w);
 632 
 633     return exit_status;
 634 }