This source file includes the following definitions.
- main
- initialize
- finalize
- parse_args
- notify_process_for_checkpoint
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 #include "opal_config.h"
32
33 #include <stdio.h>
34 #include <errno.h>
35 #include <stdlib.h>
36 #ifdef HAVE_UNISTD_H
37 #include <unistd.h>
38 #endif
39 #ifdef HAVE_FCNTL_H
40 #include <fcntl.h>
41 #endif
42 #ifdef HAVE_SYS_TYPES_H
43 #include <sys/types.h>
44 #endif
45 #ifdef HAVE_SYS_STAT_H
46 #include <sys/stat.h>
47 #endif
48 #ifdef HAVE_SYS_WAIT_H
49 #include <sys/wait.h>
50 #endif
51 #include <string.h>
52 #include <signal.h>
53
54 #include "opal/constants.h"
55
56 #include "opal/util/cmd_line.h"
57 #include "opal/util/argv.h"
58 #include "opal/util/show_help.h"
59 #include "opal/util/opal_environ.h"
60 #include "opal/util/error.h"
61 #include "opal/util/output.h"
62 #include "opal/util/printf.h"
63 #include "opal/mca/base/base.h"
64
65 #include "opal/runtime/opal.h"
66 #include "opal/runtime/opal_cr.h"
67
68 #include "opal/mca/crs/crs.h"
69 #include "opal/mca/crs/base/base.h"
70
71
72
73
74
75
76
77
/*
 * Forward declarations for the file-local helpers that main() and
 * initialize() call; their definitions appear later in this file.
 */
78 static int initialize(int argc, char *argv[]);
79 static int finalize(void);
80 static int parse_args(int argc, char *argv[]);
/*
 * Note: the last parameter is named "state" here, while the definition
 * further down names it "cr_state"; both refer to the same out-parameter.
 */
81 static int notify_process_for_checkpoint(pid_t pid, char **fname, int term,
82 opal_crs_state_type_t *state);
83
84
85
86
/*
 * Settings for one run of the tool.  parse_args() zeroes the structure and
 * the cmd_line_opts table stores parsed option values directly into its
 * fields through the addresses listed there.
 */
87 typedef struct {
/* Target of the -h / --help option; parse_args() prints usage and exits when set. */
88 bool help;
/* PID of the process to checkpoint, parsed from the first non-option argument. */
89 int pid;
/* Target of the --term option; forwarded to the target process as the "term" flag. */
90 bool term;
/* Target of the -v / --verbose option; initialize() opens a verbose output stream when set. */
91 bool verbose;
/* Target of the -q / --quiet option; initialize() clears it again when verbose is set. */
92 bool quiet;
/* Target of the -n / --name option; parse_args() replaces NULL with an empty string. */
93 char *snapshot_name;
/* Target of the -w / --where option; parse_args() replaces NULL with an empty string. */
94 char *snapshot_loc;
/* opal_output stream id: opal_output_open(NULL) when verbose, otherwise 0. */
95 int output;
96 } opal_checkpoint_globals_t;
97
/* Single instance shared by every function in this file. */
98 opal_checkpoint_globals_t opal_checkpoint_globals;
99
/*
 * Command-line option table handed to opal_cmd_line_create() in parse_args().
 * Each entry names a short and/or long option, the opal_checkpoint_globals
 * field that receives its value, the value type, and a help string.
 * NOTE(review): the first field looks like an MCA parameter name and the
 * integer field looks like the number of arguments the option consumes --
 * confirm against the opal_cmd_line_init_t definition.
 */
100 opal_cmd_line_init_t cmd_line_opts[] = {
/* -h / --help: boolean flag stored in opal_checkpoint_globals.help. */
101 { NULL,
102 'h', NULL, "help",
103 0,
104 &opal_checkpoint_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
105 "This help message" },
106
/* -v / --verbose: boolean flag stored in opal_checkpoint_globals.verbose. */
107 { NULL,
108 'v', NULL, "verbose",
109 0,
110 &opal_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
111 "Be Verbose" },
112
/* -q / --quiet: boolean flag stored in opal_checkpoint_globals.quiet. */
113 { NULL,
114 'q', NULL, "quiet",
115 0,
116 &opal_checkpoint_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
117 "Be Super Quiet" },
118
/* --term (no short form): boolean flag stored in opal_checkpoint_globals.term. */
119 { NULL,
120 '\0', NULL, "term",
121 0,
122 &opal_checkpoint_globals.term, OPAL_CMD_LINE_TYPE_BOOL,
123 "Terminate the application after checkpoint" },
124
/* -n / --name: string stored in opal_checkpoint_globals.snapshot_name. */
125 { NULL,
126 'n', NULL, "name",
127 1,
128 &opal_checkpoint_globals.snapshot_name, OPAL_CMD_LINE_TYPE_STRING,
129 "Request a specific snapshot reference." },
130
/*
 * -w / --where: string stored in opal_checkpoint_globals.snapshot_loc.
 * This is the only entry with a non-NULL first field ("crs_base_snapshot_dir").
 */
131 { "crs_base_snapshot_dir",
132 'w', NULL, "where",
133 1,
134 &opal_checkpoint_globals.snapshot_loc, OPAL_CMD_LINE_TYPE_STRING,
135 "Where to place the checkpoint files. Note: You must remember this "
136 "location to pass into opal-restart, as it may not be able to find "
137 "the desired directory." },
138
139
/* All-NULL entry of type OPAL_CMD_LINE_TYPE_NULL; presumably the end-of-table sentinel. */
140 { NULL, '\0', NULL, NULL, 0,
141 NULL, OPAL_CMD_LINE_TYPE_NULL,
142 NULL }
143 };
144
145 int
146 main(int argc, char *argv[])
147 {
148 int ret, exit_status = OPAL_SUCCESS;
149 char *fname = NULL;
150 opal_crs_state_type_t cr_state;
151
152
153
154
155 if (OPAL_SUCCESS != (ret = initialize(argc, argv))) {
156 exit_status = ret;
157 goto cleanup;
158 }
159
160
161
162
163 opal_output_verbose(10, opal_checkpoint_globals.output,
164 "opal_checkpoint: Checkpointing PID %d",
165 opal_checkpoint_globals.pid);
166 if( opal_checkpoint_globals.term ) {
167 opal_output_verbose(10, opal_checkpoint_globals.output,
168 "\tTerminating application after checkpoint");
169 }
170
171 ret = notify_process_for_checkpoint(opal_checkpoint_globals.pid,
172 &fname,
173 opal_checkpoint_globals.term,
174 &cr_state);
175 if (OPAL_SUCCESS != ret ||
176 cr_state == OPAL_CRS_ERROR) {
177 opal_show_help("help-opal-checkpoint.txt", "ckpt_failure", true,
178 opal_checkpoint_globals.pid, ret, cr_state);
179 exit_status = ret;
180 goto cleanup;
181 }
182
183 if( !opal_checkpoint_globals.quiet ) {
184 opal_output(opal_checkpoint_globals.output,
185 "Local Snapshot Reference = %s\n",
186 fname);
187 }
188
189 cleanup:
190
191
192
193 if (OPAL_SUCCESS != (ret = finalize())) {
194 return ret;
195 }
196
197 return exit_status;
198 }
199
200 static int initialize(int argc, char *argv[]) {
201 int ret, exit_status = OPAL_SUCCESS;
202 char * tmp_env_var = NULL;
203
204
205
206
207
208
209 if( OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
210 return ret;
211 }
212
213
214
215
216 if (OPAL_SUCCESS != (ret = parse_args(argc, argv))) {
217 exit_status = ret;
218 goto cleanup;
219 }
220
221
222
223
224 if( opal_checkpoint_globals.verbose ) {
225 opal_checkpoint_globals.quiet = false;
226 opal_checkpoint_globals.output = opal_output_open(NULL);
227 opal_output_set_verbosity(opal_checkpoint_globals.output, 10);
228 } else {
229 opal_checkpoint_globals.output = 0;
230 }
231
232
233
234
235
236
237 opal_cr_set_enabled(false);
238
239
240
241
242
243 (void) mca_base_var_env_name("crs", &tmp_env_var);
244 opal_setenv(tmp_env_var,
245 "none",
246 true, &environ);
247 free(tmp_env_var);
248 tmp_env_var = NULL;
249
250
251
252
253 if (OPAL_SUCCESS != (ret = opal_init(&argc, &argv))) {
254 exit_status = ret;
255 goto cleanup;
256 }
257
258 cleanup:
259 return exit_status;
260 }
261
/**
 * Shut OPAL down.
 *
 * @return Whatever opal_finalize() returns (OPAL_SUCCESS on success).
 */
static int finalize(void) {
    return opal_finalize();
}
271
272 static int parse_args(int argc, char *argv[]) {
273 int i, ret, len;
274 opal_cmd_line_t cmd_line;
275 char **app_env = NULL, **global_env = NULL;
276 char * tmp_env_var = NULL;
277 char *argv0 = NULL;
278
279 memset(&opal_checkpoint_globals, 0, sizeof(opal_checkpoint_globals_t));
280
281 opal_checkpoint_globals.snapshot_name = NULL;
282 opal_checkpoint_globals.snapshot_loc = NULL;
283
284
285 opal_cmd_line_create(&cmd_line, cmd_line_opts);
286 mca_base_open();
287 mca_base_cmd_line_setup(&cmd_line);
288 ret = opal_cmd_line_parse(&cmd_line, true, false, argc, argv);
289
290 if (OPAL_SUCCESS != ret) {
291 if (OPAL_ERR_SILENT != ret) {
292 fprintf(stderr, "%s: command line error (%s)\n", argv[0],
293 opal_strerror(ret));
294 }
295 return 1;
296 }
297 if (opal_checkpoint_globals.help) {
298 char *str, *args = NULL;
299 args = opal_cmd_line_get_usage_msg(&cmd_line);
300 str = opal_show_help_string("help-opal-checkpoint.txt", "usage", true,
301 args);
302 if (NULL != str) {
303 printf("%s", str);
304 free(str);
305 }
306 free(args);
307
308 exit(0);
309 }
310
311
312
313
314 mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
315
316 len = opal_argv_count(app_env);
317 for(i = 0; i < len; ++i) {
318 putenv(app_env[i]);
319 }
320
321 len = opal_argv_count(global_env);
322 for(i = 0; i < len; ++i) {
323 putenv(global_env[i]);
324 }
325
326 (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
327 opal_setenv(tmp_env_var,
328 "1",
329 true, &environ);
330 free(tmp_env_var);
331 tmp_env_var = NULL;
332
333
334
335
336
337 if( NULL == opal_checkpoint_globals.snapshot_name )
338 opal_checkpoint_globals.snapshot_name = strdup("");
339 if( NULL == opal_checkpoint_globals.snapshot_loc ) {
340 opal_checkpoint_globals.snapshot_loc = strdup("");
341 }
342
343
344 argv0 = strdup(argv[0]);
345 opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
346
347 if (0 == argc) {
348 fprintf(stderr, "%s: Nothing to do\n", argv0);
349 fprintf(stderr, "Type '%s --help' for usage.\n", argv0);
350 free(argv0);
351 return OPAL_ERROR;
352 }
353 free(argv0);
354
355 opal_checkpoint_globals.pid = atoi(argv[0]);
356 if ( 0 >= opal_checkpoint_globals.pid ) {
357 opal_show_help("help-opal-checkpoint.txt", "invalid_pid", true,
358 opal_checkpoint_globals.pid);
359 return OPAL_ERROR;
360 }
361
362 return OPAL_SUCCESS;
363 }
364
365 static int
366 notify_process_for_checkpoint(pid_t pid, char **fname, int term, opal_crs_state_type_t *cr_state)
367 {
368 char *prog_named_pipe_r = NULL, *prog_named_pipe_w = NULL;
369 int prog_named_read_pipe_fd = -1, prog_named_write_pipe_fd = -1;
370 char *loc_fname = NULL, *tmp_pid = NULL;
371 unsigned char cmd;
372 int len, ret;
373 int exit_status = OPAL_SUCCESS;
374 int s, max_wait_time = 20;
375 ssize_t tmp_size = 0;
376 int value;
377
378
379 opal_asprintf(&tmp_pid, "%d", pid);
380
381
382 opal_asprintf(&prog_named_pipe_w, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_R, tmp_pid);
383 opal_asprintf(&prog_named_pipe_r, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_W, tmp_pid);
384
385
386
387
388 if( 0 != (ret = kill(pid, opal_cr_entry_point_signal) ) ) {
389 exit_status = ret;
390 goto cleanup;
391 }
392
393 opal_output_verbose(10, opal_checkpoint_globals.output,
394 "opal_checkpoint: Looking for Named Pipes (%s) (%s)\n",
395 prog_named_pipe_r, prog_named_pipe_w);
396
397 for( s = 0; s < max_wait_time; ++s) {
398
399
400
401 if( 0 > (ret = access(prog_named_pipe_r, F_OK) )) {
402
403 if( !opal_checkpoint_globals.quiet &&
404 s >= max_wait_time - 5 ) {
405 opal_output(0, "opal-checkpoint: File does not exist yet: <%s> rtn = %d (waited %d/%d sec)\n",
406 prog_named_pipe_r, ret, s, max_wait_time);
407 }
408 sleep(1);
409 continue;
410 }
411 else if( 0 > (ret = access(prog_named_pipe_w, F_OK) )) {
412
413 if( !opal_checkpoint_globals.quiet &&
414 s >= max_wait_time - 5 ) {
415 opal_output(0, "opal-checkpoint: File does not exist yet: <%s> rtn = %d (waited %d/%d sec)\n",
416 prog_named_pipe_w, ret, s, max_wait_time);
417 }
418 sleep(1);
419 continue;
420 }
421 else {
422 break;
423 }
424 }
425 if( s == max_wait_time ) {
426
427
428
429
430
431
432
433
434 opal_show_help("help-opal-checkpoint.txt", "pid_does_not_exist", true,
435 opal_checkpoint_globals.pid, prog_named_pipe_r, prog_named_pipe_w);
436
437 *cr_state = OPAL_CRS_ERROR;
438
439 exit_status = OPAL_ERROR;
440 goto cleanup;
441 }
442
443
444
445
446
447
448
449
450
451
452
453
454 prog_named_write_pipe_fd = open(prog_named_pipe_w, O_WRONLY);
455 if(prog_named_write_pipe_fd < 0) {
456 opal_output(opal_checkpoint_globals.output,
457 "opal_checkpoint: Error: Unable to open name pipe (%s). %d\n",
458 prog_named_pipe_w, prog_named_write_pipe_fd);
459 exit_status = OPAL_ERROR;
460 goto cleanup;
461 }
462
463 prog_named_read_pipe_fd = open(prog_named_pipe_r, O_RDWR);
464 if(prog_named_read_pipe_fd < 0) {
465 opal_output(opal_checkpoint_globals.output,
466 "opal_checkpoint: Error: Unable to open name pipe (%s). %d\n",
467 prog_named_pipe_r, prog_named_read_pipe_fd);
468 exit_status = OPAL_ERROR;
469 goto cleanup;
470 }
471
472
473
474
475 len = 0;
476 if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &len, sizeof(int))) ) {
477 opal_output(opal_checkpoint_globals.output,
478 "opal_checkpoint: Error: Unable to write handshake to named pipe (%s). %d\n",
479 prog_named_pipe_w, ret);
480 exit_status = OPAL_ERROR;
481 goto cleanup;
482 }
483
484 if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &value, sizeof(int))) ) {
485 opal_output(opal_checkpoint_globals.output,
486 "opal_checkpoint: Error: Unable to read length from named pipe (%s). %d\n",
487 prog_named_pipe_r, ret);
488 exit_status = OPAL_ERROR;
489 goto cleanup;
490 }
491
492
493 if( OPAL_CHECKPOINT_CMD_IN_PROGRESS == value ) {
494 opal_show_help("help-opal-checkpoint.txt",
495 "ckpt:in_progress",
496 true,
497 opal_checkpoint_globals.pid);
498 exit_status = OPAL_ERROR;
499 goto cleanup;
500 }
501 else if( OPAL_CHECKPOINT_CMD_NULL == value ) {
502 opal_show_help("help-opal-checkpoint.txt",
503 "ckpt:req_null",
504 true,
505 opal_checkpoint_globals.pid);
506 exit_status = OPAL_ERROR;
507 goto cleanup;
508 }
509 else if ( OPAL_CHECKPOINT_CMD_ERROR == value ) {
510 opal_show_help("help-opal-checkpoint.txt",
511 "ckpt:req_error",
512 true,
513 opal_checkpoint_globals.pid);
514 exit_status = OPAL_ERROR;
515 goto cleanup;
516 }
517
518
519
520
521
522 cmd = OPAL_CR_CHECKPOINT;
523
524 if( sizeof(cmd) != (ret = write(prog_named_write_pipe_fd, &cmd, sizeof(cmd))) ) {
525 opal_output(opal_checkpoint_globals.output,
526 "opal_checkpoint: Error: Unable to write CHECKPOINT Command to named pipe (%s). %d\n",
527 prog_named_pipe_w, ret);
528 exit_status = OPAL_ERROR;
529 goto cleanup;
530 }
531
532
533 if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &pid, sizeof(int))) ) {
534 opal_output(opal_checkpoint_globals.output,
535 "opal_checkpoint: Error: Unable to write pid (%d) to named pipe (%s). %d\n",
536 pid, prog_named_pipe_w, ret);
537 exit_status = OPAL_ERROR;
538 goto cleanup;
539 }
540
541 if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &term, sizeof(int))) ) {
542 opal_output(opal_checkpoint_globals.output,
543 "opal_checkpoint: Error: Unable to write term (%d) to named pipe (%s), %d\n",
544 term, prog_named_pipe_w, ret);
545 exit_status = OPAL_ERROR;
546 goto cleanup;
547 }
548
549
550 len = strlen(opal_checkpoint_globals.snapshot_name) + 1;
551 if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &len, sizeof(int))) ) {
552 opal_output(opal_checkpoint_globals.output,
553 "opal_checkpoint: Error: Unable to write snapshot name len (%d) to named pipe (%s). %d\n",
554 len, prog_named_pipe_w, ret);
555 exit_status = OPAL_ERROR;
556 goto cleanup;
557 }
558
559 tmp_size = sizeof(char) * len;
560 if( tmp_size != (ret = write(prog_named_write_pipe_fd, (opal_checkpoint_globals.snapshot_name), (sizeof(char) * len))) ) {
561 opal_output(opal_checkpoint_globals.output,
562 "opal_checkpoint: Error: Unable to write snapshot name (%s) to named pipe (%s). %d\n",
563 opal_checkpoint_globals.snapshot_name, prog_named_pipe_w, ret);
564 exit_status = OPAL_ERROR;
565 goto cleanup;
566 }
567
568
569 len = strlen(opal_checkpoint_globals.snapshot_loc) + 1;
570 if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &len, sizeof(int))) ) {
571 opal_output(opal_checkpoint_globals.output,
572 "opal_checkpoint: Error: Unable to write snapshot location len (%d) to named pipe (%s). %d\n",
573 len, prog_named_pipe_w, ret);
574 exit_status = OPAL_ERROR;
575 goto cleanup;
576 }
577
578 tmp_size = sizeof(char) * len;
579 if( tmp_size != (ret = write(prog_named_write_pipe_fd, (opal_checkpoint_globals.snapshot_loc), (sizeof(char) * len))) ) {
580 opal_output(opal_checkpoint_globals.output,
581 "opal_checkpoint: Error: Unable to write snapshot location (%s) to named pipe (%s). %d\n",
582 opal_checkpoint_globals.snapshot_loc, prog_named_pipe_w, ret);
583 exit_status = OPAL_ERROR;
584 goto cleanup;
585 }
586
587
588
589
590
591 if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &len, sizeof(int))) ) {
592 opal_output(opal_checkpoint_globals.output,
593 "opal_checkpoint: Error: Unable to read length from named pipe (%s). %d\n",
594 prog_named_pipe_r, ret);
595 exit_status = OPAL_ERROR;
596 goto cleanup;
597 }
598
599 if(len > 0) {
600 loc_fname = (char *) malloc(sizeof(char) * len);
601 if( (ssize_t)(sizeof(char) * len) != (ret = read(prog_named_read_pipe_fd, loc_fname, (sizeof(char) * len))) ) {
602 opal_output(opal_checkpoint_globals.output,
603 "opal_checkpoint: Error: Unable to read filename from named pipe (%s). %d\n",
604 prog_named_pipe_w, ret);
605 exit_status = OPAL_ERROR;
606 goto cleanup;
607 }
608 }
609
610 *fname = strdup(loc_fname);
611 if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &cr_state, sizeof(int))) ) {
612 opal_output(opal_checkpoint_globals.output,
613 "opal_checkpoint: Error: Unable to read state from named pipe (%s). %d\n",
614 prog_named_pipe_r, ret);
615 exit_status = OPAL_ERROR;
616 goto cleanup;
617 }
618
619 cleanup:
620
621
622
623 close(prog_named_write_pipe_fd);
624 close(prog_named_read_pipe_fd);
625
626 if( NULL != tmp_pid)
627 free(tmp_pid);
628 if( NULL != prog_named_pipe_r)
629 free(prog_named_pipe_r);
630 if( NULL != prog_named_pipe_w)
631 free(prog_named_pipe_w);
632
633 return exit_status;
634 }