This source file includes the following definitions.
- main
- initialize
- finalize
- parse_args
- check_file
- post_env_vars
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35 #include "opal_config.h"
36
37 #include <stdio.h>
38 #include <errno.h>
39 #ifdef HAVE_UNISTD_H
40 #include <unistd.h>
41 #endif
42 #include <stdlib.h>
43 #ifdef HAVE_SYS_STAT_H
44 #include <sys/stat.h>
45 #endif
46 #ifdef HAVE_FCNTL_H
47 #include <fcntl.h>
48 #endif
49 #ifdef HAVE_SYS_TYPES_H
50 #include <sys/types.h>
51 #endif
52 #ifdef HAVE_SYS_WAIT_H
53 #include <sys/wait.h>
54 #endif
55 #include <string.h>
56
57 #include "opal/constants.h"
58
59 #include "opal/util/cmd_line.h"
60 #include "opal/util/argv.h"
61 #include "opal/util/show_help.h"
62 #include "opal/util/output.h"
63 #include "opal/util/opal_environ.h"
64 #include "opal/util/error.h"
65 #include "opal/util/basename.h"
66 #include "opal/util/printf.h"
67 #include "opal/mca/base/base.h"
68
69 #include "opal/runtime/opal.h"
70 #include "opal/runtime/opal_cr.h"
71
72 #include "opal/mca/crs/crs.h"
73 #include "opal/mca/crs/base/base.h"
74
75 #include "opal/mca/compress/compress.h"
76 #include "opal/mca/compress/base/base.h"
77
78
79
80
81 static int initialize(int argc, char *argv[]);
82 static int finalize(void);
83 static int parse_args(int argc, char *argv[]);
84 static int check_file(void);
85 static int post_env_vars(int prev_pid, opal_crs_base_snapshot_t *snapshot);
86
87
88
89
90 static char *expected_crs_comp = NULL;
91
/*
 * Options and state shared by every routine of the tool.  The boolean and
 * string members are the destinations of the command line options declared
 * in cmd_line_opts.
 */
typedef struct {
    bool  help;                      /* -h / --help */
    bool  verbose;                   /* -v / --verbose */
    char *snapshot_ref;              /* -r / --reference */
    char *snapshot_loc;              /* -l / --location */
    char *snapshot_metadata;         /* -m / --metadata */
    char *snapshot_cache;            /* -c / --cache */
    char *snapshot_compress;         /* -d / --decompress */
    char *snapshot_compress_postfix; /* -p / --decompress_postfix */
    int   output;                    /* opal_output stream id used for messages */
} opal_restart_globals_t;
103
104 opal_restart_globals_t opal_restart_globals;
105
106 opal_cmd_line_init_t cmd_line_opts[] = {
107 { NULL,
108 'h', NULL, "help",
109 0,
110 &opal_restart_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
111 "This help message" },
112
113 { NULL,
114 'v', NULL, "verbose",
115 0,
116 &opal_restart_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
117 "Be Verbose" },
118
119 { NULL,
120 'l', NULL, "location",
121 1,
122 &opal_restart_globals.snapshot_loc, OPAL_CMD_LINE_TYPE_STRING,
123 "Full path to the location of the local snapshot."},
124
125 { NULL,
126 'm', NULL, "metadata",
127 1,
128 &opal_restart_globals.snapshot_metadata, OPAL_CMD_LINE_TYPE_STRING,
129 "Relative path (with respect to --location) to the metadata file."},
130
131 { NULL,
132 'r', NULL, "reference",
133 1,
134 &opal_restart_globals.snapshot_ref, OPAL_CMD_LINE_TYPE_STRING,
135 "Local snapshot reference."},
136
137 { NULL,
138 'c', NULL, "cache",
139 1,
140 &opal_restart_globals.snapshot_cache, OPAL_CMD_LINE_TYPE_STRING,
141 "Possible local cache of the snapshot reference."},
142
143 { NULL,
144 'd', NULL, "decompress",
145 1,
146 &opal_restart_globals.snapshot_compress, OPAL_CMD_LINE_TYPE_STRING,
147 "Decompression component to use."},
148
149 { NULL,
150 'p', NULL, "decompress_postfix",
151 1,
152 &opal_restart_globals.snapshot_compress_postfix, OPAL_CMD_LINE_TYPE_STRING,
153 "Decompression component postfix."},
154
155
156 { NULL,
157 '\0', NULL, NULL,
158 0,
159 NULL, OPAL_CMD_LINE_TYPE_NULL,
160 NULL }
161 };
162
163 int
164 main(int argc, char *argv[])
165 {
166 int ret, exit_status = OPAL_SUCCESS;
167 int child_pid;
168 int prev_pid = 0;
169 int idx;
170 opal_crs_base_snapshot_t *snapshot = NULL;
171 char * tmp_env_var = NULL;
172 bool select = false;
173
174
175
176
177 if (OPAL_SUCCESS != (ret = initialize(argc, argv))) {
178 exit_status = ret;
179 goto cleanup;
180 }
181
182
183
184
185 if( OPAL_SUCCESS != (ret = check_file() )) {
186 opal_show_help("help-opal-restart.txt", "invalid_filename", true,
187 opal_restart_globals.snapshot_ref);
188 exit_status = ret;
189 goto cleanup;
190 }
191
192
193 idx = mca_base_var_find(NULL, "crs", "base", "do_not_select");
194
195 if (0 > idx) {
196 opal_output(opal_restart_globals.output,
197 "MCA variable opal_crs_base_do_not_select not found\n");
198 exit_status = OPAL_ERROR;
199 goto cleanup;
200 }
201
202 ret = mca_base_var_set_value(idx, &select, 0, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
203 if (OPAL_SUCCESS != ret) {
204 exit_status = ret;
205 goto cleanup;
206 }
207
208
209
210
211 if(NULL == expected_crs_comp) {
212 char * full_metadata_path = NULL;
213 FILE * metadata = NULL;
214
215 opal_asprintf(&full_metadata_path, "%s/%s/%s",
216 opal_restart_globals.snapshot_loc,
217 opal_restart_globals.snapshot_ref,
218 opal_restart_globals.snapshot_metadata);
219 if( NULL == (metadata = fopen(full_metadata_path, "r")) ) {
220 opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
221 opal_restart_globals.snapshot_metadata,
222 full_metadata_path);
223 exit_status = OPAL_ERROR;
224 goto cleanup;
225 }
226 if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(metadata,
227 &expected_crs_comp,
228 &prev_pid)) ) {
229 opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
230 opal_restart_globals.snapshot_metadata,
231 full_metadata_path);
232 exit_status = ret;
233 goto cleanup;
234 }
235
236 free(full_metadata_path);
237 full_metadata_path = NULL;
238
239 fclose(metadata);
240 metadata = NULL;
241 }
242
243 opal_output_verbose(10, opal_restart_globals.output,
244 "Restart Expects checkpointer: (%s)",
245 expected_crs_comp);
246
247 (void) mca_base_var_env_name("crs", &tmp_env_var);
248 opal_setenv(tmp_env_var,
249 expected_crs_comp,
250 true, &environ);
251 free(tmp_env_var);
252 tmp_env_var = NULL;
253
254
255
256
257
258
259 if( OPAL_SUCCESS != (ret = opal_crs_base_open(MCA_BASE_OPEN_DEFAULT)) ) {
260 opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
261 "crs", ret);
262 exit_status = ret;
263 goto cleanup;
264 }
265
266 if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) {
267 opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
268 expected_crs_comp, ret);
269 exit_status = ret;
270 goto cleanup;
271 }
272
273
274
275
276 if(NULL == expected_crs_comp ||
277 0 != strncmp(expected_crs_comp,
278 opal_crs_base_selected_component.base_version.mca_component_name,
279 strlen(expected_crs_comp)) ) {
280 opal_show_help("help-opal-restart.txt", "comp_select_mismatch",
281 true,
282 expected_crs_comp,
283 opal_crs_base_selected_component.base_version.mca_component_name,
284 ret);
285 exit_status = ret;
286 goto cleanup;
287 }
288
289
290
291
292 opal_output_verbose(10, opal_restart_globals.output,
293 "Restarting from file (%s)\n",
294 opal_restart_globals.snapshot_ref);
295
296 snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
297 snapshot->cold_start = true;
298 opal_asprintf(&(snapshot->snapshot_directory), "%s/%s",
299 opal_restart_globals.snapshot_loc,
300 opal_restart_globals.snapshot_ref);
301 opal_asprintf(&(snapshot->metadata_filename), "%s/%s",
302 snapshot->snapshot_directory,
303 opal_restart_globals.snapshot_metadata);
304
305
306
307
308
309
310
311 if(OPAL_SUCCESS != (ret = post_env_vars(prev_pid, snapshot) ) ) {
312 exit_status = ret;
313 goto cleanup;
314 }
315
316
317
318
319 ret = opal_crs.crs_restart(snapshot,
320 false,
321 &child_pid);
322
323 if (OPAL_SUCCESS != ret) {
324 opal_show_help("help-opal-restart.txt", "restart_cmd_failure", true,
325 opal_restart_globals.snapshot_ref,
326 ret,
327 opal_crs_base_selected_component.base_version.mca_component_name);
328 exit_status = ret;
329 goto cleanup;
330 }
331
332
333
334
335
336 cleanup:
337 if (OPAL_SUCCESS != (ret = finalize())) {
338 return ret;
339 }
340
341 if(NULL != snapshot )
342 OBJ_DESTRUCT(snapshot);
343
344 return exit_status;
345 }
346
347 static int initialize(int argc, char *argv[])
348 {
349 int ret, exit_status = OPAL_SUCCESS;
350 char * tmp_env_var = NULL;
351
352
353
354
355
356
357 if( OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
358 return ret;
359 }
360
361
362
363
364 if (OPAL_SUCCESS != (ret = parse_args(argc, argv))) {
365 exit_status = ret;
366 goto cleanup;
367 }
368
369
370
371
372 if( opal_restart_globals.verbose ) {
373 opal_restart_globals.output = opal_output_open(NULL);
374 opal_output_set_verbosity(opal_restart_globals.output, 10);
375 } else {
376 opal_restart_globals.output = 0;
377 }
378
379
380
381
382
383 (void) mca_base_var_env_name("crs_base_do_not_select", &tmp_env_var);
384 opal_setenv(tmp_env_var,
385 "1",
386 true, &environ);
387 free(tmp_env_var);
388 tmp_env_var = NULL;
389
390
391
392
393 if( NULL != opal_restart_globals.snapshot_compress ) {
394 (void) mca_base_var_env_name("compress", &tmp_env_var);
395 opal_setenv(tmp_env_var,
396 opal_restart_globals.snapshot_compress,
397 true, &environ);
398 free(tmp_env_var);
399 tmp_env_var = NULL;
400 }
401
402
403
404
405 if (OPAL_SUCCESS != (ret = opal_init(&argc, &argv))) {
406 exit_status = ret;
407 goto cleanup;
408 }
409
410
411
412
413 if( NULL != opal_restart_globals.snapshot_compress ) {
414 char * zip_dir = NULL;
415 char * tmp_str = NULL;
416
417
418
419
420
421 (void) mca_base_var_env_name("compress", &tmp_env_var);
422 opal_unsetenv(tmp_env_var, &environ);
423 free(tmp_env_var);
424 tmp_env_var = NULL;
425
426 opal_asprintf(&zip_dir, "%s/%s%s",
427 opal_restart_globals.snapshot_loc,
428 opal_restart_globals.snapshot_ref,
429 opal_restart_globals.snapshot_compress_postfix);
430
431 if (0 > (ret = access(zip_dir, F_OK)) ) {
432 opal_output(opal_restart_globals.output,
433 "Error: Unable to access the file [%s]!",
434 zip_dir);
435 exit_status = OPAL_ERROR;
436 goto cleanup;
437 }
438
439 opal_output_verbose(10, opal_restart_globals.output,
440 "Decompressing (%s)",
441 zip_dir);
442
443 opal_compress.decompress(zip_dir, &tmp_str);
444
445 if( NULL != zip_dir ) {
446 free(zip_dir);
447 zip_dir = NULL;
448 }
449 if( NULL != tmp_str ) {
450 free(tmp_str);
451 tmp_str = NULL;
452 }
453 }
454
455
456
457
458 if( NULL != opal_restart_globals.snapshot_cache ) {
459 if(0 == (ret = access(opal_restart_globals.snapshot_cache, F_OK)) ) {
460 opal_output_verbose(10, opal_restart_globals.output,
461 "Using the cached snapshot (%s) instead of (%s)",
462 opal_restart_globals.snapshot_cache,
463 opal_restart_globals.snapshot_loc);
464 if( NULL != opal_restart_globals.snapshot_loc ) {
465 free(opal_restart_globals.snapshot_loc);
466 opal_restart_globals.snapshot_loc = NULL;
467 }
468 opal_restart_globals.snapshot_loc = opal_dirname(opal_restart_globals.snapshot_cache);
469 } else {
470 opal_show_help("help-opal-restart.txt", "cache_not_avail", true,
471 opal_restart_globals.snapshot_cache,
472 opal_restart_globals.snapshot_loc);
473 }
474 }
475
476
477
478
479 opal_cr_is_tool = true;
480
481 cleanup:
482 return exit_status;
483 }
484
485 static int finalize(void)
486 {
487 #if 0
488 int ret;
489
490
491
492
493
494
495
496
497
498 if (OPAL_SUCCESS != (ret = opal_finalize())) {
499 return ret;
500 }
501 #endif
502
503 return OPAL_SUCCESS;
504 }
505
506 static int parse_args(int argc, char *argv[])
507 {
508 int i, ret, len;
509 opal_cmd_line_t cmd_line;
510 char **app_env = NULL, **global_env = NULL;
511
512 opal_restart_globals.help = false;
513 opal_restart_globals.verbose = false;
514 opal_restart_globals.snapshot_ref = NULL;
515 opal_restart_globals.snapshot_loc = NULL;
516 opal_restart_globals.snapshot_metadata = NULL;
517 opal_restart_globals.snapshot_cache = NULL;
518 opal_restart_globals.snapshot_compress = NULL;
519 opal_restart_globals.snapshot_compress_postfix = NULL;
520 opal_restart_globals.output = 0;
521
522
523 opal_cmd_line_create(&cmd_line, cmd_line_opts);
524
525 mca_base_open();
526 mca_base_cmd_line_setup(&cmd_line);
527 ret = opal_cmd_line_parse(&cmd_line, false, false, argc, argv);
528 if (OPAL_SUCCESS != ret) {
529 if (OPAL_ERR_SILENT != ret) {
530 fprintf(stderr, "%s: command line error (%s)\n", argv[0],
531 opal_strerror(ret));
532 }
533 return 1;
534 }
535 if (opal_restart_globals.help ) {
536 char *str, *args = NULL;
537 args = opal_cmd_line_get_usage_msg(&cmd_line);
538 str = opal_show_help_string("help-opal-restart.txt", "usage", true,
539 args);
540 if (NULL != str) {
541 printf("%s", str);
542 free(str);
543 }
544 free(args);
545
546 exit(0);
547 }
548
549
550
551
552 mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
553
554 len = opal_argv_count(app_env);
555 for(i = 0; i < len; ++i) {
556 putenv(app_env[i]);
557 }
558
559 len = opal_argv_count(global_env);
560 for(i = 0; i < len; ++i) {
561 putenv(global_env[i]);
562 }
563
564
565
566
567
568 opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
569
570 if ( NULL == opal_restart_globals.snapshot_ref ||
571 0 >= strlen(opal_restart_globals.snapshot_ref) ) {
572 opal_show_help("help-opal-restart.txt", "invalid_filename", true,
573 "<none provided>");
574 return OPAL_ERROR;
575 }
576
577
578
579
580
581 if(argc > 0) {
582 opal_restart_globals.snapshot_ref = strdup(opal_argv_join(argv, ' '));
583 }
584
585 return OPAL_SUCCESS;
586 }
587
588 static int check_file(void)
589 {
590 int exit_status = OPAL_SUCCESS;
591 int ret;
592 char * path_to_check = NULL;
593
594 if(NULL == opal_restart_globals.snapshot_ref) {
595 opal_output(opal_restart_globals.output,
596 "Error: No filename provided!");
597 exit_status = OPAL_ERROR;
598 goto cleanup;
599 }
600
601
602
603
604 opal_asprintf(&path_to_check, "%s/%s",
605 opal_restart_globals.snapshot_loc,
606 opal_restart_globals.snapshot_ref);
607
608 opal_output_verbose(10, opal_restart_globals.output,
609 "Checking for the existence of (%s)",
610 path_to_check);
611
612 if (0 > (ret = access(path_to_check, F_OK)) ) {
613 exit_status = OPAL_ERROR;
614 goto cleanup;
615 }
616
617 cleanup:
618 if( NULL != path_to_check) {
619 free(path_to_check);
620 path_to_check = NULL;
621 }
622
623 return exit_status;
624 }
625
626 static int post_env_vars(int prev_pid, opal_crs_base_snapshot_t *snapshot)
627 {
628 int ret, exit_status = OPAL_SUCCESS;
629 char *command = NULL;
630 char *proc_file = NULL;
631 char **loc_touch = NULL;
632 char **loc_mkdir = NULL;
633 int argc, i;
634
635 if( 0 > prev_pid ) {
636 opal_output(opal_restart_globals.output,
637 "Invalid PID (%d)\n",
638 prev_pid);
639 exit_status = OPAL_ERROR;
640 goto cleanup;
641 }
642
643
644
645
646
647 opal_asprintf(&proc_file, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid);
648 opal_asprintf(&command, "env | grep OMPI_ > %s", proc_file);
649
650 opal_output_verbose(5, opal_restart_globals.output,
651 "post_env_vars: Execute: <%s>", command);
652
653 ret = system(command);
654 if( 0 > ret) {
655 exit_status = ret;
656 goto cleanup;
657 }
658
659
660
661
662 if( NULL == (snapshot->metadata = fopen(snapshot->metadata_filename, "r")) ) {
663 opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
664 opal_restart_globals.snapshot_metadata,
665 snapshot->metadata_filename);
666 exit_status = OPAL_ERROR;
667 goto cleanup;
668 }
669 opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_MKDIR, &loc_mkdir);
670 argc = opal_argv_count(loc_mkdir);
671 for( i = 0; i < argc; ++i ) {
672 if( NULL != command ) {
673 free(command);
674 command = NULL;
675 }
676 opal_asprintf(&command, "mkdir -p %s", loc_mkdir[i]);
677
678 opal_output_verbose(5, opal_restart_globals.output,
679 "post_env_vars: Execute: <%s>", command);
680
681 ret = system(command);
682 if( 0 > ret) {
683 exit_status = ret;
684 goto cleanup;
685 }
686 }
687 if( 0 < argc ) {
688 system("sync ; sync");
689 }
690
691
692
693
694 opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_TOUCH, &loc_touch);
695 argc = opal_argv_count(loc_touch);
696 for( i = 0; i < argc; ++i ) {
697 if( NULL != command ) {
698 free(command);
699 command = NULL;
700 }
701 opal_asprintf(&command, "touch %s", loc_touch[i]);
702
703 opal_output_verbose(5, opal_restart_globals.output,
704 "post_env_vars: Execute: <%s>", command);
705
706 ret = system(command);
707 if( 0 > ret) {
708 exit_status = ret;
709 goto cleanup;
710 }
711 }
712 if( 0 < argc ) {
713 system("sync ; sync");
714 }
715
716 cleanup:
717 if( NULL != command) {
718 free(command);
719 command = NULL;
720 }
721 if( NULL != proc_file) {
722 free(proc_file);
723 proc_file = NULL;
724 }
725 if( NULL != loc_mkdir ) {
726 opal_argv_free(loc_mkdir);
727 loc_mkdir = NULL;
728 }
729 if( NULL != loc_touch ) {
730 opal_argv_free(loc_touch);
731 loc_touch = NULL;
732 }
733
734 if( NULL != snapshot->metadata ) {
735 fclose(snapshot->metadata);
736 snapshot->metadata = NULL;
737 }
738
739 return exit_status;
740 }