This source file includes the following definitions.
- opal_crs_self_construct
- opal_crs_self_destruct
- opal_crs_self_component_query
- opal_crs_self_extract_callbacks
- opal_crs_self_module_init
- opal_crs_self_module_finalize
- opal_crs_self_checkpoint
- opal_crs_self_restart
- opal_crs_self_disable_checkpoint
- opal_crs_self_enable_checkpoint
- opal_crs_self_prelaunch
- opal_crs_self_reg_thread
- crs_self_find_function
- opal_crs_self_restart_cmd
- self_cold_start
- self_update_snapshot_metadata
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 #include "opal_config.h"
23
24 #include <sys/types.h>
25 #ifdef HAVE_UNISTD_H
26 #include <unistd.h>
27 #endif
28 #include <string.h>
29 #ifdef HAVE_DLFCN_H
30 #include <dlfcn.h>
31 #endif
32
33 #include "opal/util/opal_environ.h"
34 #include "opal/util/output.h"
35 #include "opal/util/show_help.h"
36 #include "opal/util/argv.h"
37 #include "opal/util/opal_environ.h"
38 #include "opal/util/printf.h"
39
40 #include "opal/constants.h"
41 #include "opal/mca/base/mca_base_var.h"
42
43 #include "opal/mca/crs/crs.h"
44 #include "opal/mca/crs/base/base.h"
45 #include "opal/runtime/opal_cr.h"
46
47 #include "crs_self.h"
48
49
50
51
52 static opal_crs_base_module_t loc_module = {
53
54 opal_crs_self_module_init,
55
56 opal_crs_self_module_finalize,
57
58
59 opal_crs_self_checkpoint,
60
61
62 opal_crs_self_restart,
63
64
65 opal_crs_self_disable_checkpoint,
66
67 opal_crs_self_enable_checkpoint,
68
69
70 opal_crs_self_prelaunch,
71
72
73 opal_crs_self_reg_thread
74 };
75
76
77
78
79 OBJ_CLASS_DECLARATION(opal_crs_self_snapshot_t);
80
81 struct opal_crs_self_snapshot_t {
82
83 opal_crs_base_snapshot_t super;
84
85 char * cmd_line;
86 };
87 typedef struct opal_crs_self_snapshot_t opal_crs_self_snapshot_t;
88
89 static void opal_crs_self_construct(opal_crs_self_snapshot_t *obj);
90 static void opal_crs_self_destruct( opal_crs_self_snapshot_t *obj);
91
92 OBJ_CLASS_INSTANCE(opal_crs_self_snapshot_t,
93 opal_crs_base_snapshot_t,
94 opal_crs_self_construct,
95 opal_crs_self_destruct);
96
97
98 typedef void (*opal_crs_self_dlsym_dummy_fn_t)(void);
99
100
101
102
103 static int crs_self_find_function(char *prefix, char *suffix,
104 opal_crs_self_dlsym_dummy_fn_t *fn_ptr);
105
106 static int self_update_snapshot_metadata(opal_crs_self_snapshot_t *snapshot);
107
108 static int opal_crs_self_restart_cmd(opal_crs_self_snapshot_t *snapshot, char **cmd);
109 static int self_cold_start(opal_crs_self_snapshot_t *snapshot);
110
111 void opal_crs_self_construct(opal_crs_self_snapshot_t *snapshot)
112 {
113 snapshot->cmd_line = NULL;
114 }
115
116 void opal_crs_self_destruct( opal_crs_self_snapshot_t *snapshot)
117 {
118 if(NULL != snapshot->cmd_line)
119 free(snapshot->cmd_line);
120 }
121
122 static int opal_crs_self_extract_callbacks(void);
123
124
125
126
127 int opal_crs_self_component_query(mca_base_module_t **module, int *priority)
128 {
129 int ret;
130
131 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
132 "crs:self: component_query()");
133
134
135
136
137
138
139
140
141
142 if( opal_cr_is_tool ) {
143 *priority = 0;
144 *module = (mca_base_module_t *)&loc_module;
145 return OPAL_SUCCESS;
146 }
147
148
149
150
151 ret = opal_crs_self_extract_callbacks();
152
153 if( OPAL_SUCCESS != ret ||
154 !mca_crs_self_component.can_checkpoint ) {
155 *priority = -1;
156 *module = NULL;
157 return OPAL_ERROR;
158 }
159 else {
160 *priority = mca_crs_self_component.super.priority;
161 *module = (mca_base_module_t *)&loc_module;
162 return OPAL_SUCCESS;
163 }
164 }
165
166 static int opal_crs_self_extract_callbacks(void)
167 {
168 opal_crs_self_dlsym_dummy_fn_t loc_fn;
169
170
171
172
173 crs_self_find_function(mca_crs_self_component.prefix,
174 SUFFIX_CHECKPOINT,
175 &loc_fn);
176 mca_crs_self_component.ucb_checkpoint_fn = (opal_crs_self_checkpoint_callback_fn_t)loc_fn;
177
178 crs_self_find_function(mca_crs_self_component.prefix,
179 SUFFIX_CONTINUE,
180 &loc_fn);
181 mca_crs_self_component.ucb_continue_fn = (opal_crs_self_continue_callback_fn_t)loc_fn;
182
183 crs_self_find_function(mca_crs_self_component.prefix,
184 SUFFIX_RESTART,
185 &loc_fn);
186 mca_crs_self_component.ucb_restart_fn = (opal_crs_self_restart_callback_fn_t)loc_fn;
187
188
189
190
191 mca_crs_self_component.can_checkpoint = true;
192
193 if(NULL == mca_crs_self_component.ucb_checkpoint_fn) {
194 mca_crs_self_component.can_checkpoint = false;
195 }
196 if(NULL == mca_crs_self_component.ucb_continue_fn) {
197 }
198 if(NULL == mca_crs_self_component.ucb_restart_fn) {
199 }
200
201 return OPAL_SUCCESS;
202 }
203
204 int opal_crs_self_module_init(void)
205 {
206 bool callback_matched = true;
207
208 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
209 "crs:self: module_init()");
210
211 if( opal_cr_is_tool ) {
212 return OPAL_SUCCESS;
213 }
214
215
216
217
218 if(NULL == mca_crs_self_component.ucb_checkpoint_fn) {
219 callback_matched = false;
220 mca_crs_self_component.can_checkpoint = false;
221 }
222 if(NULL == mca_crs_self_component.ucb_continue_fn) {
223 callback_matched = false;
224 }
225 if(NULL == mca_crs_self_component.ucb_restart_fn) {
226 callback_matched = false;
227 }
228 if( !callback_matched ) {
229 if( 1 <= mca_crs_self_component.super.verbose ) {
230 opal_show_help("help-opal-crs-self.txt", "self:no_callback", false,
231 "checkpoint", mca_crs_self_component.prefix, SUFFIX_CHECKPOINT,
232 "continue ", mca_crs_self_component.prefix, SUFFIX_CONTINUE,
233 "restart ", mca_crs_self_component.prefix, SUFFIX_RESTART,
234 PREFIX_DEFAULT);
235 }
236 }
237
238
239
240
241 if(mca_crs_self_component.do_restart) {
242 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
243 "crs:self: module_init: Call their restart function");
244 if( NULL != mca_crs_self_component.ucb_restart_fn)
245 mca_crs_self_component.ucb_restart_fn();
246 }
247
248 return OPAL_SUCCESS;
249 }
250
251 int opal_crs_self_module_finalize(void)
252 {
253 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
254 "crs:self: module_finalize()");
255
256 return OPAL_SUCCESS;
257 }
258
259
260 int opal_crs_self_checkpoint(pid_t pid,
261 opal_crs_base_snapshot_t *base_snapshot,
262 opal_crs_base_ckpt_options_t *options,
263 opal_crs_state_type_t *state)
264 {
265 opal_crs_self_snapshot_t *snapshot = OBJ_NEW(opal_crs_self_snapshot_t);
266 int ret, exit_status = OPAL_SUCCESS;
267 char * restart_cmd = NULL;
268
269
270
271
272 if( opal_cr_is_tool ) {
273 return OPAL_ERR_NOT_SUPPORTED;
274 }
275
276 if( options->stop ) {
277 opal_output(0,
278 "crs:self: checkpoint(): Error: SIGSTOP Not currently supported!");
279 }
280
281
282
283
284 snapshot->super = *base_snapshot;
285 #if 0
286 snapshot->super.snapshot_directory = strdup(base_snapshot->snapshot_directory);
287 snapshot->super.metadata_filename = strdup(base_snapshot->metadata_filename);
288 #endif
289
290 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
291 "crs:self: checkpoint(%d, ---)", pid);
292
293 if(!mca_crs_self_component.can_checkpoint) {
294 opal_show_help("help-opal-crs-self.txt", "self:ckpt_disabled", false);
295 exit_status = OPAL_ERROR;
296 goto cleanup;
297 }
298
299
300
301
302 snapshot->super.component_name = strdup(mca_crs_self_component.super.base_version.mca_component_name);
303 if( NULL == snapshot->super.metadata ) {
304 if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
305 opal_output(mca_crs_self_component.super.output_handle,
306 "crs:self: checkpoint(): Error: Unable to open the file (%s)",
307 snapshot->super.metadata_filename);
308 exit_status = OPAL_ERROR;
309 goto cleanup;
310 }
311 }
312 fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name);
313
314
315
316
317 if(NULL != mca_crs_self_component.ucb_checkpoint_fn) {
318 mca_crs_self_component.ucb_checkpoint_fn(&restart_cmd);
319 }
320
321
322
323
324 if( NULL == restart_cmd) {
325 *state = OPAL_CRS_ERROR;
326 opal_show_help("help-opal-crs-self.txt", "self:no-restart-cmd",
327 true);
328 exit_status = OPAL_ERROR;
329 goto cleanup;
330 }
331 else {
332 snapshot->cmd_line = strdup(restart_cmd);
333
334 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
335 "crs:self: checkpoint: Restart Command (%s)", snapshot->cmd_line);
336 }
337
338
339
340
341
342 if( OPAL_SUCCESS != (ret = self_update_snapshot_metadata(snapshot)) ) {
343 *state = OPAL_CRS_ERROR;
344 opal_output(mca_crs_self_component.super.output_handle,
345 "crs:self: checkpoint(): Error: Unable to update metadata for snapshot (%s).",
346 snapshot->super.metadata_filename);
347 exit_status = ret;
348 goto cleanup;
349 }
350
351
352 *state = OPAL_CRS_CONTINUE;
353
354
355
356
357 if(NULL != mca_crs_self_component.ucb_continue_fn) {
358 mca_crs_self_component.ucb_continue_fn();
359 }
360
361 base_snapshot = &(snapshot->super);
362
363 cleanup:
364 if( NULL != restart_cmd) {
365 free(restart_cmd);
366 restart_cmd = NULL;
367 }
368
369 return exit_status;
370 }
371
372
373
374
375
376 int opal_crs_self_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
377 {
378 opal_crs_self_snapshot_t *snapshot = OBJ_NEW(opal_crs_self_snapshot_t);
379 char **cr_argv = NULL;
380 char * cr_cmd = NULL;
381 int ret;
382 int exit_status = OPAL_SUCCESS;
383 int status;
384
385 snapshot->super = *base_snapshot;
386
387 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
388 "crs:self: restart(%d)", spawn_child);
389
390
391
392
393 if(snapshot->super.cold_start) {
394 if( OPAL_SUCCESS != (ret = self_cold_start(snapshot)) ){
395 exit_status = ret;
396 opal_output(mca_crs_self_component.super.output_handle,
397 "crs:blcr: blcr_restart: Unable to reconstruct the snapshot.");
398 goto cleanup;
399 }
400 }
401
402
403
404
405
406
407
408
409 if ( OPAL_SUCCESS != (ret = opal_crs_self_restart_cmd(snapshot, &cr_cmd)) ) {
410 exit_status = ret;
411 goto cleanup;
412 }
413 if ( NULL == (cr_argv = opal_argv_split(cr_cmd, ' ')) ) {
414 exit_status = OPAL_ERROR;
415 goto cleanup;
416 }
417
418
419 if (!spawn_child) {
420 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
421 "crs:self: self_restart: SELF: exec :(%s, %s):",
422 strdup(cr_argv[0]),
423 opal_argv_join(cr_argv, ' '));
424
425 status = execvp(strdup(cr_argv[0]), cr_argv);
426
427 if(status < 0) {
428 opal_output(mca_crs_self_component.super.output_handle,
429 "crs:self: self_restart: SELF: Child failed to execute :(%d):", status);
430 }
431 opal_output(mca_crs_self_component.super.output_handle,
432 "crs:self: self_restart: SELF: execvp returned %d", status);
433 exit_status = status;
434 goto cleanup;
435 }
436 else {
437 *child_pid = fork();
438 if( *child_pid == 0) {
439
440 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
441 "crs:self: self_restart: CHILD: exec :(%s, %s):",
442 strdup(cr_argv[0]),
443 opal_argv_join(cr_argv, ' '));
444
445 status = execvp(strdup(cr_argv[0]), cr_argv);
446
447 if(status < 0) {
448 opal_output(mca_crs_self_component.super.output_handle,
449 "crs:self: self_restart: CHILD: Child failed to execute :(%d):", status);
450 }
451 opal_output(mca_crs_self_component.super.output_handle,
452 "crs:self: self_restart: CHILD: execvp returned %d", status);
453 exit_status = status;
454 goto cleanup;
455 }
456 else if(*child_pid > 0) {
457
458 ;
459 }
460 else {
461 opal_output(mca_crs_self_component.super.output_handle,
462 "crs:self: self_restart: CHILD: fork failed :(%d):", *child_pid);
463 }
464 }
465
466 cleanup:
467 if( NULL != cr_cmd)
468 free(cr_cmd);
469 if( NULL != cr_argv)
470 opal_argv_free(cr_argv);
471
472 return exit_status;
473 }
474
475 int opal_crs_self_disable_checkpoint(void)
476 {
477
478
479
480 if( opal_cr_is_tool ) {
481 return OPAL_ERR_NOT_SUPPORTED;
482 }
483
484 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
485 "crs:self: disable_checkpoint()");
486
487 mca_crs_self_component.can_checkpoint = false;
488
489 return OPAL_SUCCESS;
490 }
491
492 int opal_crs_self_enable_checkpoint(void)
493 {
494
495
496
497 if( opal_cr_is_tool ) {
498 return OPAL_ERR_NOT_SUPPORTED;
499 }
500
501 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
502 "crs:self: enable_checkpoint()");
503
504 mca_crs_self_component.can_checkpoint = true;
505
506 return OPAL_SUCCESS;
507 }
508
509 int opal_crs_self_prelaunch(int32_t rank,
510 char *base_snapshot_dir,
511 char **app,
512 char **cwd,
513 char ***argv,
514 char ***env)
515 {
516 char * tmp_env_var = NULL;
517
518
519
520
521 if( opal_cr_is_tool ) {
522 return OPAL_ERR_NOT_SUPPORTED;
523 }
524
525 (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
526 opal_setenv(tmp_env_var,
527 "0", true, env);
528 free(tmp_env_var);
529 tmp_env_var = NULL;
530
531 return OPAL_SUCCESS;
532 }
533
534 int opal_crs_self_reg_thread(void)
535 {
536
537
538
539 if( opal_cr_is_tool ) {
540 return OPAL_ERR_NOT_SUPPORTED;
541 }
542
543 return OPAL_SUCCESS;
544 }
545
546
547
548
549 static int crs_self_find_function(char *prefix, char *suffix,
550 opal_crs_self_dlsym_dummy_fn_t *fn_ptr) {
551 char *func_to_find = NULL;
552
553 if( NULL == prefix || 0 >= strlen(prefix) ) {
554 opal_output(mca_crs_self_component.super.output_handle,
555 "crs:self: crs_self_find_function: Error: prefix is NULL or empty string!");
556 *fn_ptr = NULL;
557 return OPAL_ERROR;
558 }
559 if( NULL == suffix || 0 >= strlen(suffix) ) {
560 opal_output(mca_crs_self_component.super.output_handle,
561 "crs:self: crs_self_find_function: Error: suffix is NULL or empty string!");
562 *fn_ptr = NULL;
563 return OPAL_ERROR;
564 }
565
566 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
567 "crs:self: crs_self_find_function(--, %s, %s)",
568 prefix, suffix);
569
570 opal_asprintf(&func_to_find, "%s_%s", prefix, suffix);
571
572
573
574
575
576
577 *((void**) fn_ptr) = dlsym(RTLD_DEFAULT, func_to_find);
578 if( NULL == fn_ptr) {
579 opal_output_verbose(12, mca_crs_self_component.super.output_handle,
580 "crs:self: crs_self_find_function: WARNING: Function \"%s\" not found",
581 func_to_find);
582 }
583 else {
584 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
585 "crs:self: crs_self_find_function: Found function \"%s\"",
586 func_to_find);
587 }
588
589 if( NULL == func_to_find) {
590 free(func_to_find);
591 }
592
593 return OPAL_SUCCESS;
594 }
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625 static int opal_crs_self_restart_cmd(opal_crs_self_snapshot_t *snapshot, char **cmd)
626 {
627 char * tmp_env_var = NULL;
628
629 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
630 "crs:self: restart_cmd(%s, ---)", snapshot->cmd_line);
631
632 (void) mca_base_var_env_name("crs", &tmp_env_var);
633 opal_setenv(tmp_env_var,
634 "self",
635 true, &environ);
636 free(tmp_env_var);
637 tmp_env_var = NULL;
638
639 (void) mca_base_var_env_name("crs_self_do_restart", &tmp_env_var);
640 opal_setenv(tmp_env_var,
641 "1",
642 true, &environ);
643 free(tmp_env_var);
644 tmp_env_var = NULL;
645
646 (void) mca_base_var_env_name("crs_self_prefix", &tmp_env_var);
647 opal_setenv(tmp_env_var,
648 mca_crs_self_component.prefix,
649 true, &environ);
650 free(tmp_env_var);
651 tmp_env_var = NULL;
652
653
654
655
656
657
658
659 opal_asprintf(cmd, "%s", snapshot->cmd_line);
660
661 return OPAL_SUCCESS;
662 }
663
664 static int self_cold_start(opal_crs_self_snapshot_t *snapshot) {
665 int ret, exit_status = OPAL_SUCCESS;
666 char **tmp_argv = NULL;
667 char * component_name = NULL;
668 int prev_pid;
669
670 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
671 "crs:self: cold_start()");
672
673
674
675
676 if( NULL == snapshot->super.metadata ) {
677 if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
678 opal_output(mca_crs_self_component.super.output_handle,
679 "crs:self: checkpoint(): Error: Unable to open the file (%s)",
680 snapshot->super.metadata_filename);
681 exit_status = OPAL_ERROR;
682 goto cleanup;
683 }
684 }
685 if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.metadata,
686 &component_name, &prev_pid) ) ) {
687 opal_output(mca_crs_self_component.super.output_handle,
688 "crs:self: self_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
689 snapshot->super.metadata_filename, ret);
690 exit_status = ret;
691 goto cleanup;
692 }
693
694 snapshot->super.component_name = strdup(component_name);
695
696
697 if ( 0 != strncmp(mca_crs_self_component.super.base_version.mca_component_name,
698 component_name, strlen(component_name)) ) {
699 exit_status = OPAL_ERROR;
700 opal_output(mca_crs_self_component.super.output_handle,
701 "crs:self: self_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
702 component_name, mca_crs_self_component.super.base_version.mca_component_name);
703 goto cleanup;
704 }
705
706
707
708
709
710 opal_crs_base_metadata_read_token(snapshot->super.metadata, CRS_METADATA_CONTEXT, &tmp_argv);
711 if( NULL == tmp_argv ) {
712 opal_output(mca_crs_self_component.super.output_handle,
713 "crs:self: self_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
714 CRS_METADATA_CONTEXT, snapshot->super.snapshot_directory);
715 exit_status = OPAL_ERROR;
716 goto cleanup;
717 }
718 opal_asprintf(&snapshot->cmd_line, "%s", tmp_argv[0]);
719
720
721
722
723 snapshot->super.cold_start = false;
724
725 cleanup:
726 if(NULL != tmp_argv) {
727 opal_argv_free(tmp_argv);
728 tmp_argv = NULL;
729 }
730
731 return exit_status;
732
733 }
734
735 static int self_update_snapshot_metadata(opal_crs_self_snapshot_t *snapshot) {
736 int exit_status = OPAL_SUCCESS;
737
738 if(NULL == snapshot->cmd_line) {
739 opal_show_help("help-opal-crs-self.txt", "self:no-restart-cmd",
740 true);
741 exit_status = OPAL_ERROR;
742 goto cleanup;
743 }
744
745 opal_output_verbose(10, mca_crs_self_component.super.output_handle,
746 "crs:self: update_snapshot_metadata(%s)",
747 snapshot->super.metadata_filename);
748
749
750
751
752
753 fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_CONTEXT, snapshot->cmd_line);
754
755 cleanup:
756 return exit_status;
757 }