This source file includes the following definitions.
- opal_cr_set_enabled
- opal_cr_register
- opal_cr_init
- opal_cr_finalize
- opal_cr_test_if_checkpoint_ready
- opal_cr_inc_core_prep
- opal_cr_inc_core_ckpt
- opal_cr_inc_core_recover
- opal_cr_inc_core
- opal_cr_coord
- opal_cr_reg_notify_callback
- opal_cr_user_inc_register_callback
- ompi_trigger_user_inc_callback
- opal_cr_reg_coord_callback
- opal_cr_refresh_environ
- extract_env_vars
- opal_cr_sigpipe_debug_signal_handler
- opal_cr_thread_fn
- opal_cr_thread_init_library
- opal_cr_thread_finalize_library
- opal_cr_thread_abort_library
- opal_cr_thread_enter_library
- opal_cr_thread_exit_library
- opal_cr_thread_noop_progress
- opal_cr_get_time
- opal_cr_set_time
- opal_cr_clear_timers
- display_indv_timer_core
- opal_cr_display_all_timers
- opal_cr_debug_set_current_ckpt_thread_self
- opal_cr_debug_clear_current_ckpt_thread
- MPIR_checkpoint_debugger_detach
- MPIR_checkpoint_debugger_signal_handler
- MPIR_checkpoint_debugger_waitpoint
- MPIR_checkpoint_debugger_breakpoint
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33 #include "opal_config.h"
34
35 #include <string.h>
36 #include <errno.h>
37 #ifdef HAVE_UNISTD_H
38 #include <unistd.h>
39 #endif
40 #ifdef HAVE_FCNTL_H
41 #include <fcntl.h>
42 #endif
43 #ifdef HAVE_SYS_TYPES_H
44 #include <sys/types.h>
45 #endif
46 #ifdef HAVE_SYS_STAT_H
47 #include <sys/stat.h>
48 #endif
49 #include <signal.h>
50
51 #include "opal/class/opal_object.h"
52 #include "opal/util/opal_environ.h"
53 #include "opal/util/show_help.h"
54 #include "opal/util/output.h"
55 #include "opal/util/malloc.h"
56 #include "opal/util/keyval_parse.h"
57 #include "opal/util/opal_environ.h"
58 #include "opal/util/argv.h"
59 #include "opal/util/printf.h"
60 #include "opal/memoryhooks/memory.h"
61
62 #include "opal/mca/base/base.h"
63 #include "opal/runtime/opal_cr.h"
64 #include "opal/runtime/opal.h"
65 #include "opal/constants.h"
66
67 #include "opal/mca/if/base/base.h"
68 #include "opal/mca/memcpy/base/base.h"
69 #include "opal/mca/memory/base/base.h"
70 #include "opal/mca/timer/base/base.h"
71
72 #include "opal/threads/mutex.h"
73 #include "opal/threads/threads.h"
74 #include "opal/mca/crs/base/base.h"
75
76
77
78
/* State used only when checkpoint/restart debugger support is compiled in. */
79 #if OPAL_ENABLE_CRDEBUG == 1
/* Heap-allocated table of thread handles; sized by opal_cr_debug_num_free_threads
 * (set to 3 in opal_cr_register) and released in opal_cr_finalize. */
80 static opal_thread_t **opal_cr_debug_free_threads = NULL;
81 static int opal_cr_debug_num_free_threads = 0;
82 static int opal_cr_debug_threads_already_waiting = false;
83
/* Backing storage for the "enable_crdebug" MCA variable. */
84 int MPIR_debug_with_checkpoint = 0;
85 static volatile int MPIR_checkpoint_debug_gate = 0;
86
/* Backing storage for the "crdebug_signal" MCA variable (defaults to SIGTSTP
 * in opal_cr_register); opal_cr_init installs a handler for it. */
87 int opal_cr_debug_signal = 0;
88 #endif
89
/* Stall tracking: opal_cr_test_if_checkpoint_ready jumps straight to the
 * "start checkpoint" notification when opal_cr_currently_stalled is set. */
90 bool opal_cr_stall_check = false;
91 bool opal_cr_currently_stalled = false;
/* Output stream id (opened in opal_cr_init when verbosity is non-zero) and
 * the "verbose" MCA variable backing it. */
92 int opal_cr_output = -1;
93 int opal_cr_verbose = 0;
/* Init/finalize nesting counter: incremented by opal_cr_init and decremented
 * by opal_cr_finalize; real work happens only on the 0<->1 transitions. */
94 int opal_cr_initalized = 0;
95
/* Timing support: timer_start[] holds one timestamp (seconds) per timer index,
 * filled by opal_cr_set_time and reported by opal_cr_display_all_timers. */
96 static double opal_cr_get_time(void);
97 static void display_indv_timer_core(double diff, char *str);
98 static double timer_start[OPAL_CR_TIMER_MAX];
99 bool opal_cr_timing_barrier_enabled = false;
100 bool opal_cr_timing_enabled = false;
101 int opal_cr_timing_my_rank = 0;
102 int opal_cr_timing_target_rank = 0;
103
104
105
106
107 static int extract_env_vars(int prev_pid, char * file_name);
108
109 static void opal_cr_sigpipe_debug_signal_handler (int signo);
110
/* Currently registered callbacks: one user INC callback per event slot, plus
 * the coordination and notification callbacks. */
111 static opal_cr_user_inc_callback_fn_t cur_user_coord_callback[OPAL_CR_INC_MAX] = {NULL};
112 static opal_cr_coord_callback_fn_t cur_coord_callback = NULL;
113 static opal_cr_notify_callback_fn_t cur_notify_callback = NULL;
114
/* PID captured by opal_cr_inc_core_prep (getpid()) and later handed to
 * opal_cr_refresh_environ. */
115 static int core_prev_pid = 0;
116
117
118
119
/* Backing storage for the "tmp_dir", "signal", "enabled" and "is_tool"
 * MCA variables registered in opal_cr_register. */
120 char * opal_cr_pipe_dir = NULL;
121 int opal_cr_entry_point_signal = 0;
122 bool opal_cr_is_enabled = true;
123 bool opal_cr_is_tool = false;
124
125
/* Status of the checkpoint currently in flight (OPAL_CR_STATUS_*). */
126 int opal_cr_checkpointing_state = OPAL_CR_STATUS_NONE;
127
128
/* Pending checkpoint request (OPAL_CR_STATUS_*), consumed by
 * opal_cr_test_if_checkpoint_ready. */
129 int opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE;
130
/* Backing storage for the "debug_sigpipe" MCA variable. */
131 static bool opal_cr_debug_sigpipe = false;
132
133 bool opal_cr_continue_like_restart = false;
134
/* Asynchronous checkpoint thread support; compiled only when
 * OPAL_ENABLE_FT_THREAD is 1. */
135 #if OPAL_ENABLE_FT_THREAD == 1
136
137
138
139 static void* opal_cr_thread_fn(opal_object_t *obj);
/* Flags shared between the application thread(s) and the C/R thread.
 * NOTE(review): these are plain (non-atomic, non-volatile) variables that are
 * polled from another thread -- confirm this is acceptable on all targets. */
140 bool opal_cr_thread_is_done = false;
141 bool opal_cr_thread_is_active = false;
142 bool opal_cr_thread_in_library = false;
143 bool opal_cr_thread_use_if_avail = true;
/* Combined counter/flag word: bit 0 (ThreadFlag) is set by the C/R thread,
 * and application threads add/subtract ProcInc on library entry/exit. */
144 int32_t opal_cr_thread_num_in_library = 0;
/* usleep() arguments; backing storage for the "thread_sleep_check" and
 * "thread_sleep_wait" MCA variables. */
145 int opal_cr_thread_sleep_check = 0;
146 int opal_cr_thread_sleep_wait = 0;
147 opal_thread_t opal_cr_thread;
148 opal_mutex_t opal_cr_thread_lock;
/* Disabled mutex-based variant of the locking macros (dead code). */
149 #if 0
150 #define OPAL_CR_LOCK() opal_cr_thread_in_library = true; opal_mutex_lock(&opal_cr_thread_lock);
151 #define OPAL_CR_UNLOCK() opal_cr_thread_in_library = false; opal_mutex_unlock(&opal_cr_thread_lock);
152 #define OPAL_CR_THREAD_LOCK() opal_mutex_lock(&opal_cr_thread_lock);
153 #define OPAL_CR_THREAD_UNLOCK() opal_mutex_unlock(&opal_cr_thread_lock);
154 #else
155
156
157
158 static const uint32_t ThreadFlag = 0x1;
159 static const uint32_t ProcInc = 0x2;
160
/* OPAL_CR_LOCK: application thread enters the library. Marks in_library,
 * atomically adds ProcInc to the counter, then yields until the C/R thread's
 * ThreadFlag bit is clear. */
161 #define OPAL_CR_LOCK() \
162 { \
163 opal_cr_thread_in_library = true; \
164 OPAL_THREAD_ADD_FETCH32(&opal_cr_thread_num_in_library, ProcInc); \
165 while( (opal_cr_thread_num_in_library & ThreadFlag ) != 0 ) { \
166 sched_yield(); \
167 } \
168 }
/* OPAL_CR_UNLOCK: application thread leaves the library. Atomically subtracts
 * ProcInc and clears in_library once the counter is <= 0. */
169 #define OPAL_CR_UNLOCK() \
170 { \
171 OPAL_THREAD_ADD_FETCH32(&opal_cr_thread_num_in_library, -ProcInc); \
172 if( opal_cr_thread_num_in_library <= 0 ) { \
173 opal_cr_thread_in_library = false; \
174 } \
175 }
/* OPAL_CR_THREAD_LOCK: C/R thread tries to swap the counter from the value in
 * _tmp_value (initially 0) to ThreadFlag, yielding and sleeping between
 * attempts; gives up once the thread is inactive and marked done.
 * NOTE(review): if the compare-exchange macro stores the observed value into
 * _tmp_value on failure (as C11 compare-exchange does), later iterations no
 * longer compare against 0 -- confirm against the macro's definition. */
176 #define OPAL_CR_THREAD_LOCK() \
177 { \
178 int32_t _tmp_value = 0; \
179 while(!OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_32 (&opal_cr_thread_num_in_library, &_tmp_value, ThreadFlag)) { \
180 if( !opal_cr_thread_is_active && opal_cr_thread_is_done) { \
181 break; \
182 } \
183 sched_yield(); \
184 usleep(opal_cr_thread_sleep_check); \
185 } \
186 }
/* OPAL_CR_THREAD_UNLOCK: C/R thread releases its flag by atomically
 * subtracting ThreadFlag from the counter. */
187 #define OPAL_CR_THREAD_UNLOCK() \
188 { \
189 OPAL_THREAD_ADD_FETCH32(&opal_cr_thread_num_in_library, -ThreadFlag); \
190 }
191 #endif
192
193 #endif
194
195 int opal_cr_set_enabled(bool en)
196 {
197 opal_cr_is_enabled = en;
198 return OPAL_SUCCESS;
199 }
200
201 static int opal_cr_register (void)
202 {
203 int ret;
204 #if OPAL_ENABLE_CRDEBUG == 1
205 int t;
206 #endif
207
208
209
210
211 ret = mca_base_var_register ("opal", "opal", "cr", "verbose",
212 "Verbose output level for the runtime OPAL Checkpoint/Restart functionality",
213 MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
214 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL,
215 &opal_cr_verbose);
216 if (0 > ret) {
217 return ret;
218 }
219
220 opal_cr_is_enabled = false;
221 (void) mca_base_var_register("opal", "ft", "cr", "enabled",
222 "Enable fault tolerance for this program",
223 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
224 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
225 &opal_cr_is_enabled);
226
227 opal_cr_timing_enabled = false;
228 (void) mca_base_var_register ("opal", "opal", "cr", "enable_timer",
229 "Enable Checkpoint timer (Default: Disabled)",
230 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
231 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
232 &opal_cr_timing_enabled);
233
234 opal_cr_timing_barrier_enabled = false;
235 (void) mca_base_var_register ("opal", "opal", "cr", "enable_timer_barrier",
236 "Enable Checkpoint timer Barrier. Must have opal_cr_enable_timer set. (Default: Disabled)",
237 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, opal_cr_timing_enabled ? MCA_BASE_VAR_FLAG_SETTABLE : 0,
238 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
239 &opal_cr_timing_barrier_enabled);
240 opal_cr_timing_barrier_enabled = opal_cr_timing_barrier_enabled && opal_cr_timing_enabled;
241
242 (void) mca_base_var_register ("opal", "opal", "cr", "timer_target_rank",
243 "Target Rank for the timer (Default: 0)",
244 MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
245 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
246 &opal_cr_timing_target_rank);
247
248 #if OPAL_ENABLE_FT_THREAD == 1
249 opal_cr_thread_use_if_avail = false;
250 (void) mca_base_var_register ("opal", "opal", "cr", "use_thread",
251 "Use an async thread to checkpoint this program (Default: Disabled)",
252 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
253 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
254 &opal_cr_thread_use_if_avail);
255
256 opal_cr_thread_sleep_check = 0;
257 (void) mca_base_var_register ("opal", "opal", "cr", "thread_sleep_check",
258 "Time to sleep between checking for a checkpoint (Default: 0)",
259 MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
260 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
261 &opal_cr_thread_sleep_check);
262
263 opal_cr_thread_sleep_wait = 100;
264 (void) mca_base_var_register ("opal", "opal", "cr", "thread_sleep_wait",
265 "Time to sleep waiting for process to exit MPI library (Default: 1000)",
266 MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
267 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
268 &opal_cr_thread_sleep_wait);
269 #endif
270
271 opal_cr_is_tool = false;
272 (void) mca_base_var_register ("opal", "opal", "cr", "is_tool",
273 "Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.",
274 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
275 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
276 &opal_cr_is_tool);
277
278 #ifndef __WINDOWS__
279 opal_cr_entry_point_signal = SIGUSR1;
280 (void) mca_base_var_register ("opal", "opal", "cr", "signal",
281 "Checkpoint/Restart signal used to initialize an OPAL Only checkpoint of a program",
282 MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
283 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
284 &opal_cr_entry_point_signal);
285
286 opal_cr_debug_sigpipe = false;
287 (void) mca_base_var_register ("opal", "opal", "cr", "debug_sigpipe",
288 "Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)",
289 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
290 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
291 &opal_cr_debug_sigpipe);
292 #else
293 opal_cr_is_tool = true;
294 #endif
295
296 #if OPAL_ENABLE_CRDEBUG == 1
297 MPIR_debug_with_checkpoint = 0;
298 (void) mca_base_var_register ("opal", "opal", "cr", "enable_crdebug",
299 "Enable checkpoint/restart debugging",
300 MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
301 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
302 &MPIR_debug_with_checkpoint);
303
304 opal_cr_debug_num_free_threads = 3;
305 opal_cr_debug_free_threads = (opal_thread_t **)malloc(sizeof(opal_thread_t *) * opal_cr_debug_num_free_threads );
306 for(t = 0; t < opal_cr_debug_num_free_threads; ++t ) {
307 opal_cr_debug_free_threads[t] = NULL;
308 }
309
310 opal_cr_debug_signal = SIGTSTP;
311 (void) mca_base_var_register ("opal", "opal", "cr", "crdebug_signal",
312 "Checkpoint/Restart signal used to hold threads when debugging",
313 MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
314 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
315 &opal_cr_debug_signal);
316 #endif
317
318 opal_cr_pipe_dir = (char *) opal_tmp_directory();
319 (void) mca_base_var_register ("opal", "opal", "cr", "tmp_dir",
320 "Temporary directory to place rendezvous files for a checkpoint",
321 MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
322 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
323 &opal_cr_pipe_dir);
324
325 return OPAL_SUCCESS;
326 }
327
328
329 int opal_cr_init(void )
330 {
331 int ret, exit_status = OPAL_SUCCESS;
332 opal_cr_coord_callback_fn_t prev_coord_func;
333
334 if( ++opal_cr_initalized != 1 ) {
335 if( opal_cr_initalized < 1 ) {
336 exit_status = OPAL_ERROR;
337 goto cleanup;
338 }
339 exit_status = OPAL_SUCCESS;
340 goto cleanup;
341 }
342
343 ret = opal_cr_register ();
344 if (OPAL_SUCCESS != ret) {
345 return ret;
346 }
347
348 if(0 != opal_cr_verbose) {
349 opal_cr_output = opal_output_open(NULL);
350 opal_output_set_verbosity(opal_cr_output, opal_cr_verbose);
351 }
352
353 opal_output_verbose(10, opal_cr_output,
354 "opal_cr: init: Verbose Level: %d",
355 opal_cr_verbose);
356
357
358 opal_output_verbose(10, opal_cr_output,
359 "opal_cr: init: FT Enabled: %s",
360 opal_cr_is_enabled ? "true" : "false");
361
362
363 opal_output_verbose(10, opal_cr_output,
364 "opal_cr: init: Is a tool program: %s",
365 opal_cr_is_tool ? "true" : "false");
366
367 opal_output_verbose(10, opal_cr_output,
368 "opal_cr: init: Debug SIGPIPE: %d (%s)",
369 opal_cr_verbose, (opal_cr_debug_sigpipe ? "True" : "False"));
370
371 opal_output_verbose(10, opal_cr_output,
372 "opal_cr: init: Checkpoint Signal: %d",
373 opal_cr_entry_point_signal);
374
375 #if OPAL_ENABLE_FT_THREAD == 1
376 opal_output_verbose(10, opal_cr_output,
377 "opal_cr: init: FT Use thread: %s",
378 opal_cr_thread_use_if_avail ? "true" : "false");
379
380 opal_output_verbose(10, opal_cr_output,
381 "opal_cr: init: FT thread sleep: check = %d, wait = %d",
382 opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait);
383
384
385
386
387 if( opal_cr_debug_sigpipe && !opal_cr_thread_use_if_avail ) {
388 if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
389 ;
390 }
391 }
392 #else
393 if( opal_cr_debug_sigpipe ) {
394 if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
395 ;
396 }
397 }
398 #endif
399
400 #if OPAL_ENABLE_CRDEBUG == 1
401 opal_output_verbose(10, opal_cr_output,
402 "opal_cr: init: C/R Debugging Enabled [%s]\n",
403 (MPIR_debug_with_checkpoint ? "True": "False"));
404
405 opal_output_verbose(10, opal_cr_output,
406 "opal_cr: init: Checkpoint Signal (Debug): %d",
407 opal_cr_debug_signal);
408
409 if( SIG_ERR == signal(opal_cr_debug_signal, MPIR_checkpoint_debugger_signal_handler) ) {
410 opal_output(opal_cr_output,
411 "opal_cr: init: Failed to register C/R debug signal (%d)",
412 opal_cr_debug_signal);
413 }
414 #endif
415
416 opal_output_verbose(10, opal_cr_output,
417 "opal_cr: init: Temp Directory: %s",
418 opal_cr_pipe_dir);
419
420 if( !opal_cr_is_tool ) {
421
422 opal_cr_reg_coord_callback(opal_cr_coord, &prev_coord_func);
423
424 opal_cr_stall_check = false;
425 opal_cr_currently_stalled = false;
426
427 }
428
429
430
431
432
433
434
435 #if OPAL_ENABLE_FT_CR == 1
436
437
438
439 if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_crs_base_framework, 0))) {
440 opal_show_help( "help-opal-runtime.txt",
441 "opal_cr_init:no-crs", true,
442 "opal_crs_base_open", ret );
443 exit_status = ret;
444 goto cleanup;
445 }
446
447 if (OPAL_SUCCESS != (ret = opal_crs_base_select())) {
448 opal_show_help( "help-opal-runtime.txt",
449 "opal_cr_init:no-crs", true,
450 "opal_crs_base_select", ret );
451 exit_status = ret;
452 goto cleanup;
453 }
454 #endif
455
456 #if OPAL_ENABLE_FT_THREAD == 1
457 if( !opal_cr_is_tool && opal_cr_thread_use_if_avail) {
458 opal_output_verbose(10, opal_cr_output,
459 "opal_cr: init: starting the thread\n");
460
461
462
463
464
465
466
467
468
469
470
471
472
473 OBJ_CONSTRUCT(&opal_cr_thread, opal_thread_t);
474 OBJ_CONSTRUCT(&opal_cr_thread_lock, opal_mutex_t);
475
476 opal_cr_thread_is_done = false;
477 opal_cr_thread_is_active = false;
478 opal_cr_thread_in_library = false;
479 opal_cr_thread_num_in_library = 0;
480
481 opal_cr_thread.t_run = opal_cr_thread_fn;
482 opal_cr_thread.t_arg = NULL;
483 opal_thread_start(&opal_cr_thread);
484
485 }
486 else {
487 opal_output_verbose(10, opal_cr_output,
488 "opal_cr: init: *Not* Using C/R thread\n");
489 }
490 #endif
491
492 cleanup:
493 return exit_status;
494 }
495
496 int opal_cr_finalize(void)
497 {
498 int exit_status = OPAL_SUCCESS;
499
500 if( --opal_cr_initalized != 0 ) {
501 if( opal_cr_initalized < 0 ) {
502 return OPAL_ERROR;
503 }
504 return OPAL_SUCCESS;
505 }
506
507 if( !opal_cr_is_tool ) {
508 #if OPAL_ENABLE_FT_THREAD == 1
509 if( opal_cr_thread_use_if_avail ) {
510 void *data;
511
512
513
514 opal_cr_thread_is_done = true;
515 opal_cr_thread_is_active = false;
516 opal_cr_thread_in_library = true;
517
518 opal_thread_join(&opal_cr_thread, &data);
519 OBJ_DESTRUCT(&opal_cr_thread);
520 OBJ_DESTRUCT(&opal_cr_thread_lock);
521 }
522 #endif
523
524
525 opal_cr_checkpointing_state = OPAL_CR_STATUS_TERM;
526 opal_cr_checkpoint_request = OPAL_CR_STATUS_TERM;
527 }
528
529 #if OPAL_ENABLE_CRDEBUG == 1
530 if( NULL != opal_cr_debug_free_threads ) {
531 free( opal_cr_debug_free_threads );
532 opal_cr_debug_free_threads = NULL;
533 }
534 opal_cr_debug_num_free_threads = 0;
535 #endif
536
537 if (NULL != opal_cr_pipe_dir) {
538 free(opal_cr_pipe_dir);
539 opal_cr_pipe_dir = NULL;
540 }
541
542 #if OPAL_ENABLE_FT_CR == 1
543
544
545
546 (void) mca_base_framework_close(&opal_crs_base_framework);
547 #endif
548
549 return exit_status;
550 }
551
552
553
554
555 void opal_cr_test_if_checkpoint_ready(void)
556 {
557 int ret;
558
559 if( opal_cr_currently_stalled) {
560 opal_output_verbose(20, opal_cr_output,
561 "opal_cr:opal_test_if_ready: JUMPING to Post Stall stage");
562 goto STAGE_1;
563 }
564
565
566
567
568
569 if(OPAL_CR_STATUS_REQUESTED != opal_cr_checkpoint_request ) {
570 return;
571 }
572
573
574
575
576
577
578 if(OPAL_CR_STATUS_RUNNING == opal_cr_checkpointing_state ) {
579 if( OPAL_SUCCESS != (ret = cur_notify_callback(OPAL_CHECKPOINT_CMD_IN_PROGRESS) ) ) {
580 opal_output(opal_cr_output,
581 "Error: opal_cr: test_if_checkpoint_ready: Respond [In Progress] Failed. (%d)",
582 ret);
583 }
584 opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE;
585 return;
586 }
587
588
589
590
591 if (NULL == opal_crs.crs_checkpoint ) {
592 if( OPAL_SUCCESS != (ret = cur_notify_callback(OPAL_CHECKPOINT_CMD_NULL) ) ) {
593 opal_output(opal_cr_output,
594 "Error: opal_cr: test_if_checkpoint_ready: Respond [Not Able/NULL] Failed. (%d)",
595 ret);
596 }
597 opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE;
598 return;
599 }
600
601
602
603
604 opal_cr_checkpointing_state = OPAL_CR_STATUS_RUNNING;
605 opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE;
606
607 STAGE_1:
608 if( OPAL_SUCCESS != (ret = cur_notify_callback(OPAL_CHECKPOINT_CMD_START) ) ) {
609 opal_output(opal_cr_output,
610 "Error: opal_cr: test_if_checkpoint_ready: Respond [Start Ckpt] Failed. (%d)",
611 ret);
612 }
613
614 return;
615 }
616
617
618
619
620 int opal_cr_inc_core_prep(void)
621 {
622 int ret;
623
624
625
626
627 if(OPAL_SUCCESS != (ret = ompi_trigger_user_inc_callback(OPAL_CR_INC_PRE_CRS_PRE_MPI,
628 OPAL_CR_INC_STATE_PREPARE)) ) {
629 return ret;
630 }
631
632
633
634
635 if(OPAL_SUCCESS != (ret = cur_coord_callback(OPAL_CRS_CHECKPOINT)) ) {
636 if ( OPAL_EXISTS != ret ) {
637 opal_output(opal_cr_output,
638 "opal_cr: inc_core: Error: cur_coord_callback(%d) failed! %d\n",
639 OPAL_CRS_CHECKPOINT, ret);
640 }
641 return ret;
642 }
643
644
645
646
647 if(OPAL_SUCCESS != (ret = ompi_trigger_user_inc_callback(OPAL_CR_INC_PRE_CRS_POST_MPI,
648 OPAL_CR_INC_STATE_PREPARE)) ) {
649 return ret;
650 }
651
652 core_prev_pid = getpid();
653
654 return OPAL_SUCCESS;
655 }
656
657 int opal_cr_inc_core_ckpt(pid_t pid,
658 opal_crs_base_snapshot_t *snapshot,
659 opal_crs_base_ckpt_options_t *options,
660 int *state)
661 {
662 int ret, exit_status = OPAL_SUCCESS;
663
664 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE0);
665 if(OPAL_SUCCESS != (ret = opal_crs.crs_checkpoint(pid,
666 snapshot,
667 options,
668 (opal_crs_state_type_t *)state))) {
669 opal_output(opal_cr_output,
670 "opal_cr: inc_core: Error: The checkpoint failed. %d\n", ret);
671 exit_status = ret;
672 }
673
674 if(*state == OPAL_CRS_CONTINUE) {
675 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE1);
676
677 if(options->term) {
678 *state = OPAL_CRS_TERM;
679 opal_cr_checkpointing_state = OPAL_CR_STATUS_TERM;
680 } else {
681 opal_cr_checkpointing_state = OPAL_CR_STATUS_CONTINUE;
682 }
683 }
684 else {
685 options->term = false;
686 }
687
688
689
690
691 if(*state == OPAL_CRS_RESTART) {
692 opal_cr_refresh_environ(core_prev_pid);
693 opal_cr_checkpointing_state = OPAL_CR_STATUS_RESTART_PRE;
694 }
695
696 return exit_status;
697 }
698
699 int opal_cr_inc_core_recover(int state)
700 {
701 int ret;
702 opal_cr_user_inc_callback_state_t cb_state;
703
704 if( opal_cr_checkpointing_state != OPAL_CR_STATUS_TERM &&
705 opal_cr_checkpointing_state != OPAL_CR_STATUS_CONTINUE &&
706 opal_cr_checkpointing_state != OPAL_CR_STATUS_RESTART_PRE &&
707 opal_cr_checkpointing_state != OPAL_CR_STATUS_RESTART_POST ) {
708
709 if(state == OPAL_CRS_CONTINUE) {
710 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE1);
711 opal_cr_checkpointing_state = OPAL_CR_STATUS_CONTINUE;
712 }
713
714
715
716 else if(state == OPAL_CRS_RESTART) {
717 opal_cr_refresh_environ(core_prev_pid);
718 opal_cr_checkpointing_state = OPAL_CR_STATUS_RESTART_PRE;
719 }
720 }
721
722
723
724
725 if( OPAL_CRS_CONTINUE == state ) {
726 cb_state = OPAL_CR_INC_STATE_CONTINUE;
727 }
728 else if( OPAL_CRS_RESTART == state ) {
729 cb_state = OPAL_CR_INC_STATE_RESTART;
730 }
731 else {
732 cb_state = OPAL_CR_INC_STATE_ERROR;
733 }
734
735 if(OPAL_SUCCESS != (ret = ompi_trigger_user_inc_callback(OPAL_CR_INC_POST_CRS_PRE_MPI,
736 cb_state)) ) {
737 return ret;
738 }
739
740
741
742
743 if(OPAL_SUCCESS != (ret = cur_coord_callback(state)) ) {
744 if ( OPAL_EXISTS != ret ) {
745 opal_output(opal_cr_output,
746 "opal_cr: inc_core: Error: cur_coord_callback(%d) failed! %d\n",
747 state, ret);
748 }
749 return ret;
750 }
751
752 if(OPAL_SUCCESS != (ret = ompi_trigger_user_inc_callback(OPAL_CR_INC_POST_CRS_POST_MPI,
753 cb_state)) ) {
754 return ret;
755 }
756
757 #if OPAL_ENABLE_CRDEBUG == 1
758 opal_cr_debug_clear_current_ckpt_thread();
759 #endif
760
761 return OPAL_SUCCESS;
762 }
763
764 int opal_cr_inc_core(pid_t pid,
765 opal_crs_base_snapshot_t *snapshot,
766 opal_crs_base_ckpt_options_t *options,
767 int *state)
768 {
769 int ret, exit_status = OPAL_SUCCESS;
770
771
772
773
774 if(OPAL_SUCCESS != (ret = opal_cr_inc_core_prep() ) ) {
775 return ret;
776 }
777
778
779
780
781 if(OPAL_SUCCESS != (ret = opal_cr_inc_core_ckpt(pid, snapshot, options, state) ) ) {
782 exit_status = ret;
783
784 }
785
786
787
788
789 if(OPAL_SUCCESS != (ret = opal_cr_inc_core_recover(*state) ) ) {
790 return ret;
791 }
792
793 return exit_status;
794 }
795
796
797
798
799
800
801
802 int opal_cr_coord(int state)
803 {
804 if(OPAL_CRS_CHECKPOINT == state) {
805
806 }
807 else if (OPAL_CRS_CONTINUE == state ) {
808
809 }
810 else if (OPAL_CRS_RESTART == state ) {
811
812
813
814
815
816
817
818 opal_event_reinit(opal_sync_event_base);
819
820
821
822
823 (void) mca_base_framework_close(&opal_if_base_framework);
824
825
826
827
828 opal_output_reopen_all();
829 }
830 else if (OPAL_CRS_TERM == state ) {
831
832 }
833 else {
834
835
836
837 }
838
839
840
841
842
843 opal_cr_checkpointing_state = OPAL_CR_STATUS_RESTART_POST;
844
845 return OPAL_SUCCESS;
846 }
847
848 int opal_cr_reg_notify_callback(opal_cr_notify_callback_fn_t new_func,
849 opal_cr_notify_callback_fn_t *prev_func)
850 {
851
852
853
854 if( NULL != cur_notify_callback) {
855 *prev_func = cur_notify_callback;
856 }
857 else {
858 *prev_func = NULL;
859 }
860
861
862
863
864 cur_notify_callback = new_func;
865
866 return OPAL_SUCCESS;
867 }
868
869 int opal_cr_user_inc_register_callback(opal_cr_user_inc_callback_event_t event,
870 opal_cr_user_inc_callback_fn_t function,
871 opal_cr_user_inc_callback_fn_t *prev_function)
872 {
873 if (event >= OPAL_CR_INC_MAX) {
874 return OPAL_ERROR;
875 }
876
877 if( NULL != cur_user_coord_callback[event] ) {
878 *prev_function = cur_user_coord_callback[event];
879 } else {
880 *prev_function = NULL;
881 }
882
883 cur_user_coord_callback[event] = function;
884
885 return OPAL_SUCCESS;
886 }
887
888 int ompi_trigger_user_inc_callback(opal_cr_user_inc_callback_event_t event,
889 opal_cr_user_inc_callback_state_t state)
890 {
891 if( NULL == cur_user_coord_callback[event] ) {
892 return OPAL_SUCCESS;
893 }
894
895 if (event >= OPAL_CR_INC_MAX) {
896 return OPAL_ERROR;
897 }
898
899 return ((cur_user_coord_callback[event])(event, state));
900 }
901
902 int opal_cr_reg_coord_callback(opal_cr_coord_callback_fn_t new_func,
903 opal_cr_coord_callback_fn_t *prev_func)
904 {
905
906
907
908 if( NULL != cur_coord_callback) {
909 *prev_func = cur_coord_callback;
910 }
911 else {
912 *prev_func = NULL;
913 }
914
915
916
917
918 cur_coord_callback = new_func;
919
920 return OPAL_SUCCESS;
921 }
922
923 int opal_cr_refresh_environ(int prev_pid) {
924 char *file_name;
925 #if OPAL_ENABLE_CRDEBUG == 1
926 char *tmp;
927 #endif
928 struct stat file_status;
929
930 if( 0 >= prev_pid ) {
931 prev_pid = getpid();
932 }
933
934
935
936
937
938
939 opal_asprintf(&file_name, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid);
940 if (NULL == file_name) {
941 return OPAL_ERR_OUT_OF_RESOURCE;
942 }
943 if(0 != stat(file_name, &file_status) ){
944 free(file_name);
945 return OPAL_SUCCESS;
946 }
947
948 #if OPAL_ENABLE_CRDEBUG == 1
949 mca_base_var_env_name ("opal_cr_enable_crdebug", &tmp);
950 opal_unsetenv(tmp, &environ);
951 free (tmp);
952 #endif
953
954 extract_env_vars(prev_pid, file_name);
955
956 #if OPAL_ENABLE_CRDEBUG == 1
957 MPIR_debug_with_checkpoint = 0;
958 (void) mca_base_var_register ("opal", "opal", "cr", "enable_crdebug",
959 "Enable checkpoint/restart debugging",
960 MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
961 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
962 &MPIR_debug_with_checkpoint);
963
964 opal_output_verbose(10, opal_cr_output,
965 "opal_cr: init: C/R Debugging Enabled [%s] (refresh)\n",
966 (MPIR_debug_with_checkpoint ? "True": "False"));
967 #endif
968
969 free(file_name);
970
971 return OPAL_SUCCESS;
972 }
973
974
975
976
977
978 static int extract_env_vars(int prev_pid, char * file_name)
979 {
980 int exit_status = OPAL_SUCCESS;
981 FILE *env_data = NULL;
982 int len = OPAL_PATH_MAX;
983 char * tmp_str = NULL;
984
985 if( 0 >= prev_pid ) {
986 opal_output(opal_cr_output,
987 "opal_cr: extract_env_vars: Invalid PID (%d)\n",
988 prev_pid);
989 exit_status = OPAL_ERROR;
990 goto cleanup;
991 }
992
993 if (NULL == (env_data = fopen(file_name, "r")) ) {
994 exit_status = OPAL_ERROR;
995 goto cleanup;
996 }
997
998 tmp_str = (char *) malloc(sizeof(char) * OPAL_PATH_MAX);
999 if( NULL == tmp_str) {
1000 exit_status = OPAL_ERR_OUT_OF_RESOURCE;
1001 goto cleanup;
1002 }
1003
1004 while(!feof(env_data) ) {
1005 char **t_set = NULL;
1006
1007 if( NULL == fgets(tmp_str, OPAL_PATH_MAX, env_data) ) {
1008 exit_status = OPAL_ERROR;
1009 goto cleanup;
1010 }
1011 len = strlen(tmp_str);
1012 if(tmp_str[len - 1] == '\n') {
1013 tmp_str[len - 1] = '\0';
1014 } else {
1015 opal_output(opal_cr_output,
1016 "opal_cr: extract_env_vars: Error: Parameter too long (%s)\n",
1017 tmp_str);
1018 continue;
1019 }
1020
1021 if( NULL == (t_set = opal_argv_split(tmp_str, '=')) ) {
1022 break;
1023 }
1024
1025 opal_setenv(t_set[0], t_set[1], true, &environ);
1026
1027 opal_argv_free(t_set);
1028 }
1029
1030 cleanup:
1031 if( NULL != env_data ) {
1032 fclose(env_data);
1033 }
1034 unlink(file_name);
1035
1036 if( NULL != tmp_str ){
1037 free(tmp_str);
1038 }
1039
1040 return exit_status;
1041 }
1042
1043
1044
1045
1046
1047
1048
1049 static void opal_cr_sigpipe_debug_signal_handler (int signo)
1050 {
1051 int sleeper = 1;
1052
1053 if( !opal_cr_debug_sigpipe ) {
1054 opal_output_verbose(10, opal_cr_output,
1055 "opal_cr: sigpipe_debug: Debug SIGPIPE Not enabled :(\n");
1056 return;
1057 }
1058
1059 opal_output(0,
1060 "opal_cr: sigpipe_debug: Debug SIGPIPE [%d]: PID (%d)\n",
1061 signo, getpid());
1062 while(sleeper == 1 ) {
1063 sleep(1);
1064 }
1065 }
1066
1067 #if OPAL_ENABLE_FT_THREAD == 1
1068 static void* opal_cr_thread_fn(opal_object_t *obj)
1069 {
1070
1071 if( !opal_cr_thread_use_if_avail ) {
1072 return NULL;
1073 }
1074
1075 if( opal_cr_debug_sigpipe ) {
1076 if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
1077 ;
1078 }
1079 }
1080
1081
1082
1083
1084 if( NULL != opal_crs.crs_reg_thread ) {
1085 if( OPAL_SUCCESS != opal_crs.crs_reg_thread() ) {
1086 opal_output(0, "Error: Thread registration failed\n");
1087 return NULL;
1088 }
1089 }
1090
1091 #if OPAL_ENABLE_CRDEBUG == 1
1092 opal_cr_debug_free_threads[1] = opal_thread_get_self();
1093 #endif
1094
1095
1096
1097
1098 while( !opal_cr_thread_is_active && !opal_cr_thread_is_done) {
1099 sched_yield();
1100 }
1101
1102 if( opal_cr_thread_is_done ) {
1103 return NULL;
1104 }
1105
1106
1107
1108
1109 while( opal_cr_thread_is_active && !opal_cr_thread_is_done) {
1110
1111
1112
1113
1114 OPAL_CR_THREAD_LOCK();
1115
1116 while ( !opal_cr_thread_in_library ) {
1117 sched_yield();
1118 usleep(opal_cr_thread_sleep_check);
1119
1120 OPAL_CR_TEST_CHECKPOINT_READY();
1121
1122 if( OPAL_UNLIKELY(opal_cr_currently_stalled) ) {
1123 OPAL_CR_TEST_CHECKPOINT_READY();
1124 }
1125 }
1126
1127
1128
1129
1130 OPAL_CR_THREAD_UNLOCK();
1131
1132 while ( opal_cr_thread_in_library && opal_cr_thread_is_active ) {
1133 usleep(opal_cr_thread_sleep_wait);
1134 }
1135 }
1136
1137 return NULL;
1138 }
1139
1140 void opal_cr_thread_init_library(void)
1141 {
1142 if( !opal_cr_thread_use_if_avail ) {
1143 OPAL_CR_TEST_CHECKPOINT_READY();
1144 } else {
1145
1146 opal_cr_thread_in_library = false;
1147 opal_cr_thread_is_done = false;
1148 opal_cr_thread_is_active = true;
1149 }
1150 }
1151
1152 void opal_cr_thread_finalize_library(void)
1153 {
1154 if( !opal_cr_thread_use_if_avail ) {
1155 OPAL_CR_TEST_CHECKPOINT_READY();
1156 } else {
1157
1158 opal_cr_thread_is_done = true;
1159 opal_cr_thread_is_active = false;
1160 OPAL_CR_LOCK();
1161 opal_cr_thread_in_library = true;
1162 }
1163 }
1164
1165 void opal_cr_thread_abort_library(void)
1166 {
1167 if( !opal_cr_thread_use_if_avail ) {
1168 OPAL_CR_TEST_CHECKPOINT_READY();
1169 } else {
1170
1171 opal_cr_thread_is_done = true;
1172 opal_cr_thread_is_active = false;
1173 OPAL_CR_LOCK();
1174 opal_cr_thread_in_library = true;
1175 }
1176 }
1177
1178 void opal_cr_thread_enter_library(void)
1179 {
1180 if( !opal_cr_thread_use_if_avail ) {
1181 OPAL_CR_TEST_CHECKPOINT_READY();
1182 } else {
1183
1184 OPAL_CR_LOCK();
1185 }
1186 }
1187
1188 void opal_cr_thread_exit_library(void)
1189 {
1190 if( !opal_cr_thread_use_if_avail ) {
1191 OPAL_CR_TEST_CHECKPOINT_READY();
1192 } else {
1193
1194 OPAL_CR_UNLOCK();
1195 }
1196 }
1197
1198 void opal_cr_thread_noop_progress(void)
1199 {
1200 if( !opal_cr_thread_use_if_avail ) {
1201 OPAL_CR_TEST_CHECKPOINT_READY();
1202 }
1203 }
1204
1205 #endif
1206
/*
 * Return the current time in seconds as a double.
 *
 * Uses the OPAL microsecond timer when OPAL_TIMER_USEC_NATIVE is set,
 * otherwise gettimeofday().
 */
static double opal_cr_get_time(void)
{
    const double usec_per_sec = 1000000.0;

#if OPAL_TIMER_USEC_NATIVE
    return (double)opal_timer_base_get_usec() / usec_per_sec;
#else
    struct timeval now;
    double seconds;

    gettimeofday(&now, NULL);
    seconds  = now.tv_sec;
    seconds += (double)now.tv_usec / usec_per_sec;

    return seconds;
#endif
}
1221
1222 void opal_cr_set_time(int idx)
1223 {
1224 if(idx < OPAL_CR_TIMER_MAX ) {
1225 if( timer_start[idx] <= 0.0 ) {
1226 timer_start[idx] = opal_cr_get_time();
1227 }
1228 }
1229 }
1230
1231 void opal_cr_clear_timers(void)
1232 {
1233 int i;
1234 for(i = 0; i < OPAL_CR_TIMER_MAX; ++i) {
1235 timer_start[i] = 0.0;
1236 }
1237 }
1238
1239 static void display_indv_timer_core(double diff, char *str) {
1240 double total = 0;
1241 double perc = 0;
1242
1243 total = timer_start[OPAL_CR_TIMER_MAX-1] - timer_start[OPAL_CR_TIMER_ENTRY0];
1244 perc = (diff/total) * 100;
1245
1246 opal_output(0,
1247 "opal_cr: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n",
1248 str,
1249 diff,
1250 total,
1251 perc);
1252 return;
1253 }
1254
1255 void opal_cr_display_all_timers(void)
1256 {
1257 double diff = 0.0;
1258 char * label = NULL;
1259
1260 if( opal_cr_timing_target_rank != opal_cr_timing_my_rank ) {
1261 return;
1262 }
1263
1264 opal_output(0, "OPAL CR Timing: ******************** Summary Begin\n");
1265
1266
1267 label = strdup("Start Entry Point");
1268 if( opal_cr_timing_barrier_enabled ) {
1269 diff = timer_start[OPAL_CR_TIMER_CRCPBR0] - timer_start[OPAL_CR_TIMER_ENTRY0];
1270 } else {
1271 diff = timer_start[OPAL_CR_TIMER_CRCP0] - timer_start[OPAL_CR_TIMER_ENTRY0];
1272 }
1273 display_indv_timer_core(diff, label);
1274 free(label);
1275
1276
1277 label = strdup("CRCP Protocol");
1278 if( opal_cr_timing_barrier_enabled ) {
1279 diff = timer_start[OPAL_CR_TIMER_CRCPBR1] - timer_start[OPAL_CR_TIMER_CRCP0];
1280 } else {
1281 diff = timer_start[OPAL_CR_TIMER_P2P0] - timer_start[OPAL_CR_TIMER_CRCP0];
1282 }
1283 display_indv_timer_core(diff, label);
1284 free(label);
1285
1286
1287 label = strdup("P2P Suspend");
1288 if( opal_cr_timing_barrier_enabled ) {
1289 diff = timer_start[OPAL_CR_TIMER_P2PBR0] - timer_start[OPAL_CR_TIMER_P2P0];
1290 } else {
1291 diff = timer_start[OPAL_CR_TIMER_CORE0] - timer_start[OPAL_CR_TIMER_P2P0];
1292 }
1293 display_indv_timer_core(diff, label);
1294 free(label);
1295
1296
1297 label = strdup("Checkpoint");
1298 diff = timer_start[OPAL_CR_TIMER_CORE1] - timer_start[OPAL_CR_TIMER_CORE0];
1299 display_indv_timer_core(diff, label);
1300 free(label);
1301
1302
1303 label = strdup("P2P Reactivation");
1304 if( opal_cr_timing_barrier_enabled ) {
1305 diff = timer_start[OPAL_CR_TIMER_P2PBR2] - timer_start[OPAL_CR_TIMER_CORE1];
1306 } else {
1307 diff = timer_start[OPAL_CR_TIMER_CRCP1] - timer_start[OPAL_CR_TIMER_CORE1];
1308 }
1309 display_indv_timer_core(diff, label);
1310 free(label);
1311
1312
1313 label = strdup("CRCP Cleanup");
1314 if( opal_cr_timing_barrier_enabled ) {
1315 diff = timer_start[OPAL_CR_TIMER_COREBR1] - timer_start[OPAL_CR_TIMER_CRCP1];
1316 } else {
1317 diff = timer_start[OPAL_CR_TIMER_CORE2] - timer_start[OPAL_CR_TIMER_CRCP1];
1318 }
1319 display_indv_timer_core(diff, label);
1320 free(label);
1321
1322
1323 label = strdup("Finish Entry Point");
1324 diff = timer_start[OPAL_CR_TIMER_ENTRY4] - timer_start[OPAL_CR_TIMER_CORE2];
1325 display_indv_timer_core(diff, label);
1326 free(label);
1327
1328 opal_output(0, "OPAL CR Timing: ******************** Summary End\n");
1329 }
1330
1331 #if OPAL_ENABLE_CRDEBUG == 1
1332 int opal_cr_debug_set_current_ckpt_thread_self(void)
1333 {
1334 int t;
1335
1336 if( NULL == opal_cr_debug_free_threads ) {
1337 opal_cr_debug_num_free_threads = 3;
1338 opal_cr_debug_free_threads = (opal_thread_t **)malloc(sizeof(opal_thread_t *) * opal_cr_debug_num_free_threads );
1339 for(t = 0; t < opal_cr_debug_num_free_threads; ++t ) {
1340 opal_cr_debug_free_threads[t] = NULL;
1341 }
1342 }
1343
1344 opal_cr_debug_free_threads[0] = opal_thread_get_self();
1345
1346 return OPAL_SUCCESS;
1347 }
1348
1349 int opal_cr_debug_clear_current_ckpt_thread(void)
1350 {
1351 opal_cr_debug_free_threads[0] = NULL;
1352
1353 return OPAL_SUCCESS;
1354 }
1355
1356 int MPIR_checkpoint_debugger_detach(void) {
1357
1358
1359 #if 0
1360
1361
1362 if( MPIR_debug_with_checkpoint ) {
1363 opal_cr_debug_threads_already_waiting = true;
1364 }
1365 #endif
1366 return OPAL_SUCCESS;
1367 }
1368
1369 void MPIR_checkpoint_debugger_signal_handler(int signo)
1370 {
1371 opal_output_verbose(1, opal_cr_output,
1372 "crs: MPIR_checkpoint_debugger_signal_handler(): Enter Debug signal handler...");
1373
1374 MPIR_checkpoint_debugger_waitpoint();
1375
1376 opal_output_verbose(1, opal_cr_output,
1377 "crs: MPIR_checkpoint_debugger_signal_handler(): Leave Debug signal handler...");
1378 }
1379
1380 void *MPIR_checkpoint_debugger_waitpoint(void)
1381 {
1382 int t;
1383 opal_thread_t *thr = NULL;
1384
1385 thr = opal_thread_get_self();
1386
1387
1388
1389
1390
1391 if( !MPIR_debug_with_checkpoint ) {
1392 opal_output_verbose(1, opal_cr_output,
1393 "crs: MPIR_checkpoint_debugger_waitpoint(): Debugger is not attaching... (%d)",
1394 (int)thr->t_handle);
1395 MPIR_checkpoint_debug_gate = 1;
1396 return NULL;
1397 }
1398 else {
1399 opal_output_verbose(1, opal_cr_output,
1400 "crs: MPIR_checkpoint_debugger_waitpoint(): Waiting for the Debugger to attach... (%d)",
1401 (int)thr->t_handle);
1402 MPIR_checkpoint_debug_gate = 0;
1403 }
1404
1405
1406
1407
1408 for(t = 0; t < opal_cr_debug_num_free_threads; ++t) {
1409 if( opal_cr_debug_free_threads[t] != NULL &&
1410 opal_thread_self_compare(opal_cr_debug_free_threads[t]) ) {
1411 opal_output_verbose(1, opal_cr_output,
1412 "crs: MPIR_checkpoint_debugger_waitpoint(): Checkpointing thread does not wait here... (%d)",
1413 (int)thr->t_handle);
1414 return NULL;
1415 }
1416 }
1417
1418
1419
1420
1421
1422
1423 if( opal_cr_debug_threads_already_waiting ) {
1424 opal_output_verbose(1, opal_cr_output,
1425 "crs: MPIR_checkpoint_debugger_waitpoint(): Threads are already waiting from debugger detach, do not wait here... (%d)",
1426 (int)thr->t_handle);
1427 return NULL;
1428 } else {
1429 opal_output_verbose(1, opal_cr_output,
1430 "crs: MPIR_checkpoint_debugger_waitpoint(): Wait... (%d)",
1431 (int)thr->t_handle);
1432 return MPIR_checkpoint_debugger_breakpoint();
1433 }
1434 }
1435
1436
1437
1438
1439
1440 void *MPIR_checkpoint_debugger_breakpoint(void)
1441 {
1442
1443 while (MPIR_checkpoint_debug_gate == 0) {
1444 #if defined(HAVE_USLEEP)
1445 usleep(100000);
1446 #else
1447 sleep(1);
1448 #endif
1449 }
1450 opal_cr_debug_threads_already_waiting = false;
1451 return NULL;
1452 }
1453 #endif