This source file includes the following definitions.
- notify_collectives
- ompi_cr_init
- ompi_cr_finalize
- ompi_cr_coord
- ompi_cr_coord_pre_ckpt
- ompi_cr_coord_pre_restart
- ompi_cr_coord_pre_continue
- ompi_cr_coord_post_ckpt
- ompi_cr_coord_post_restart
- ompi_cr_coord_post_continue
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 
  28 
  29 #include "ompi_config.h"
  30 
  31 #include <errno.h>
  32 #ifdef HAVE_UNISTD_H
  33 #include <unistd.h>
  34 #endif  
  35 #ifdef HAVE_FCNTL_H
  36 #include <fcntl.h>
  37 #endif  
  38 #ifdef HAVE_SYS_TYPES_H
  39 #include <sys/types.h>
  40 #endif  
  41 #ifdef HAVE_SYS_STAT_H
  42 #include <sys/stat.h>  
  43 #endif  
  44 
  45 #include "opal/mca/event/event.h"
  46 #include "opal/util/output.h"
  47 #include "opal/util/printf.h"
  48 #include "opal/mca/crs/crs.h"
  49 #include "opal/mca/crs/base/base.h"
  50 #include "opal/mca/installdirs/installdirs.h"
  51 #include "opal/runtime/opal_cr.h"
  52 #include "opal/mca/btl/base/base.h"
  53 
  54 #if OPAL_ENABLE_FT_CR == 1
  55 #include "orte/mca/snapc/snapc.h"
  56 #include "orte/mca/snapc/base/base.h"
  57 #endif
  58 
  59 #include "ompi/constants.h"
  60 #include "ompi/mca/pml/pml.h"
  61 #include "ompi/mca/pml/base/base.h"
  62 #include "ompi/mca/crcp/crcp.h"
  63 #include "ompi/mca/crcp/base/base.h"
  64 #include "ompi/communicator/communicator.h"
  65 #include "ompi/runtime/ompi_cr.h"
  66 #if OPAL_ENABLE_CRDEBUG == 1
  67 #include "ompi/debuggers/debuggers.h"
  68 #endif
  69 
/*
 * Symbols published for checkpoint-aware debuggers.  They are only built
 * when OPAL_ENABLE_CRDEBUG is 1 and are filled in by ompi_cr_init() when
 * MPIR_debug_with_checkpoint is set.
 */
#if OPAL_ENABLE_CRDEBUG == 1
/* Set to 1 by ompi_cr_init() once the command strings below are built. */
OMPI_DECLSPEC int MPIR_checkpointable = 0;
/* Host name taken from the HNP URI in ompi_cr_init(), or "localhost". */
OMPI_DECLSPEC char * MPIR_controller_hostname = NULL;
/* "<bindir>/ompi-checkpoint --crdebug --hnp-jobid <jobid>" */
OMPI_DECLSPEC char * MPIR_checkpoint_command  = NULL;
/* "<bindir>/ompi-restart --crdebug " */
OMPI_DECLSPEC char * MPIR_restart_command     = NULL;
/* "<bindir>/ompi-checkpoint -l --crdebug " */
OMPI_DECLSPEC char * MPIR_checkpoint_listing_command  = NULL;
#endif
  77 
  78 
  79 
  80 
/*
 * Hooks run by ompi_cr_coord() BEFORE it hands the state change to the
 * previously registered coordination callback.
 */
static int ompi_cr_coord_pre_ckpt(void);
static int ompi_cr_coord_pre_restart(void);
static int ompi_cr_coord_pre_continue(void);

/*
 * Hooks run by ompi_cr_coord() AFTER the previously registered
 * coordination callback has returned successfully.
 */
static int ompi_cr_coord_post_ckpt(void);
static int ompi_cr_coord_post_restart(void);
static int ompi_cr_coord_post_continue(void);
  88 
  89 
  90 
  91 
/*
 * Callback invoked by ompi_cr_coord() between its pre- and post-hooks.
 * Its address is passed to opal_cr_reg_coord_callback() in ompi_cr_init();
 * presumably that call stores the previously registered callback here.
 */
static opal_cr_coord_callback_fn_t  prev_coord_callback = NULL;

/* Output stream id used for all messages in this file; -1 until ompi_cr_init(). */
int ompi_cr_output = -1;
/* Verbosity level, registered as an MCA variable in ompi_cr_init(). */
int ompi_cr_verbosity = 0;
  96 
  97 #define NUM_COLLECTIVES 16
  98 
  99 #define SIGNAL(comm, modules, highest_module, msg, ret, func)   \
 100     do {                                                        \
 101         bool found = false;                                     \
 102         int k;                                                  \
 103         mca_coll_base_module_t *my_module =                     \
 104             comm->c_coll->coll_ ## func ## _module;             \
 105         if (NULL != my_module) {                                \
 106             for (k = 0 ; k < highest_module ; ++k) {            \
 107                 if (my_module == modules[k]) found = true;      \
 108             }                                                   \
 109             if (!found) {                                       \
 110                 modules[highest_module++] = my_module;          \
 111                 if (NULL != my_module->ft_event) {              \
 112                     ret = my_module->ft_event(msg);             \
 113                     if( OMPI_SUCCESS != ret ) {                 \
 114                         return ret;                             \
 115                     }                                           \
 116                 }                                               \
 117             }                                                   \
 118         }                                                       \
 119     } while (0)
 120 
 121 
 122 static int
 123 notify_collectives(int msg)
 124 {
 125     mca_coll_base_module_t *modules[NUM_COLLECTIVES];
 126     int i, max, ret, highest_module = 0;
 127 
 128     memset(&modules, 0, sizeof(mca_coll_base_module_t*) * NUM_COLLECTIVES);
 129 
 130     max = opal_pointer_array_get_size(&ompi_mpi_communicators);
 131     for (i = 0 ; i < max ; ++i) {
 132         ompi_communicator_t *comm =
 133             (ompi_communicator_t *)opal_pointer_array_get_item(&ompi_mpi_communicators, i);
 134         if (NULL == comm) continue;
 135 
 136         SIGNAL(comm, modules, highest_module, msg, ret, allgather);
 137         SIGNAL(comm, modules, highest_module, msg, ret, allgatherv);
 138         SIGNAL(comm, modules, highest_module, msg, ret, allreduce);
 139         SIGNAL(comm, modules, highest_module, msg, ret, alltoall);
 140         SIGNAL(comm, modules, highest_module, msg, ret, alltoallv);
 141         SIGNAL(comm, modules, highest_module, msg, ret, alltoallw);
 142         SIGNAL(comm, modules, highest_module, msg, ret, barrier);
 143         SIGNAL(comm, modules, highest_module, msg, ret, bcast);
 144         SIGNAL(comm, modules, highest_module, msg, ret, exscan);
 145         SIGNAL(comm, modules, highest_module, msg, ret, gather);
 146         SIGNAL(comm, modules, highest_module, msg, ret, gatherv);
 147         SIGNAL(comm, modules, highest_module, msg, ret, reduce);
 148         SIGNAL(comm, modules, highest_module, msg, ret, reduce_scatter);
 149         SIGNAL(comm, modules, highest_module, msg, ret, scan);
 150         SIGNAL(comm, modules, highest_module, msg, ret, scatter);
 151         SIGNAL(comm, modules, highest_module, msg, ret, scatterv);
 152     }
 153 
 154     return OMPI_SUCCESS;
 155 }
 156 
 157 
 158 
 159 
 160 
 161 int ompi_cr_init(void)
 162 {
 163     
 164 
 165 
 166     ompi_cr_verbosity = 0;
 167     (void) mca_base_var_register("ompi", "ompi", "cr", "verbose",
 168                                  "Verbose output for the OMPI Checkpoint/Restart functionality",
 169                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 170                                  OPAL_INFO_LVL_9,
 171                                  MCA_BASE_VAR_SCOPE_READONLY,
 172                                  &ompi_cr_verbosity);
 173     if(0 != ompi_cr_verbosity) {
 174         ompi_cr_output = opal_output_open(NULL);
 175         opal_output_set_verbosity(ompi_cr_output, ompi_cr_verbosity);
 176     } else {
 177         ompi_cr_output = opal_cr_output;
 178     }
 179 
 180     opal_output_verbose(10, ompi_cr_output,
 181                         "ompi_cr: init: ompi_cr_init()");
 182 
 183     
 184     opal_cr_reg_coord_callback(ompi_cr_coord, &prev_coord_callback);
 185 
 186 #if OPAL_ENABLE_CRDEBUG == 1
 187     
 188     if( MPIR_debug_with_checkpoint ) {
 189         char *uri = NULL;
 190         char *sep = NULL;
 191         char *hostname = NULL;
 192 
 193         
 194         MPIR_checkpointable = 1;
 195 
 196         
 197         
 198         opal_asprintf(&MPIR_checkpoint_command,
 199                  "%s/ompi-checkpoint --crdebug --hnp-jobid %u",
 200                  opal_install_dirs.bindir,
 201                  ORTE_PROC_MY_HNP->jobid);
 202         opal_asprintf(&MPIR_restart_command,
 203                  "%s/ompi-restart --crdebug ",
 204                  opal_install_dirs.bindir);
 205         opal_asprintf(&MPIR_checkpoint_listing_command,
 206                  "%s/ompi-checkpoint -l --crdebug ",
 207                  opal_install_dirs.bindir);
 208 
 209         
 210         uri = strdup(ompi_process_info.my_hnp_uri);
 211         hostname = strchr(uri, ';') + 1;
 212         sep = strchr(hostname, ';');
 213         if (sep) {
 214             *sep = 0;
 215         }
 216         if (strncmp(hostname, "tcp://", 6) == 0) {
 217             hostname += 6;
 218             sep = strchr(hostname, ':');
 219             *sep = 0;
 220             MPIR_controller_hostname = strdup(hostname);
 221         } else {
 222             MPIR_controller_hostname = strdup("localhost");
 223         }
 224 
 225         
 226         if( NULL != uri ) {
 227             free(uri);
 228             uri = NULL;
 229         }
 230     }
 231 #endif
 232 
 233     return OMPI_SUCCESS;
 234 }
 235 
 236 
 237 
 238 
 239 int ompi_cr_finalize(void)
 240 {
 241     opal_output_verbose(10, ompi_cr_output,
 242                         "ompi_cr: finalize: ompi_cr_finalize()");
 243 
 244     return OMPI_SUCCESS;
 245 }
 246 
 247 
 248 
 249 
 250 int ompi_cr_coord(int state)
 251 {
 252     int ret, exit_status = OMPI_SUCCESS;
 253 
 254     opal_output_verbose(10, ompi_cr_output,
 255                         "ompi_cr: coord: ompi_cr_coord(%s)\n",
 256                         opal_crs_base_state_str((opal_crs_state_type_t)state));
 257 
 258     
 259 
 260 
 261 
 262     if(OPAL_CRS_CHECKPOINT == state) {
 263         
 264         ret = ompi_cr_coord_pre_ckpt();
 265         if( ret == OMPI_EXISTS) {
 266             return ret;
 267         }
 268         else if( ret != OMPI_SUCCESS) {
 269             return ret;
 270         }
 271     }
 272     else if (OPAL_CRS_CONTINUE == state ) {
 273         
 274         ompi_cr_coord_pre_continue();
 275     }
 276     else if (OPAL_CRS_RESTART == state ) {
 277         
 278         ompi_cr_coord_pre_restart();
 279     }
 280     else if (OPAL_CRS_TERM == state ) {
 281         
 282     }
 283     else {
 284         
 285 
 286 
 287     }
 288 
 289     
 290 
 291 
 292     if(OMPI_SUCCESS != (ret = prev_coord_callback(state)) ) {
 293         exit_status = ret;
 294         goto cleanup;
 295     }
 296 
 297 
 298     
 299 
 300 
 301 
 302     if(OPAL_CRS_CHECKPOINT == state) {
 303         
 304         ompi_cr_coord_post_ckpt();
 305     }
 306     else if (OPAL_CRS_CONTINUE == state ) {
 307         
 308         ompi_cr_coord_post_continue();
 309 
 310 #if OPAL_ENABLE_CRDEBUG == 1
 311         
 312 
 313 
 314 
 315         if( MPIR_debug_with_checkpoint ) {
 316             MPIR_checkpoint_debugger_breakpoint();
 317         }
 318 #endif
 319     }
 320     else if (OPAL_CRS_RESTART == state ) {
 321         
 322         ompi_cr_coord_post_restart();
 323 
 324 #if OPAL_ENABLE_CRDEBUG == 1
 325         
 326 
 327 
 328 
 329         if( MPIR_debug_with_checkpoint ) {
 330             MPIR_checkpoint_debugger_breakpoint();
 331         }
 332 #endif
 333     }
 334     else if (OPAL_CRS_TERM == state ) {
 335         
 336     }
 337     else {
 338         
 339 
 340 
 341     }
 342 
 343  cleanup:
 344     return exit_status;
 345 }
 346 
 347 
 348 
 349 
 350 static int ompi_cr_coord_pre_ckpt(void) {
 351     int ret, exit_status = OMPI_SUCCESS;
 352 
 353     
 354 
 355 
 356     opal_output_verbose(10, ompi_cr_output,
 357                         "ompi_cr: coord_pre_ckpt: ompi_cr_coord_pre_ckpt()\n");
 358 
 359     
 360 
 361 
 362 
 363 
 364     if (OMPI_SUCCESS != (ret = notify_collectives(OPAL_CR_CHECKPOINT))) {
 365         goto cleanup;
 366     }
 367 
 368     
 369 
 370 
 371 
 372     if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CHECKPOINT))) {
 373         exit_status = ret;
 374         goto cleanup;
 375     }
 376 
 377  cleanup:
 378 
 379     return exit_status;
 380 }
 381 
 382 static int ompi_cr_coord_pre_restart(void) {
 383     int ret, exit_status = OMPI_SUCCESS;
 384 
 385     opal_output_verbose(10, ompi_cr_output,
 386                         "ompi_cr: coord_pre_restart: ompi_cr_coord_pre_restart()");
 387 
 388     
 389 
 390 
 391 
 392 
 393 
 394 
 395     if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_RESTART_PRE))) {
 396         exit_status = ret;
 397         goto cleanup;
 398     }
 399 
 400  cleanup:
 401     return exit_status;
 402 }
 403 
 404 static int ompi_cr_coord_pre_continue(void) {
 405 #if OPAL_ENABLE_FT_CR == 1
 406     int ret, exit_status = OMPI_SUCCESS;
 407 
 408     
 409 
 410 
 411 
 412     opal_output_verbose(10, ompi_cr_output,
 413                         "ompi_cr: coord_pre_continue: ompi_cr_coord_pre_continue()");
 414 
 415     if (opal_cr_continue_like_restart) {
 416         
 417         if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) {
 418             exit_status = ret;
 419             goto cleanup;
 420         }
 421     }
 422     else {
 423         if( opal_cr_timing_barrier_enabled ) {
 424             OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
 425         }
 426         OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
 427         if( opal_cr_timing_barrier_enabled ) {
 428             OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
 429         }
 430         OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
 431     }
 432 
 433  cleanup:
 434     return exit_status;
 435 #else
 436     return OMPI_SUCCESS;
 437 #endif
 438 }
 439 
 440 
 441 
 442 
 443 static int ompi_cr_coord_post_ckpt(void) {
 444     
 445 
 446 
 447 
 448     opal_output_verbose(10, ompi_cr_output,
 449                         "ompi_cr: coord_post_ckpt: ompi_cr_coord_post_ckpt()");
 450 
 451     return OMPI_SUCCESS;
 452 }
 453 
 454 static int ompi_cr_coord_post_restart(void) {
 455     int ret, exit_status = OMPI_SUCCESS;
 456 
 457     opal_output_verbose(10, ompi_cr_output,
 458                         "ompi_cr: coord_post_restart: ompi_cr_coord_post_restart()");
 459 
 460     
 461 
 462 
 463 
 464     if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_RESTART))) {
 465         exit_status = ret;
 466         goto cleanup;
 467     }
 468 
 469     
 470 
 471 
 472 
 473 
 474     if (OMPI_SUCCESS != (ret = notify_collectives(OPAL_CRS_RESTART))) {
 475         goto cleanup;
 476     }
 477 
 478  cleanup:
 479 
 480     return exit_status;
 481 }
 482 
 483 static int ompi_cr_coord_post_continue(void) {
 484     int ret, exit_status = OMPI_SUCCESS;
 485 
 486     opal_output_verbose(10, ompi_cr_output,
 487                         "ompi_cr: coord_post_continue: ompi_cr_coord_post_continue()");
 488 
 489     
 490 
 491 
 492 
 493     if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) {
 494         exit_status = ret;
 495         goto cleanup;
 496     }
 497 
 498     
 499 
 500 
 501 
 502 
 503     if (OMPI_SUCCESS != (ret = notify_collectives(OPAL_CRS_CONTINUE))) {
 504         goto cleanup;
 505     }
 506 
 507  cleanup:
 508 
 509     return exit_status;
 510 }