root/ompi/runtime/ompi_cr.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. notify_collectives
  2. ompi_cr_init
  3. ompi_cr_finalize
  4. ompi_cr_coord
  5. ompi_cr_coord_pre_ckpt
  6. ompi_cr_coord_pre_restart
  7. ompi_cr_coord_pre_continue
  8. ompi_cr_coord_post_ckpt
  9. ompi_cr_coord_post_restart
  10. ompi_cr_coord_post_continue

   1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
   2 /*
   3  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2017 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2012      The University of Wisconsin-La Crosse. All rights
  14  *                         reserved.
  15  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
  16  * $COPYRIGHT$
  17  *
  18  * Additional copyrights may follow
  19  *
  20  * $HEADER$
  21  */
  22 
  23 /** @file
  24  *
  25  * OMPI Layer Checkpoint/Restart Runtime functions
  26  *
  27  */
  28 
  29 #include "ompi_config.h"
  30 
  #include <errno.h>
  #include <stdlib.h>
  #include <string.h>
  #ifdef HAVE_UNISTD_H
  #include <unistd.h>
  #endif  /* HAVE_UNISTD_H */
  #ifdef HAVE_FCNTL_H
  #include <fcntl.h>
  #endif  /* HAVE_FCNTL_H */
  #ifdef HAVE_SYS_TYPES_H
  #include <sys/types.h>
  #endif  /* HAVE_SYS_TYPES_H */
  #ifdef HAVE_SYS_STAT_H
  #include <sys/stat.h>  /* for mkfifo */
  #endif  /* HAVE_SYS_STAT_H */
  44 
  45 #include "opal/mca/event/event.h"
  46 #include "opal/util/output.h"
  47 #include "opal/util/printf.h"
  48 #include "opal/mca/crs/crs.h"
  49 #include "opal/mca/crs/base/base.h"
  50 #include "opal/mca/installdirs/installdirs.h"
  51 #include "opal/runtime/opal_cr.h"
  52 #include "opal/mca/btl/base/base.h"
  53 
  54 #if OPAL_ENABLE_FT_CR == 1
  55 #include "orte/mca/snapc/snapc.h"
  56 #include "orte/mca/snapc/base/base.h"
  57 #endif
  58 
  59 #include "ompi/constants.h"
  60 #include "ompi/mca/pml/pml.h"
  61 #include "ompi/mca/pml/base/base.h"
  62 #include "ompi/mca/crcp/crcp.h"
  63 #include "ompi/mca/crcp/base/base.h"
  64 #include "ompi/communicator/communicator.h"
  65 #include "ompi/runtime/ompi_cr.h"
  66 #if OPAL_ENABLE_CRDEBUG == 1
  67 #include "ompi/debuggers/debuggers.h"
  68 #endif
  69 
  #if OPAL_ENABLE_CRDEBUG == 1
  /*
   * Variables published for C/R-enabled debugging.  They keep these
   * defaults until ompi_cr_init() fills them in, which it only does when
   * MPIR_debug_with_checkpoint is set.
   */

  /* Set to 1 by ompi_cr_init() to mark the job as debuggable with C/R */
  OMPI_DECLSPEC int    MPIR_checkpointable             = 0;

  /* Host name taken from the HNP contact URI, or "localhost" */
  OMPI_DECLSPEC char * MPIR_controller_hostname        = NULL;

  /* Command lines built from opal_install_dirs.bindir */
  OMPI_DECLSPEC char * MPIR_checkpoint_command         = NULL;
  OMPI_DECLSPEC char * MPIR_restart_command            = NULL;
  OMPI_DECLSPEC char * MPIR_checkpoint_listing_command = NULL;
  #endif
  77 
  78 /*************
  79  * Local functions
  80  *************/
  81 static int ompi_cr_coord_pre_ckpt(void);
  82 static int ompi_cr_coord_pre_restart(void);
  83 static int ompi_cr_coord_pre_continue(void);
  84 
  85 static int ompi_cr_coord_post_ckpt(void);
  86 static int ompi_cr_coord_post_restart(void);
  87 static int ompi_cr_coord_post_continue(void);
  88 
  89 /*************
  90  * Local vars
  91  *************/
  92 static opal_cr_coord_callback_fn_t  prev_coord_callback = NULL;
  93 
  94 int ompi_cr_output = -1;
  95 int ompi_cr_verbosity = 0;
  96 
  97 #define NUM_COLLECTIVES 16
  98 
  99 #define SIGNAL(comm, modules, highest_module, msg, ret, func)   \
 100     do {                                                        \
 101         bool found = false;                                     \
 102         int k;                                                  \
 103         mca_coll_base_module_t *my_module =                     \
 104             comm->c_coll->coll_ ## func ## _module;             \
 105         if (NULL != my_module) {                                \
 106             for (k = 0 ; k < highest_module ; ++k) {            \
 107                 if (my_module == modules[k]) found = true;      \
 108             }                                                   \
 109             if (!found) {                                       \
 110                 modules[highest_module++] = my_module;          \
 111                 if (NULL != my_module->ft_event) {              \
 112                     ret = my_module->ft_event(msg);             \
 113                     if( OMPI_SUCCESS != ret ) {                 \
 114                         return ret;                             \
 115                     }                                           \
 116                 }                                               \
 117             }                                                   \
 118         }                                                       \
 119     } while (0)
 120 
 121 
 122 static int
 123 notify_collectives(int msg)
 124 {
 125     mca_coll_base_module_t *modules[NUM_COLLECTIVES];
 126     int i, max, ret, highest_module = 0;
 127 
 128     memset(&modules, 0, sizeof(mca_coll_base_module_t*) * NUM_COLLECTIVES);
 129 
 130     max = opal_pointer_array_get_size(&ompi_mpi_communicators);
 131     for (i = 0 ; i < max ; ++i) {
 132         ompi_communicator_t *comm =
 133             (ompi_communicator_t *)opal_pointer_array_get_item(&ompi_mpi_communicators, i);
 134         if (NULL == comm) continue;
 135 
 136         SIGNAL(comm, modules, highest_module, msg, ret, allgather);
 137         SIGNAL(comm, modules, highest_module, msg, ret, allgatherv);
 138         SIGNAL(comm, modules, highest_module, msg, ret, allreduce);
 139         SIGNAL(comm, modules, highest_module, msg, ret, alltoall);
 140         SIGNAL(comm, modules, highest_module, msg, ret, alltoallv);
 141         SIGNAL(comm, modules, highest_module, msg, ret, alltoallw);
 142         SIGNAL(comm, modules, highest_module, msg, ret, barrier);
 143         SIGNAL(comm, modules, highest_module, msg, ret, bcast);
 144         SIGNAL(comm, modules, highest_module, msg, ret, exscan);
 145         SIGNAL(comm, modules, highest_module, msg, ret, gather);
 146         SIGNAL(comm, modules, highest_module, msg, ret, gatherv);
 147         SIGNAL(comm, modules, highest_module, msg, ret, reduce);
 148         SIGNAL(comm, modules, highest_module, msg, ret, reduce_scatter);
 149         SIGNAL(comm, modules, highest_module, msg, ret, scan);
 150         SIGNAL(comm, modules, highest_module, msg, ret, scatter);
 151         SIGNAL(comm, modules, highest_module, msg, ret, scatterv);
 152     }
 153 
 154     return OMPI_SUCCESS;
 155 }
 156 
 157 
 158 /*
 159  * CR Init
 160  */
 161 int ompi_cr_init(void)
 162 {
 163     /*
 164      * Register some MCA variables
 165      */
 166     ompi_cr_verbosity = 0;
 167     (void) mca_base_var_register("ompi", "ompi", "cr", "verbose",
 168                                  "Verbose output for the OMPI Checkpoint/Restart functionality",
 169                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 170                                  OPAL_INFO_LVL_9,
 171                                  MCA_BASE_VAR_SCOPE_READONLY,
 172                                  &ompi_cr_verbosity);
 173     if(0 != ompi_cr_verbosity) {
 174         ompi_cr_output = opal_output_open(NULL);
 175         opal_output_set_verbosity(ompi_cr_output, ompi_cr_verbosity);
 176     } else {
 177         ompi_cr_output = opal_cr_output;
 178     }
 179 
 180     opal_output_verbose(10, ompi_cr_output,
 181                         "ompi_cr: init: ompi_cr_init()");
 182 
 183     /* Register the OMPI interlevel coordination callback */
 184     opal_cr_reg_coord_callback(ompi_cr_coord, &prev_coord_callback);
 185 
 186 #if OPAL_ENABLE_CRDEBUG == 1
 187     /* Check for C/R enabled debugging */
 188     if( MPIR_debug_with_checkpoint ) {
 189         char *uri = NULL;
 190         char *sep = NULL;
 191         char *hostname = NULL;
 192 
 193         /* Mark as debuggable with C/R */
 194         MPIR_checkpointable = 1;
 195 
 196         /* Set the checkpoint and restart commands */
 197         /* Add the full path to the binary */
 198         opal_asprintf(&MPIR_checkpoint_command,
 199                  "%s/ompi-checkpoint --crdebug --hnp-jobid %u",
 200                  opal_install_dirs.bindir,
 201                  ORTE_PROC_MY_HNP->jobid);
 202         opal_asprintf(&MPIR_restart_command,
 203                  "%s/ompi-restart --crdebug ",
 204                  opal_install_dirs.bindir);
 205         opal_asprintf(&MPIR_checkpoint_listing_command,
 206                  "%s/ompi-checkpoint -l --crdebug ",
 207                  opal_install_dirs.bindir);
 208 
 209         /* Set contact information for HNP */
 210         uri = strdup(ompi_process_info.my_hnp_uri);
 211         hostname = strchr(uri, ';') + 1;
 212         sep = strchr(hostname, ';');
 213         if (sep) {
 214             *sep = 0;
 215         }
 216         if (strncmp(hostname, "tcp://", 6) == 0) {
 217             hostname += 6;
 218             sep = strchr(hostname, ':');
 219             *sep = 0;
 220             MPIR_controller_hostname = strdup(hostname);
 221         } else {
 222             MPIR_controller_hostname = strdup("localhost");
 223         }
 224 
 225         /* Cleanup */
 226         if( NULL != uri ) {
 227             free(uri);
 228             uri = NULL;
 229         }
 230     }
 231 #endif
 232 
 233     return OMPI_SUCCESS;
 234 }
 235 
 236 /*
 237  * Finalize
 238  */
 239 int ompi_cr_finalize(void)
 240 {
 241     opal_output_verbose(10, ompi_cr_output,
 242                         "ompi_cr: finalize: ompi_cr_finalize()");
 243 
 244     return OMPI_SUCCESS;
 245 }
 246 
 247 /*
 248  * Interlayer coordination callback
 249  */
 250 int ompi_cr_coord(int state)
 251 {
 252     int ret, exit_status = OMPI_SUCCESS;
 253 
 254     opal_output_verbose(10, ompi_cr_output,
 255                         "ompi_cr: coord: ompi_cr_coord(%s)\n",
 256                         opal_crs_base_state_str((opal_crs_state_type_t)state));
 257 
 258     /*
 259      * Before calling the previous callback, we have the opportunity to
 260      * take action given the state.
 261      */
 262     if(OPAL_CRS_CHECKPOINT == state) {
 263         /* Do Checkpoint Phase work */
 264         ret = ompi_cr_coord_pre_ckpt();
 265         if( ret == OMPI_EXISTS) {
 266             return ret;
 267         }
 268         else if( ret != OMPI_SUCCESS) {
 269             return ret;
 270         }
 271     }
 272     else if (OPAL_CRS_CONTINUE == state ) {
 273         /* Do Continue Phase work */
 274         ompi_cr_coord_pre_continue();
 275     }
 276     else if (OPAL_CRS_RESTART == state ) {
 277         /* Do Restart Phase work */
 278         ompi_cr_coord_pre_restart();
 279     }
 280     else if (OPAL_CRS_TERM == state ) {
 281         /* Do Continue Phase work in prep to terminate the application */
 282     }
 283     else {
 284         /* We must have been in an error state from the checkpoint
 285          * recreate everything, as in the Continue Phase
 286          */
 287     }
 288 
 289     /*
 290      * Call the previous callback, which should be ORTE [which will handle OPAL]
 291      */
 292     if(OMPI_SUCCESS != (ret = prev_coord_callback(state)) ) {
 293         exit_status = ret;
 294         goto cleanup;
 295     }
 296 
 297 
 298     /*
 299      * After calling the previous callback, we have the opportunity to
 300      * take action given the state to tidy up.
 301      */
 302     if(OPAL_CRS_CHECKPOINT == state) {
 303         /* Do Checkpoint Phase work */
 304         ompi_cr_coord_post_ckpt();
 305     }
 306     else if (OPAL_CRS_CONTINUE == state ) {
 307         /* Do Continue Phase work */
 308         ompi_cr_coord_post_continue();
 309 
 310 #if OPAL_ENABLE_CRDEBUG == 1
 311         /*
 312          * If C/R enabled debugging,
 313          * wait here for debugger to attach
 314          */
 315         if( MPIR_debug_with_checkpoint ) {
 316             MPIR_checkpoint_debugger_breakpoint();
 317         }
 318 #endif
 319     }
 320     else if (OPAL_CRS_RESTART == state ) {
 321         /* Do Restart Phase work */
 322         ompi_cr_coord_post_restart();
 323 
 324 #if OPAL_ENABLE_CRDEBUG == 1
 325         /*
 326          * If C/R enabled debugging,
 327          * wait here for debugger to attach
 328          */
 329         if( MPIR_debug_with_checkpoint ) {
 330             MPIR_checkpoint_debugger_breakpoint();
 331         }
 332 #endif
 333     }
 334     else if (OPAL_CRS_TERM == state ) {
 335         /* Do Continue Phase work in prep to terminate the application */
 336     }
 337     else {
 338         /* We must have been in an error state from the checkpoint
 339          * recreate everything, as in the Continue Phase
 340          */
 341     }
 342 
 343  cleanup:
 344     return exit_status;
 345 }
 346 
 347 /*************
 348  * Pre Lower Layer
 349  *************/
 350 static int ompi_cr_coord_pre_ckpt(void) {
 351     int ret, exit_status = OMPI_SUCCESS;
 352 
 353     /*
 354      * All the checkpoint heavey lifting in here...
 355      */
 356     opal_output_verbose(10, ompi_cr_output,
 357                         "ompi_cr: coord_pre_ckpt: ompi_cr_coord_pre_ckpt()\n");
 358 
 359     /*
 360      * Notify Collectives
 361      * - Need to do this on a per communicator basis
 362      *   Traverse all communicators...
 363      */
 364     if (OMPI_SUCCESS != (ret = notify_collectives(OPAL_CR_CHECKPOINT))) {
 365         goto cleanup;
 366     }
 367 
 368     /*
 369      * Notify PML
 370      *  - Will notify BML and BTL's
 371      */
 372     if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CHECKPOINT))) {
 373         exit_status = ret;
 374         goto cleanup;
 375     }
 376 
 377  cleanup:
 378 
 379     return exit_status;
 380 }
 381 
 382 static int ompi_cr_coord_pre_restart(void) {
 383     int ret, exit_status = OMPI_SUCCESS;
 384 
 385     opal_output_verbose(10, ompi_cr_output,
 386                         "ompi_cr: coord_pre_restart: ompi_cr_coord_pre_restart()");
 387 
 388     /*
 389      * Notify PML
 390      *  - Will notify BML and BTL's
 391      *  - The intention here is to have the PML shutdown all the old components
 392      *    and handles. On the second pass (once ORTE is restarted) we can
 393      *    reconnect processes.
 394      */
 395     if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_RESTART_PRE))) {
 396         exit_status = ret;
 397         goto cleanup;
 398     }
 399 
 400  cleanup:
 401     return exit_status;
 402 }
 403 
 404 static int ompi_cr_coord_pre_continue(void) {
 405 #if OPAL_ENABLE_FT_CR == 1
 406     int ret, exit_status = OMPI_SUCCESS;
 407 
 408     /*
 409      * Can not really do much until ORTE is up and running,
 410      * so defer action until the post_continue function.
 411      */
 412     opal_output_verbose(10, ompi_cr_output,
 413                         "ompi_cr: coord_pre_continue: ompi_cr_coord_pre_continue()");
 414 
 415     if (opal_cr_continue_like_restart) {
 416         /* Mimic ompi_cr_coord_pre_restart(); */
 417         if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) {
 418             exit_status = ret;
 419             goto cleanup;
 420         }
 421     }
 422     else {
 423         if( opal_cr_timing_barrier_enabled ) {
 424             OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
 425         }
 426         OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
 427         if( opal_cr_timing_barrier_enabled ) {
 428             OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
 429         }
 430         OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
 431     }
 432 
 433  cleanup:
 434     return exit_status;
 435 #else
 436     return OMPI_SUCCESS;
 437 #endif
 438 }
 439 
 440 /*************
 441  * Post Lower Layer
 442  *************/
 443 static int ompi_cr_coord_post_ckpt(void) {
 444     /*
 445      * Now that ORTE/OPAL are shutdown, we really can't do much
 446      * so assume pre_ckpt took care of everything.
 447      */
 448     opal_output_verbose(10, ompi_cr_output,
 449                         "ompi_cr: coord_post_ckpt: ompi_cr_coord_post_ckpt()");
 450 
 451     return OMPI_SUCCESS;
 452 }
 453 
 454 static int ompi_cr_coord_post_restart(void) {
 455     int ret, exit_status = OMPI_SUCCESS;
 456 
 457     opal_output_verbose(10, ompi_cr_output,
 458                         "ompi_cr: coord_post_restart: ompi_cr_coord_post_restart()");
 459 
 460     /*
 461      * Notify PML
 462      *  - Will notify BML and BTL's
 463      */
 464     if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_RESTART))) {
 465         exit_status = ret;
 466         goto cleanup;
 467     }
 468 
 469     /*
 470      * Notify Collectives
 471      * - Need to do this on a per communicator basis
 472      *   Traverse all communicators...
 473      */
 474     if (OMPI_SUCCESS != (ret = notify_collectives(OPAL_CRS_RESTART))) {
 475         goto cleanup;
 476     }
 477 
 478  cleanup:
 479 
 480     return exit_status;
 481 }
 482 
 483 static int ompi_cr_coord_post_continue(void) {
 484     int ret, exit_status = OMPI_SUCCESS;
 485 
 486     opal_output_verbose(10, ompi_cr_output,
 487                         "ompi_cr: coord_post_continue: ompi_cr_coord_post_continue()");
 488 
 489     /*
 490      * Notify PML
 491      *  - Will notify BML and BTL's
 492      */
 493     if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) {
 494         exit_status = ret;
 495         goto cleanup;
 496     }
 497 
 498     /*
 499      * Notify Collectives
 500      * - Need to do this on a per communicator basis
 501      *   Traverse all communicators...
 502      */
 503     if (OMPI_SUCCESS != (ret = notify_collectives(OPAL_CRS_CONTINUE))) {
 504         goto cleanup;
 505     }
 506 
 507  cleanup:
 508 
 509     return exit_status;
 510 }

/* [<][>][^][v][top][bottom][index][help] */