root/ompi/runtime/ompi_mpi_finalize.c


DEFINITIONS

This source file includes the following definitions.
  1. fence_cbfunc
  2. ompi_mpi_finalize

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2017 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2006-2018 Cisco Systems, Inc.  All rights reserved
  14  * Copyright (c) 2006-2014 Los Alamos National Security, LLC.  All rights
  15  *                         reserved.
  16  * Copyright (c) 2006      University of Houston. All rights reserved.
  17  * Copyright (c) 2009      Sun Microsystems, Inc.  All rights reserved.
  18  * Copyright (c) 2011      Sandia National Laboratories. All rights reserved.
  19  * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
  20  * Copyright (c) 2016      Research Organization for Information Science
  21  *                         and Technology (RIST). All rights reserved.
  22  *
  23  * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
  24  * $COPYRIGHT$
  25  *
  26  * Additional copyrights may follow
  27  *
  28  * $HEADER$
  29  */
  30 
  31 #include "ompi_config.h"
  32 
  33 #ifdef HAVE_SYS_TYPES_H
  34 #include <sys/types.h>
  35 #endif
  36 #ifdef HAVE_UNISTD_H
  37 #include <unistd.h>
  38 #endif
  39 #ifdef HAVE_SYS_PARAM_H
  40 #include <sys/param.h>
  41 #endif
  42 #ifdef HAVE_NETDB_H
  43 #include <netdb.h>
  44 #endif
  45 
  46 #include "opal/mca/event/event.h"
  47 #include "opal/util/output.h"
  48 #include "opal/runtime/opal_progress.h"
  49 #include "opal/mca/base/base.h"
  50 #include "opal/sys/atomic.h"
  51 #include "opal/runtime/opal.h"
  52 #include "opal/util/show_help.h"
  53 #include "opal/mca/mpool/base/base.h"
  54 #include "opal/mca/mpool/base/mpool_base_tree.h"
  55 #include "opal/mca/rcache/base/base.h"
  56 #include "opal/mca/allocator/base/base.h"
  57 #include "opal/mca/pmix/pmix.h"
  58 #include "opal/util/timings.h"
  59 
  60 #include "mpi.h"
  61 #include "ompi/constants.h"
  62 #include "ompi/errhandler/errcode.h"
  63 #include "ompi/communicator/communicator.h"
  64 #include "ompi/datatype/ompi_datatype.h"
  65 #include "ompi/message/message.h"
  66 #include "ompi/op/op.h"
  67 #include "ompi/file/file.h"
  68 #include "ompi/info/info.h"
  69 #include "ompi/runtime/mpiruntime.h"
  70 #include "ompi/attribute/attribute.h"
  71 #include "ompi/mca/pml/pml.h"
  72 #include "ompi/mca/bml/bml.h"
  73 #include "ompi/mca/pml/base/base.h"
  74 #include "ompi/mca/bml/base/base.h"
  75 #include "ompi/mca/osc/base/base.h"
  76 #include "ompi/mca/coll/base/base.h"
  77 #include "ompi/mca/rte/rte.h"
  78 #include "ompi/mca/rte/base/base.h"
  79 #include "ompi/mca/topo/base/base.h"
  80 #include "ompi/mca/io/io.h"
  81 #include "ompi/mca/io/base/base.h"
  82 #include "ompi/mca/pml/base/pml_base_bsend.h"
  83 #include "ompi/runtime/params.h"
  84 #include "ompi/dpm/dpm.h"
  85 #include "ompi/mpiext/mpiext.h"
  86 #include "ompi/mca/hook/base/base.h"
  87 
  88 #if OPAL_ENABLE_FT_CR == 1
  89 #include "ompi/mca/crcp/crcp.h"
  90 #include "ompi/mca/crcp/base/base.h"
  91 #endif
  92 #include "ompi/runtime/ompi_cr.h"
  93 
  94 extern bool ompi_enable_timing;
  95 
  96 static void fence_cbfunc(int status, void *cbdata)
  97 {
  98     volatile bool *active = (volatile bool*)cbdata;
  99     OPAL_ACQUIRE_OBJECT(active);
 100     *active = false;
 101     OPAL_POST_OBJECT(active);
 102 }
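
A minimal sketch of how this callback is meant to be used: the caller publishes a volatile completion flag, starts a non-blocking PMIx fence, and then lazily progresses until the callback clears the flag. This simply restates the fence code in ompi_mpi_finalize() below and assumes the OMPI/OPAL headers already included by this file.

    /* Sketch only -- mirrors the real usage later in this file. */
    volatile bool active = true;
    OPAL_POST_OBJECT(&active);                  /* publish the flag before handing it off */
    if (OMPI_SUCCESS == opal_pmix.fence_nb(NULL, 0, fence_cbfunc, (void*)&active)) {
        OMPI_LAZY_WAIT_FOR_COMPLETION(active);  /* cycles opal_progress() until fence_cbfunc() flips it */
    }
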
 103 
 104 int ompi_mpi_finalize(void)
 105 {
 106     int ret = MPI_SUCCESS;
 107     opal_list_item_t *item;
 108     ompi_proc_t** procs;
 109     size_t nprocs;
 110     volatile bool active;
 111     uint32_t key;
 112     ompi_datatype_t * datatype;
 113 
 114     ompi_hook_base_mpi_finalize_top();
 115 
 116     int32_t state = ompi_mpi_state;
 117     if (state < OMPI_MPI_STATE_INIT_COMPLETED ||
 118         state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
 119         /* Note that if we're not initialized or already finalized, we
 120            cannot raise an MPI exception.  The best that we can do is
 121            write something to stderr. */
 122         char hostname[OPAL_MAXHOSTNAMELEN];
 123         pid_t pid = getpid();
 124         gethostname(hostname, sizeof(hostname));
 125 
 126         if (state < OMPI_MPI_STATE_INIT_COMPLETED) {
 127             opal_show_help("help-mpi-runtime.txt",
 128                            "mpi_finalize: not initialized",
 129                            true, hostname, pid);
 130         } else if (state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
 131             opal_show_help("help-mpi-runtime.txt",
 132                            "mpi_finalize:invoked_multiple_times",
 133                            true, hostname, pid);
 134         }
 135         return MPI_ERR_OTHER;
 136     }
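
At the application level, the usual way to stay out of this error path is to guard the call with MPI_Initialized/MPI_Finalized. A hedged user-level sketch (standard MPI API, not OMPI internals):

    int initialized = 0, finalized = 0;
    MPI_Initialized(&initialized);
    MPI_Finalized(&finalized);
    if (initialized && !finalized) {
        MPI_Finalize();    /* safe: init has completed and finalize has not started */
    }
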
 137     opal_atomic_wmb();
 138     opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_FINALIZE_STARTED);
 139 
 140     ompi_mpiext_fini();
 141 
 142     /* Per MPI-2:4.8, we have to free MPI_COMM_SELF before doing
 143        anything else in MPI_FINALIZE (to include setting up such that
 144        MPI_FINALIZED will return true). */
 145 
 146     if (NULL != ompi_mpi_comm_self.comm.c_keyhash) {
 147         ompi_attr_delete_all(COMM_ATTR, &ompi_mpi_comm_self,
 148                              ompi_mpi_comm_self.comm.c_keyhash);
 149         OBJ_RELEASE(ompi_mpi_comm_self.comm.c_keyhash);
 150         ompi_mpi_comm_self.comm.c_keyhash = NULL;
 151     }
 152 
 153     /* Mark that we are past COMM_SELF destruction so that
 154        MPI_FINALIZED can return an accurate value (per MPI-3.1,
  155        MPI_FINALIZED needs to return FALSE until after
 156        COMM_SELF is destroyed / all the attribute callbacks have been
 157        invoked) */
 158     opal_atomic_wmb();
 159     opal_atomic_swap_32(&ompi_mpi_state,
 160                         OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
 161 
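
The standard-level consequence of this ordering is the well-known MPI_COMM_SELF "finalize hook": an attribute delete callback attached to COMM_SELF runs during MPI_Finalize, and MPI_Finalized still reports false at that point. A minimal user-level sketch (standard MPI API; self_delete_fn and keyval are illustrative names, not OMPI internals):

    #include <mpi.h>
    #include <stdio.h>

    static int self_delete_fn(MPI_Comm comm, int keyval, void *attr, void *extra)
    {
        int finalized;
        MPI_Finalized(&finalized);     /* still 0 here, per the comment above */
        printf("running inside MPI_Finalize, MPI_Finalized=%d\n", finalized);
        return MPI_SUCCESS;
    }

    int main(int argc, char **argv)
    {
        int keyval;
        MPI_Init(&argc, &argv);
        MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN, self_delete_fn, &keyval, NULL);
        MPI_Comm_set_attr(MPI_COMM_SELF, keyval, NULL);
        MPI_Finalize();                /* invokes self_delete_fn before FINALIZED becomes true */
        return 0;
    }
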
 162     /* As finalize is the last legal MPI call, we are allowed to force the release
  163      * of the user buffer used for bsend, before going any further.
 164      */
 165     (void)mca_pml_base_bsend_detach(NULL, NULL);
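
For context, the "user buffer used for bsend" is the one an application provides via MPI_Buffer_attach; the forced detach above releases it if the application never did. A hedged user-level sketch (standard MPI API; the buffer size is arbitrary; needs <mpi.h> and <stdlib.h>):

    int size = 1024 * 1024 + MPI_BSEND_OVERHEAD;
    char *buf = malloc(size);
    MPI_Buffer_attach(buf, size);         /* buffered sends (MPI_Bsend) draw from this buffer */
    /* ... MPI_Bsend(...) traffic ... */
    void *oldbuf; int oldsize;
    MPI_Buffer_detach(&oldbuf, &oldsize); /* what finalize forces on the app's behalf */
    free(oldbuf);
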
 166 
 167 #if OPAL_ENABLE_PROGRESS_THREADS == 0
 168     opal_progress_set_event_flag(OPAL_EVLOOP_ONCE | OPAL_EVLOOP_NONBLOCK);
 169 #endif
 170 
  171     /* Restore the event users increment that ORTE made and that MPI_Init
  172        undid (for better latency when not using TCP during the MPI lifetime) */
 173     opal_progress_event_users_increment();
 174 
 175     /* NOTE: MPI-2.1 requires that MPI_FINALIZE is "collective" across
 176        *all* connected processes.  This only means that all processes
 177        have to call it.  It does *not* mean that all connected
 178        processes need to synchronize (either directly or indirectly).
 179 
 180        For example, it is quite easy to construct complicated
 181        scenarios where one job is "connected" to another job via
 182        transitivity, but have no direct knowledge of each other.
 183        Consider the following case: job A spawns job B, and job B
 184        later spawns job C.  A "connectedness" graph looks something
 185        like this:
 186 
 187            A <--> B <--> C
 188 
 189        So what are we *supposed* to do in this case?  If job A is
 190        still connected to B when it calls FINALIZE, should it block
 191        until jobs B and C also call FINALIZE?
 192 
 193        After lengthy discussions many times over the course of this
 194        project, the issue was finally decided at the Louisville Feb
 195        2009 meeting: no.
 196 
 197        Rationale:
 198 
 199        - "Collective" does not mean synchronizing.  It only means that
  200          every process calls it.  Hence, in this scenario, every
 201          process in A, B, and C must call FINALIZE.
 202 
 203        - KEY POINT: if A calls FINALIZE, then it is erroneous for B or
 204          C to try to communicate with A again.
 205 
 206        - Hence, OMPI is *correct* to only effect a barrier across each
  207          job's MPI_COMM_WORLD before exiting.  Specifically, if A
 208          calls FINALIZE long before B or C, it's *correct* if A exits
 209          at any time (and doesn't notify B or C that it is exiting).
 210 
 211        - Arguably, if B or C do try to communicate with the now-gone
 212          A, OMPI should try to print a nice error ("you tried to
 213          communicate with a job that is already gone...") instead of
 214          segv or other Badness.  However, that is an *extremely*
 215          difficult problem -- sure, it's easy for A to tell B that it
 216          is finalizing, but how can A tell C?  A doesn't even know
 217          about C.  You'd need to construct a "connected" graph in a
 218          distributed fashion, which is fraught with race conditions,
 219          etc.
 220 
 221       Hence, our conclusion is: OMPI is *correct* in its current
 222       behavior (of only doing a barrier across its own COMM_WORLD)
  223       before exiting.  Any problems that occur are a result of
 224       erroneous MPI applications.  We *could* tighten up the erroneous
 225       cases and ensure that we print nice error messages / don't
 226       crash, but that is such a difficult problem that we decided we
 227       have many other, much higher priority issues to handle that deal
 228       with non-erroneous cases. */
 229 
 230     /* Wait for everyone to reach this point.  This is a PMIx
 231        barrier instead of an MPI barrier for (at least) two reasons:
 232 
 233        1. An MPI barrier doesn't ensure that all messages have been
 234           transmitted before exiting (e.g., a BTL can lie and buffer a
 235           message without actually injecting it to the network, and
 236           therefore require further calls to that BTL's progress), so
 237           the possibility of a stranded message exists.
 238 
 239        2. If the MPI communication is using an unreliable transport,
 240           there's a problem of knowing that everyone has *left* the
 241           barrier.  E.g., one proc can send its ACK to the barrier
 242           message to a peer and then leave the barrier, but the ACK
 243           can get lost and therefore the peer is left in the barrier.
 244 
 245        Point #1 has been known for a long time; point #2 emerged after
 246        we added the first unreliable BTL to Open MPI and fixed the
 247        del_procs behavior around May of 2014 (see
 248        https://svn.open-mpi.org/trac/ompi/ticket/4669#comment:4 for
 249        more details). */
 250     if (!ompi_async_mpi_finalize) {
 251         if (NULL != opal_pmix.fence_nb) {
 252             active = true;
 253             OPAL_POST_OBJECT(&active);
 254             /* Note that use of the non-blocking PMIx fence will
 255              * allow us to lazily cycle calling
 256              * opal_progress(), which will allow any other pending
 257              * communications/actions to complete.  See
 258              * https://github.com/open-mpi/ompi/issues/1576 for the
 259              * original bug report. */
 260             if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, 0, fence_cbfunc,
 261                                                           (void*)&active))) {
 262                 OMPI_ERROR_LOG(ret);
 263                 /* Reset the active flag to false, to avoid waiting for
  264                  * completion when the fence failed. */
 265                 active = false;
 266             }
 267             OMPI_LAZY_WAIT_FOR_COMPLETION(active);
 268         } else {
 269             /* However, we cannot guarantee that the provided PMIx has
 270              * fence_nb.  If it doesn't, then do the best we can: an MPI
 271              * barrier on COMM_WORLD (which isn't the best because of the
 272              * reasons cited above), followed by a blocking PMIx fence
 273              * (which does not call opal_progress()). */
 274             ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
 275             comm->c_coll->coll_barrier(comm, comm->c_coll->coll_barrier_module);
 276 
 277             if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
 278                 OMPI_ERROR_LOG(ret);
 279             }
 280         }
 281     }
 282 
 283     /*
 284      * Shutdown the Checkpoint/Restart Mech.
 285      */
 286     if (OMPI_SUCCESS != (ret = ompi_cr_finalize())) {
 287         OMPI_ERROR_LOG(ret);
 288     }
 289 
  290     /* Shut down any bindings-specific resources: C++, F77, F90 */
 291 
  292     /* Remove all memory associated with MPI_REGISTER_DATAREP (per
 293        MPI-2:9.5.3, there is no way for an MPI application to
 294        *un*register datareps, but we don't want the OMPI layer causing
 295        memory leaks). */
 296     while (NULL != (item = opal_list_remove_first(&ompi_registered_datareps))) {
 297         OBJ_RELEASE(item);
 298     }
 299     OBJ_DESTRUCT(&ompi_registered_datareps);
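
For context, a hedged user-level sketch of what puts entries on this list (standard MPI-IO API; the datarep name "my_native_copy" and my_extent_fn are illustrative, not OMPI internals):

    /* Datareps can be registered but never unregistered by the application,
     * which is why the library frees the bookkeeping here.  Needs <mpi.h>. */
    static int my_extent_fn(MPI_Datatype dtype, MPI_Aint *file_extent, void *extra_state)
    {
        MPI_Aint lb;
        return MPI_Type_get_extent(dtype, &lb, file_extent);
    }

    /* somewhere after MPI_Init(): */
    MPI_Register_datarep("my_native_copy",
                         MPI_CONVERSION_FN_NULL,   /* read: no conversion */
                         MPI_CONVERSION_FN_NULL,   /* write: no conversion */
                         my_extent_fn, NULL);
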
 300 
 301     /* Remove all F90 types from the hash tables */
 302     OPAL_HASH_TABLE_FOREACH(key, uint32, datatype, &ompi_mpi_f90_integer_hashtable)
 303         OBJ_RELEASE(datatype);
 304     OBJ_DESTRUCT(&ompi_mpi_f90_integer_hashtable);
 305     OPAL_HASH_TABLE_FOREACH(key, uint32, datatype, &ompi_mpi_f90_real_hashtable)
 306         OBJ_RELEASE(datatype);
 307     OBJ_DESTRUCT(&ompi_mpi_f90_real_hashtable);
 308     OPAL_HASH_TABLE_FOREACH(key, uint32, datatype, &ompi_mpi_f90_complex_hashtable)
 309         OBJ_RELEASE(datatype);
 310     OBJ_DESTRUCT(&ompi_mpi_f90_complex_hashtable);
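
These hash tables are the cache behind the parameterized Fortran type constructors. A tiny user-level illustration of what gets cached (standard MPI API, also callable from C; the precision/range values are arbitrary):

    MPI_Datatype t;
    MPI_Type_create_f90_real(15, 300, &t);  /* keyed by (precision, range); released by the loops above */
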
 311 
 312     /* Free communication objects */
 313 
 314     /* free file resources */
 315     if (OMPI_SUCCESS != (ret = ompi_file_finalize())) {
 316         goto done;
 317     }
 318 
 319     /* free window resources */
 320     if (OMPI_SUCCESS != (ret = ompi_win_finalize())) {
 321         goto done;
 322     }
 323     if (OMPI_SUCCESS != (ret = ompi_osc_base_finalize())) {
 324         goto done;
 325     }
 326 
 327     /* free communicator resources. this MUST come before finalizing the PML
 328      * as this will call into the pml */
 329     if (OMPI_SUCCESS != (ret = ompi_comm_finalize())) {
 330         goto done;
 331     }
 332 
 333     /* call del_procs on all allocated procs even though some may not be known
 334      * to the pml layer. the pml layer is expected to be resilient and ignore
 335      * any unknown procs. */
 336     nprocs = 0;
 337     procs = ompi_proc_get_allocated (&nprocs);
 338     MCA_PML_CALL(del_procs(procs, nprocs));
 339     free(procs);
 340 
  341     /* free pml resources */
  342     if (OMPI_SUCCESS != (ret = mca_pml_base_finalize())) {
 343         goto done;
 344     }
 345 
 346     /* free requests */
 347     if (OMPI_SUCCESS != (ret = ompi_request_finalize())) {
 348         goto done;
 349     }
 350 
 351     if (OMPI_SUCCESS != (ret = ompi_message_finalize())) {
 352         goto done;
 353     }
 354 
 355     /* If requested, print out a list of memory allocated by ALLOC_MEM
 356        but not freed by FREE_MEM */
 357     if (0 != ompi_debug_show_mpi_alloc_mem_leaks) {
 358         mca_mpool_base_tree_print(ompi_debug_show_mpi_alloc_mem_leaks);
 359     }
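
What the report covers, from the application's point of view, is memory obtained with MPI_Alloc_mem and never returned with MPI_Free_mem. A short user-level sketch (standard MPI API; the size is arbitrary):

    void *p;
    MPI_Alloc_mem((MPI_Aint)4096, MPI_INFO_NULL, &p);   /* tracked by the mpool tree */
    /* ... use p, e.g. as an RMA or message buffer ... */
    MPI_Free_mem(p);    /* omit this and the report above would flag the allocation */
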
 360 
 361     /* Now that all MPI objects dealing with communications are gone,
 362        shut down MCA types having to do with communications */
 363     if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_pml_base_framework) ) ) {
 364         OMPI_ERROR_LOG(ret);
 365         goto done;
 366     }
 367 
 368     /* shut down buffered send code */
 369     mca_pml_base_bsend_fini();
 370 
 371 #if OPAL_ENABLE_FT_CR == 1
 372     /*
 373      * Shutdown the CRCP Framework, must happen after PML shutdown
 374      */
 375     if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_crcp_base_framework) ) ) {
 376         OMPI_ERROR_LOG(ret);
 377         goto done;
 378     }
 379 #endif
 380 
 381     /* Free secondary resources */
 382 
 383     /* free attr resources */
 384     if (OMPI_SUCCESS != (ret = ompi_attr_finalize())) {
 385         goto done;
 386     }
 387 
 388     /* free group resources */
 389     if (OMPI_SUCCESS != (ret = ompi_group_finalize())) {
 390         goto done;
 391     }
 392 
 393     /* finalize the DPM subsystem */
 394     if ( OMPI_SUCCESS != (ret = ompi_dpm_finalize())) {
 395         goto done;
 396     }
 397 
 398     /* free internal error resources */
 399     if (OMPI_SUCCESS != (ret = ompi_errcode_intern_finalize())) {
 400         goto done;
 401     }
 402 
 403     /* free error code resources */
 404     if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_finalize())) {
 405         goto done;
 406     }
 407 
 408     /* free errhandler resources */
 409     if (OMPI_SUCCESS != (ret = ompi_errhandler_finalize())) {
 410         goto done;
 411     }
 412 
 413     /* Free all other resources */
 414 
 415     /* free op resources */
 416     if (OMPI_SUCCESS != (ret = ompi_op_finalize())) {
 417         goto done;
 418     }
 419 
 420     /* free ddt resources */
 421     if (OMPI_SUCCESS != (ret = ompi_datatype_finalize())) {
 422         goto done;
 423     }
 424 
 425     /* free info resources */
 426     if (OMPI_SUCCESS != (ret = ompi_mpiinfo_finalize())) {
 427         goto done;
 428     }
 429 
 430     /* Close down MCA modules */
 431 
 432     /* io is opened lazily, so it's only necessary to close it if it
 433        was actually opened */
 434     if (0 < ompi_io_base_framework.framework_refcnt) {
 435         /* May have been "opened" multiple times. We want it closed now */
 436         ompi_io_base_framework.framework_refcnt = 1;
 437 
  438         if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_io_base_framework))) {
 439             goto done;
 440         }
 441     }
 442     (void) mca_base_framework_close(&ompi_topo_base_framework);
 443     if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_osc_base_framework))) {
 444         goto done;
 445     }
 446     if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_coll_base_framework))) {
 447         goto done;
 448     }
 449     if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_bml_base_framework))) {
 450         goto done;
 451     }
 452     if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_mpool_base_framework))) {
 453         goto done;
 454     }
 455     if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_rcache_base_framework))) {
 456         goto done;
 457     }
 458     if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_allocator_base_framework))) {
 459         goto done;
 460     }
 461 
 462     /* free proc resources */
 463     if ( OMPI_SUCCESS != (ret = ompi_proc_finalize())) {
 464         goto done;
 465     }
 466 
 467     if (NULL != ompi_mpi_main_thread) {
 468         OBJ_RELEASE(ompi_mpi_main_thread);
 469         ompi_mpi_main_thread = NULL;
 470     }
 471 
 472     /* Clean up memory/resources from the MPI dynamic process
 473        functionality checker */
 474     ompi_mpi_dynamics_finalize();
 475 
 476     /* Leave the RTE */
 477 
 478     if (OMPI_SUCCESS != (ret = ompi_rte_finalize())) {
 479         goto done;
 480     }
 481     ompi_rte_initialized = false;
 482 
 483     /* now close the rte framework */
 484     if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_rte_base_framework) ) ) {
 485         OMPI_ERROR_LOG(ret);
 486         goto done;
 487     }
 488 
 489     /* Now close the hook framework */
 490     if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_hook_base_framework) ) ) {
 491         OMPI_ERROR_LOG(ret);
 492         goto done;
 493     }
 494 
 495     if (OPAL_SUCCESS != (ret = opal_finalize_util())) {
 496         goto done;
 497     }
 498 
 499     if (0 == opal_initialized) {
  500         /* if there is no outstanding MPI_T_init_thread (i.e., every such call
  501          * has been matched by MPI_T_finalize), then be gentle to the app and
  502          * release all the memory now (instead of in the opal library destructor) */
 503         opal_class_finalize();
 504     }
 505 
 506     /* cleanup environment */
 507     opal_unsetenv("OMPI_COMMAND", &environ);
 508     opal_unsetenv("OMPI_ARGV", &environ);
 509 
 510     /* All done */
 511 
 512   done:
 513     opal_atomic_wmb();
 514     opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_FINALIZE_COMPLETED);
 515 
 516     ompi_hook_base_mpi_finalize_bottom();
 517 
 518     return ret;
 519 }
