root/orte/mca/ess/pmi/ess_pmi_module.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions:
  1. rte_init
  2. rte_finalize
  3. rte_abort

   1 /*
   2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2011 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2008-2012 Cisco Systems, Inc.  All rights reserved.
  13  * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
  14  *                         All rights reserved.
  15  * Copyright (c) 2013-2018 Intel, Inc.  All rights reserved.
  16  * Copyright (c) 2016-2017 Research Organization for Information Science
  17  *                         and Technology (RIST). All rights reserved.
  18  * Copyright (c) 2018      Mellanox Technologies, Inc.
  19  *                         All rights reserved.
  20  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
  21  * $COPYRIGHT$
  22  *
  23  * Additional copyrights may follow
  24  *
  25  * $HEADER$
  26  *
  27  */
  28 
  29 #include "orte_config.h"
  30 #include "orte/constants.h"
  31 
  32 #ifdef HAVE_UNISTD_H
  33 #include <unistd.h>
  34 #endif  /* HAVE_UNISTD_H */
  35 #include <string.h>
  36 #include <ctype.h>
  37 #ifdef HAVE_NETDB_H
  38 #include <netdb.h>
  39 #endif
  40 #ifdef HAVE_IFADDRS_H
  41 #include <ifaddrs.h>
  42 #endif
  43 #include <sys/mman.h>
  44 #include <errno.h>
  45 #include <fcntl.h>
  46 
  47 #include "opal/util/opal_environ.h"
  48 #include "opal/util/output.h"
  49 #include "opal/util/arch.h"
  50 #include "opal/util/argv.h"
  51 #include "opal/runtime/opal_progress_threads.h"
  52 #include "opal/class/opal_pointer_array.h"
  53 #include "opal/mca/hwloc/base/base.h"
  54 #include "opal/util/printf.h"
  55 #include "opal/util/proc.h"
  56 #include "opal/mca/pmix/pmix.h"
  57 #include "opal/mca/pmix/base/base.h"
  58 #include "opal/util/timings.h"
  59 
  60 #include "orte/mca/errmgr/base/base.h"
  61 #include "orte/mca/filem/base/base.h"
  62 #include "orte/mca/grpcomm/grpcomm.h"
  63 #include "orte/mca/rml/rml.h"
  64 #include "orte/mca/rml/base/rml_contact.h"
  65 #include "orte/mca/schizo/schizo.h"
  66 #include "orte/mca/state/base/base.h"
  67 #include "orte/util/proc_info.h"
  68 #include "orte/util/session_dir.h"
  69 #include "orte/util/show_help.h"
  70 #include "orte/util/name_fns.h"
  71 #include "orte/util/pre_condition_transports.h"
  72 #include "orte/runtime/orte_globals.h"
  73 #include "orte/runtime/orte_wait.h"
  74 
  75 #include "orte/mca/ess/ess.h"
  76 #include "orte/mca/ess/base/base.h"
  77 #include "orte/mca/ess/pmi/ess_pmi.h"
  78 
  79 static int rte_init(void);
  80 static int rte_finalize(void);
  81 static void rte_abort(int error_code, bool report);
  82 
  83 orte_ess_base_module_t orte_ess_pmi_module = {
  84     rte_init,
  85     rte_finalize,
  86     rte_abort,
  87     NULL /* ft_event */
  88 };
  89 
  90 static bool added_transport_keys=false;
  91 static bool added_num_procs = false;
  92 static bool added_app_ctx = false;
  93 static bool progress_thread_running = false;
  94 static bool direct_launched = false;
  95 
  96 /****    MODULE FUNCTIONS    ****/
  97 
  98 static int rte_init(void)
  99 {
 100     int ret;
 101     char *error = NULL;
 102     char *envar, *ev1, *ev2;
 103     uint64_t unique_key[2];
 104     char *string_key;
 105     opal_value_t *kv;
 106     char *val;
 107     int u32, *u32ptr;
 108     uint16_t u16, *u16ptr;
 109     char **peers=NULL, *mycpuset;
 110     opal_process_name_t wildcard_rank, pname;
 111     bool bool_val, *bool_ptr = &bool_val, tdir_mca_override = false;
 112     size_t i;
 113 
 114     OPAL_TIMING_ENV_INIT(rte_init);
 115 
 116     /* run the prolog */
 117     if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
 118         error = "orte_ess_base_std_prolog";
 119         goto error;
 120     }
 121     OPAL_TIMING_ENV_NEXT(rte_init, "orte_ess_base_std_prolog");
 122 
 123     /* get an async event base - we use the opal_async one so
 124      * we don't startup extra threads if not needed */
 125     orte_event_base = opal_progress_thread_init(NULL);
 126     progress_thread_running = true;
 127     OPAL_TIMING_ENV_NEXT(rte_init, "progress_thread_init");
 128 
 129     /* open and setup pmix */
 130     if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
 131         ORTE_ERROR_LOG(ret);
 132         /* we cannot run */
 133         error = "pmix init";
 134         goto error;
 135     }
 136     if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) {
 137         /* we cannot run */
 138         error = "pmix init";
 139         goto error;
 140     }
 141     /* set the event base */
 142     opal_pmix_base_set_evbase(orte_event_base);
 143     OPAL_TIMING_ENV_NEXT(rte_init, "pmix_framework_open");
 144 
 145     /* see if we were direct launched */
 146     if (ORTE_SCHIZO_DIRECT_LAUNCHED == orte_schizo.check_launch_environment()) {
 147         direct_launched = true;
 148     }
 149 
 150     /* initialize the selected module */
 151     if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) {
 152         /* we cannot run - this could be due to being direct launched
 153          * without the required PMI support being built. Try to detect
 154          * that scenario and warn the user */
 155         if (direct_launched && NULL != (envar = getenv("ORTE_SCHIZO_DETECTION"))) {
 156             if (0 == strcmp(envar, "SLURM")) {
 157                 /* yes to both - so emit a hopefully helpful
 158                  * error message and abort */
 159                 orte_show_help_finalize();
 160                 orte_show_help("help-ess-base.txt", "slurm-error", true);
 161                 return ORTE_ERR_SILENT;
 162             } else if (0 == strcmp(envar, "ALPS")) {
 163                 /* we were direct launched by ALPS */
 164                 orte_show_help_finalize();
 165                 orte_show_help("help-ess-base.txt", "alps-error", true);
 166                 return ORTE_ERR_SILENT;
 167             }
 168         }
 169         error = "pmix init";
 170         goto error;
 171     }
 172     u32ptr = &u32;
 173     u16ptr = &u16;
 174 
 175     /****   THE FOLLOWING ARE REQUIRED VALUES   ***/
 176     /* pmix.init set our process name down in the OPAL layer,
 177      * so carry it forward here */
 178     ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid;
 179     ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid;
 180 
 181     /* setup a name for retrieving data associated with the job */
 182     wildcard_rank.jobid = ORTE_PROC_MY_NAME->jobid;
 183     wildcard_rank.vpid = ORTE_NAME_WILDCARD->vpid;
 184 
 185     /* setup a name for retrieving proc-specific data */
 186     pname.jobid = ORTE_PROC_MY_NAME->jobid;
 187     pname.vpid = 0;
 188 
 189     OPAL_TIMING_ENV_NEXT(rte_init, "pmix_init");
 190 
 191     /* get our local rank from PMI */
 192     OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK,
 193                           ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
 194     if (OPAL_SUCCESS != ret) {
 195         error = "getting local rank";
 196         goto error;
 197     }
 198     orte_process_info.my_local_rank = u16;
 199 
 200     /* get our node rank from PMI */
 201     OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK,
 202                           ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
 203     if (OPAL_SUCCESS != ret) {
 204         error = "getting node rank";
 205         goto error;
 206     }
 207     orte_process_info.my_node_rank = u16;
 208 
 209     /* get max procs for this application */
 210     OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_MAX_PROCS,
 211                           &wildcard_rank, &u32ptr, OPAL_UINT32);
 212     if (OPAL_SUCCESS != ret) {
 213         error = "getting max procs";
 214         goto error;
 215     }
 216     orte_process_info.max_procs = u32;
 217 
 218     /* get job size */
 219     OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_JOB_SIZE,
 220                           &wildcard_rank, &u32ptr, OPAL_UINT32);
 221     if (OPAL_SUCCESS != ret) {
 222         error = "getting job size";
 223         goto error;
 224     }
 225     orte_process_info.num_procs = u32;
 226 
 227     /* push into the environ for pickup in MPI layer for
 228      * MPI-3 required info key
 229      */
 230     if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) {
 231         opal_asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d", orte_process_info.num_procs);
 232         putenv(ev1);
 233         added_num_procs = true;
 234     }
 235     if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) {
 236         opal_asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", orte_process_info.num_procs);
 237         putenv(ev2);
 238         added_app_ctx = true;
 239     }
 240 
 241 
 242     /* get our app number from PMI - ok if not found */
 243     OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_APPNUM,
 244                                    ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
 245     if (OPAL_SUCCESS == ret) {
 246         orte_process_info.app_num = u32;
 247     } else {
 248         orte_process_info.app_num = 0;
 249     }
 250 
 251     /* get the number of local peers - required for wireup of
 252      * shared memory BTL */
 253     OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_SIZE,
 254                           &wildcard_rank, &u32ptr, OPAL_UINT32);
 255     if (OPAL_SUCCESS == ret) {
 256         orte_process_info.num_local_peers = u32 - 1;  // want number besides ourselves
 257     } else {
 258         orte_process_info.num_local_peers = 0;
 259     }
 260 
 261     /* get number of nodes in the job */
 262     OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NUM_NODES,
 263                                    &wildcard_rank, &u32ptr, OPAL_UINT32);
 264     if (OPAL_SUCCESS == ret) {
 265         orte_process_info.num_nodes = u32;
 266     }
 267     OPAL_TIMING_ENV_NEXT(rte_init, "pmix_get_job_params");
 268 
 269     /* setup transport keys in case the MPI layer needs them -
 270      * we can use the jobfam and stepid as unique keys
 271      * because they are unique values assigned by the RM
 272      */
 273     if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) {
 274         unique_key[0] = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
 275         unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid);
 276         if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
 277             ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
 278             return ORTE_ERR_OUT_OF_RESOURCE;
 279         }
 280         opal_output_verbose(2, orte_ess_base_framework.framework_output,
 281                             "%s transport key %s",
 282                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), string_key);
 283         opal_asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
 284         putenv(envar);
 285         added_transport_keys = true;
 286         /* cannot free the envar as that messes up our environ */
 287         free(string_key);
 288     }
 289     OPAL_TIMING_ENV_NEXT(rte_init, "orte_precondition_transport");
 290 
 291     /* retrieve temp directories info */
 292     OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TMPDIR, &wildcard_rank, &val, OPAL_STRING);
 293     if (OPAL_SUCCESS == ret && NULL != val) {
 294         /* We want to provide user with ability
 295          * to override RM settings at his own risk
 296          */
 297         if( NULL == orte_process_info.top_session_dir ){
 298             orte_process_info.top_session_dir = val;
 299         } else {
 300             /* keep the MCA setting */
 301             tdir_mca_override = true;
 302             free(val);
 303         }
 304         val = NULL;
 305     }
 306 
 307     if( !tdir_mca_override ){
 308         OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NSDIR, &wildcard_rank, &val, OPAL_STRING);
 309         if (OPAL_SUCCESS == ret && NULL != val) {
 310             /* We want to provide user with ability
 311              * to override RM settings at his own risk
 312              */
 313             if( NULL == orte_process_info.job_session_dir ){
 314                 orte_process_info.job_session_dir = val;
 315             } else {
 316                 /* keep the MCA setting */
 317                 free(val);
 318                 tdir_mca_override = true;
 319             }
 320             val = NULL;
 321         }
 322     }
 323 
 324     if( !tdir_mca_override ){
 325         OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_PROCDIR, &wildcard_rank, &val, OPAL_STRING);
 326         if (OPAL_SUCCESS == ret && NULL != val) {
 327             /* We want to provide user with ability
 328              * to override RM settings at his own risk
 329              */
 330             if( NULL == orte_process_info.proc_session_dir ){
 331                 orte_process_info.proc_session_dir = val;
 332             } else {
 333                 /* keep the MCA setting */
 334                 tdir_mca_override = true;
 335                 free(val);
 336             }
 337             val = NULL;
 338         }
 339     }
 340 
 341     if( !tdir_mca_override ){
 342         OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TDIR_RMCLEAN, &wildcard_rank, &bool_ptr, OPAL_BOOL);
 343         if (OPAL_SUCCESS == ret ) {
 344             orte_process_info.rm_session_dirs = bool_val;
 345         }
 346     }
 347     OPAL_TIMING_ENV_NEXT(rte_init, "pmix_set_tdirs");
 348 
 349     /* get our local peers */
 350     if (0 < orte_process_info.num_local_peers) {
 351         /* if my local rank if too high, then that's an error */
 352         if (orte_process_info.num_local_peers < orte_process_info.my_local_rank) {
 353             ret = ORTE_ERR_BAD_PARAM;
 354             error = "num local peers";
 355             goto error;
 356         }
 357         /* retrieve the local peers */
 358         OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
 359                               &wildcard_rank, &val, OPAL_STRING);
 360         if (OPAL_SUCCESS == ret && NULL != val) {
 361             peers = opal_argv_split(val, ',');
 362             free(val);
 363         } else {
 364             peers = NULL;
 365         }
 366     } else {
 367         peers = NULL;
 368     }
 369 
 370     /* set the locality */
 371     if (NULL != peers) {
 372         /* identify our location */
 373         val = NULL;
 374         OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
 375                                        ORTE_PROC_MY_NAME, &val, OPAL_STRING);
 376         if (OPAL_SUCCESS == ret && NULL != val) {
 377             mycpuset = val;
 378         } else {
 379             mycpuset = NULL;
 380         }
 381         pname.jobid = ORTE_PROC_MY_NAME->jobid;
 382         for (i=0; NULL != peers[i]; i++) {
 383             pname.vpid = strtoul(peers[i], NULL, 10);
 384             if (pname.vpid == ORTE_PROC_MY_NAME->vpid) {
 385                 /* we are fully local to ourselves */
 386                 u16 = OPAL_PROC_ALL_LOCAL;
 387             } else {
 388                 val = NULL;
 389                 OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
 390                                                &pname, &val, OPAL_STRING);
 391                 if (OPAL_SUCCESS == ret && NULL != val) {
 392                     u16 = opal_hwloc_compute_relative_locality(mycpuset, val);
 393                     free(val);
 394                 } else {
 395                     /* all we can say is that it shares our node */
 396                     u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
 397                 }
 398             }
 399             kv = OBJ_NEW(opal_value_t);
 400             kv->key = strdup(OPAL_PMIX_LOCALITY);
 401             kv->type = OPAL_UINT16;
 402             OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
 403                                  "%s ess:pmi:locality: proc %s locality %s",
 404                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 405                                  ORTE_NAME_PRINT(&pname), opal_hwloc_base_print_locality(u16)));
 406             kv->data.uint16 = u16;
 407             ret = opal_pmix.store_local(&pname, kv);
 408             if (OPAL_SUCCESS != ret) {
 409                 error = "local store of locality";
 410                 opal_argv_free(peers);
 411                 if (NULL != mycpuset) {
 412                     free(mycpuset);
 413                 }
 414                 goto error;
 415             }
 416             OBJ_RELEASE(kv);
 417         }
 418         opal_argv_free(peers);
 419         if (NULL != mycpuset) {
 420             free(mycpuset);
 421         }
 422     }
 423     OPAL_TIMING_ENV_NEXT(rte_init, "pmix_set_locality");
 424 
 425     /* now that we have all required info, complete the setup */
 426     /*
 427      * stdout/stderr buffering
 428      * If the user requested to override the default setting then do
 429      * as they wish.
 430      */
 431     if( orte_ess_base_std_buffering > -1 ) {
 432         if( 0 == orte_ess_base_std_buffering ) {
 433             setvbuf(stdout, NULL, _IONBF, 0);
 434             setvbuf(stderr, NULL, _IONBF, 0);
 435         }
 436         else if( 1 == orte_ess_base_std_buffering ) {
 437             setvbuf(stdout, NULL, _IOLBF, 0);
 438             setvbuf(stderr, NULL, _IOLBF, 0);
 439         }
 440         else if( 2 == orte_ess_base_std_buffering ) {
 441             setvbuf(stdout, NULL, _IOFBF, 0);
 442             setvbuf(stderr, NULL, _IOFBF, 0);
 443         }
 444     }
 445 
 446     /* if I am an MPI app, we will let the MPI layer define and
 447      * control the opal_proc_t structure. Otherwise, we need to
 448      * do so here */
 449     if (ORTE_PROC_NON_MPI) {
 450         orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME;
 451         orte_process_info.super.proc_hostname = orte_process_info.nodename;
 452         orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL;
 453         orte_process_info.super.proc_arch = opal_local_arch;
 454         opal_proc_local_set(&orte_process_info.super);
 455     }
 456 
 457     /* open and setup the state machine */
 458     if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
 459         ORTE_ERROR_LOG(ret);
 460         error = "orte_state_base_open";
 461         goto error;
 462     }
 463     if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
 464         ORTE_ERROR_LOG(ret);
 465         error = "orte_state_base_select";
 466         goto error;
 467     }
 468     OPAL_TIMING_ENV_NEXT(rte_init, "state_framework_open");
 469 
 470     /* open the errmgr */
 471     if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
 472         ORTE_ERROR_LOG(ret);
 473         error = "orte_errmgr_base_open";
 474         goto error;
 475     }
 476     OPAL_TIMING_ENV_NEXT(rte_init, "errmgr_framework_open");
 477 
 478     /* setup my session directory */
 479     if (orte_create_session_dirs) {
 480         OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output,
 481                              "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
 482                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 483                              (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
 484                              orte_process_info.nodename));
 485         if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
 486             ORTE_ERROR_LOG(ret);
 487             error = "orte_session_dir";
 488             goto error;
 489         }
 490         /* Once the session directory location has been established, set
 491            the opal_output env file location to be in the
 492            proc-specific session directory. */
 493         opal_output_set_output_file_info(orte_process_info.proc_session_dir,
 494                                          "output-", NULL, NULL);
 495         /* register the directory for cleanup */
 496         if (NULL != opal_pmix.register_cleanup) {
 497             if (orte_standalone_operation) {
 498                 if (OPAL_SUCCESS != (ret = opal_pmix.register_cleanup(orte_process_info.top_session_dir, true, false, true))) {
 499                     ORTE_ERROR_LOG(ret);
 500                     error = "register cleanup";
 501                     goto error;
 502                 }
 503             } else {
 504                 if (OPAL_SUCCESS != (ret = opal_pmix.register_cleanup(orte_process_info.job_session_dir, true, false, false))) {
 505                     ORTE_ERROR_LOG(ret);
 506                     error = "register cleanup";
 507                     goto error;
 508                 }
 509             }
 510         }
 511     }
 512     OPAL_TIMING_ENV_NEXT(rte_init, "create_session_dirs");
 513 
 514     /* if we have info on the HNP and local daemon, process it */
 515     if (NULL != orte_process_info.my_hnp_uri) {
 516         /* we have to set the HNP's name, even though we won't route messages directly
 517          * to it. This is required to ensure that we -do- send messages to the correct
 518          * HNP name
 519          */
 520         if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
 521                                                             ORTE_PROC_MY_HNP, NULL))) {
 522             ORTE_ERROR_LOG(ret);
 523             error = "orte_rml_parse_HNP";
 524             goto error;
 525         }
 526     }
 527     if (NULL != orte_process_info.my_daemon_uri) {
 528         opal_value_t val;
 529 
 530         /* extract the daemon's name so we can update the routing table */
 531         if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri,
 532                                                             ORTE_PROC_MY_DAEMON, NULL))) {
 533             ORTE_ERROR_LOG(ret);
 534             error = "orte_rml_parse_daemon";
 535             goto error;
 536         }
 537         /* Set the contact info in the database - this won't actually establish
 538          * the connection, but just tells us how to reach the daemon
 539          * if/when we attempt to send to it
 540          */
 541         OBJ_CONSTRUCT(&val, opal_value_t);
 542         val.key = OPAL_PMIX_PROC_URI;
 543         val.type = OPAL_STRING;
 544         val.data.string = orte_process_info.my_daemon_uri;
 545         if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_DAEMON, &val))) {
 546             ORTE_ERROR_LOG(ret);
 547             val.key = NULL;
 548             val.data.string = NULL;
 549             OBJ_DESTRUCT(&val);
 550             error = "store DAEMON URI";
 551             goto error;
 552         }
 553         val.key = NULL;
 554         val.data.string = NULL;
 555         OBJ_DESTRUCT(&val);
 556     }
 557 
 558     /* setup the errmgr */
 559     if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
 560         ORTE_ERROR_LOG(ret);
 561         error = "orte_errmgr_base_select";
 562         goto error;
 563     }
 564     OPAL_TIMING_ENV_NEXT(rte_init, "errmgr_select");
 565 
 566     /* setup process binding */
 567     if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) {
 568         error = "proc_binding";
 569         goto error;
 570     }
 571     OPAL_TIMING_ENV_NEXT(rte_init, "ess_base_proc_binding");
 572 
 573     /* this needs to be set to enable debugger use when direct launched */
 574     if (NULL == orte_process_info.my_daemon_uri) {
 575         orte_standalone_operation = true;
 576     }
 577 
 578     /* set max procs */
 579     if (orte_process_info.max_procs < orte_process_info.num_procs) {
 580         orte_process_info.max_procs = orte_process_info.num_procs;
 581     }
 582 
 583     /* push our hostname so others can find us, if they need to - the
 584      * native PMIx component will ignore this request as the hostname
 585      * is provided by the system */
 586     OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING);
 587     if (ORTE_SUCCESS != ret) {
 588         error = "db store hostname";
 589         goto error;
 590     }
 591 
 592     /* if we are an ORTE app - and not an MPI app - then
 593      * we need to exchange our connection info here.
 594      * MPI_Init has its own modex, so we don't need to do
 595      * two of them. However, if we don't do a modex at all,
 596      * then processes have no way to communicate
 597      *
 598      * NOTE: only do this when the process originally launches.
 599      * Cannot do this on a restart as the rest of the processes
 600      * in the job won't be executing this step, so we would hang
 601      */
 602     if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
 603         /* need to commit the data before we fence */
 604         opal_pmix.commit();
 605         if (ORTE_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
 606             error = "opal_pmix.fence() failed";
 607             goto error;
 608         }
 609     }
 610     OPAL_TIMING_ENV_NEXT(rte_init, "rte_init_done");
 611 
 612     return ORTE_SUCCESS;
 613 
 614  error:
 615     if (!progress_thread_running) {
 616         /* can't send the help message, so ensure it
 617          * comes out locally
 618          */
 619         orte_show_help_finalize();
 620     }
 621     if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
 622         orte_show_help("help-orte-runtime.txt",
 623                        "orte_init:startup:internal-failure",
 624                        true, error, ORTE_ERROR_NAME(ret), ret);
 625     }
 626     return ret;
 627 }
 628 
 629 static int rte_finalize(void)
 630 {
 631     /* remove the envars that we pushed into environ
 632      * so we leave that structure intact
 633      */
 634     if (added_transport_keys) {
 635         unsetenv(OPAL_MCA_PREFIX"orte_precondition_transports");
 636     }
 637     if (added_num_procs) {
 638         unsetenv(OPAL_MCA_PREFIX"orte_ess_num_procs");
 639     }
 640     if (added_app_ctx) {
 641         unsetenv("OMPI_APP_CTX_NUM_PROCS");
 642     }
 643 
 644     /* close frameworks */
 645     (void) mca_base_framework_close(&orte_filem_base_framework);
 646     (void) mca_base_framework_close(&orte_errmgr_base_framework);
 647 
 648     if (NULL != opal_pmix.finalize) {
 649         opal_pmix.finalize();
 650         (void) mca_base_framework_close(&opal_pmix_base_framework);
 651     }
 652     (void) mca_base_framework_close(&orte_state_base_framework);
 653 
 654     if (direct_launched) {
 655         orte_session_dir_finalize(ORTE_PROC_MY_NAME);
 656     }
 657     /* cleanup the process info */
 658     orte_proc_info_finalize();
 659 
 660     /* release the event base */
 661     if (progress_thread_running) {
 662         opal_progress_thread_finalize(NULL);
 663         progress_thread_running = false;
 664     }
 665     return ORTE_SUCCESS;
 666 }
 667 
 668 static void rte_abort(int status, bool report)
 669 {
 670     struct timespec tp = {0, 100000};
 671 
 672     OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
 673                          "%s ess:pmi:abort: abort with status %d",
 674                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 675                          status));
 676 
 677     /* PMI doesn't like NULL messages, but our interface
 678      * doesn't provide one - so rig one up here
 679      */
 680     opal_pmix.abort(status, "N/A", NULL);
 681 
 682     /* provide a little delay for the PMIx thread to
 683      * get the info out */
 684     nanosleep(&tp, NULL);
 685 
 686     /* Now Exit */
 687     _exit(status);
 688 }

/* [<][>][^][v][top][bottom][index][help] */