root/opal/runtime/opal_init.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. opal_err2str
  2. opal_init_psm
  3. opal_init_error
  4. opal_init_util
  5. opal_init

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2005 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2007-2016 Cisco Systems, Inc.  All rights reserved.
  14  * Copyright (c) 2007      Sun Microsystems, Inc.  All rights reserved.
  15  * Copyright (c) 2009      Oak Ridge National Labs.  All rights reserved.
  16  * Copyright (c) 2010-2015 Los Alamos National Security, LLC.
  17  *                         All rights reserved.
  18  * Copyright (c) 2013-2019 Intel, Inc.  All rights reserved.
  19  * Copyright (c) 2015-2017 Research Organization for Information Science
  20  *                         and Technology (RIST). All rights reserved.
  21  * Copyright (c) 2017      Amazon.com, Inc. or its affiliates.
  22  *                         All Rights reserved.
  23  * Copyright (c) 2018      Mellanox Technologies, Inc.
  24  *                         All rights reserved.
  25  * Copyright (c) 2018-2019 Triad National Security, LLC. All rights
  26  *                         reserved.
  27  * $COPYRIGHT$
  28  *
  29  * Additional copyrights may follow
  30  *
  31  * $HEADER$
  32  */
  33 
  34 /** @file **/
  35 
  36 #ifdef HAVE_UNISTD_H
  37 #include <unistd.h>
  38 #endif
  39 
  40 #include "opal_config.h"
  41 
  42 #include "opal/util/malloc.h"
  43 #include "opal/util/arch.h"
  44 #include "opal/util/output.h"
  45 #include "opal/util/show_help.h"
  46 #include "opal/util/proc.h"
  47 #include "opal/memoryhooks/memory.h"
  48 #include "opal/mca/base/base.h"
  49 #include "opal/mca/base/mca_base_var.h"
  50 #include "opal/runtime/opal.h"
  51 #include "opal/util/net.h"
  52 #include "opal/datatype/opal_datatype.h"
  53 #include "opal/mca/installdirs/base/base.h"
  54 #include "opal/mca/memory/base/base.h"
  55 #include "opal/mca/patcher/base/base.h"
  56 #include "opal/mca/memcpy/base/base.h"
  57 #include "opal/mca/hwloc/base/base.h"
  58 #include "opal/mca/reachable/base/base.h"
  59 #include "opal/mca/timer/base/base.h"
  60 #include "opal/mca/memchecker/base/base.h"
  61 #include "opal/mca/if/base/base.h"
  62 #include "opal/dss/dss.h"
  63 #include "opal/mca/shmem/base/base.h"
  64 #include "opal/mca/compress/base/base.h"
  65 #include "opal/threads/threads.h"
  66 #include "opal/threads/tsd.h"
  67 
  68 #include "opal/runtime/opal_cr.h"
  69 #include "opal/mca/crs/base/base.h"
  70 
  71 #include "opal/runtime/opal_progress.h"
  72 #include "opal/mca/event/base/base.h"
  73 #include "opal/mca/backtrace/base/base.h"
  74 
  75 #include "opal/constants.h"
  76 #include "opal/util/error.h"
  77 #include "opal/util/stacktrace.h"
  78 #include "opal/util/keyval_parse.h"
  79 #include "opal/util/sys_limits.h"
  80 #include "opal/util/timings.h"
  81 
  82 #if OPAL_CC_USE_PRAGMA_IDENT
  83 #pragma ident OPAL_IDENT_STRING
  84 #elif OPAL_CC_USE_IDENT
  85 #ident OPAL_IDENT_STRING
  86 #endif
   87 const char opal_version_string[] = OPAL_IDENT_STRING; /* identification string, initialized from OPAL_IDENT_STRING */
   88 
      /* Call counter for opal_init(): incremented on every call; only the
         call that brings it to 1 runs the initialization sequence. */
   89 int opal_initialized = 0;
      /* Set to true by opal_init_util() when it performs initialization. */
   90 bool opal_init_called = false;
      /* Call counter for opal_init_util(); same scheme as opal_initialized. */
   91 int opal_util_initialized = 0;
   92 /* We have to put a guess in here in case hwloc is not available.  If
   93    hwloc is available, this value will be overwritten when the
   94    hwloc data is loaded. */
   95 int opal_cache_line_size = 128;
      /* NOTE(review): not read anywhere in this file; presumably enables a
         warning when the process forks -- confirm against its consumers. */
   96 bool opal_warn_on_fork = true;
  97 
  98 static int
  99 opal_err2str(int errnum, const char **errmsg)
 100 {
 101     const char *retval;
 102 
 103     switch (errnum) {
 104     case OPAL_SUCCESS:
 105         retval = "Success";
 106         break;
 107     case OPAL_ERROR:
 108         retval = "Error";
 109         break;
 110     case OPAL_ERR_OUT_OF_RESOURCE:
 111         retval = "Out of resource";
 112         break;
 113     case OPAL_ERR_TEMP_OUT_OF_RESOURCE:
 114         retval = "Temporarily out of resource";
 115         break;
 116     case OPAL_ERR_RESOURCE_BUSY:
 117         retval = "Resource busy";
 118         break;
 119     case OPAL_ERR_BAD_PARAM:
 120         retval = "Bad parameter";
 121         break;
 122     case OPAL_ERR_FATAL:
 123         retval = "Fatal";
 124         break;
 125     case OPAL_ERR_NOT_IMPLEMENTED:
 126         retval = "Not implemented";
 127         break;
 128     case OPAL_ERR_NOT_SUPPORTED:
 129         retval = "Not supported";
 130         break;
 131     case OPAL_ERR_INTERRUPTED:
 132         retval = "Interrupted";
 133         break;
 134     case OPAL_ERR_WOULD_BLOCK:
 135         retval = "Would block";
 136         break;
 137     case OPAL_ERR_IN_ERRNO:
 138         retval = "In errno";
 139         break;
 140     case OPAL_ERR_UNREACH:
 141         retval = "Unreachable";
 142         break;
 143     case OPAL_ERR_NOT_FOUND:
 144         retval = "Not found";
 145         break;
 146     case OPAL_EXISTS:
 147         retval = "Exists";
 148         break;
 149     case OPAL_ERR_TIMEOUT:
 150         retval = "Timeout";
 151         break;
 152     case OPAL_ERR_NOT_AVAILABLE:
 153         retval = "Not available";
 154         break;
 155     case OPAL_ERR_PERM:
 156         retval = "No permission";
 157         break;
 158     case OPAL_ERR_VALUE_OUT_OF_BOUNDS:
 159         retval = "Value out of bounds";
 160         break;
 161     case OPAL_ERR_FILE_READ_FAILURE:
 162         retval = "File read failure";
 163         break;
 164     case OPAL_ERR_FILE_WRITE_FAILURE:
 165         retval = "File write failure";
 166         break;
 167     case OPAL_ERR_FILE_OPEN_FAILURE:
 168         retval = "File open failure";
 169         break;
 170     case OPAL_ERR_PACK_MISMATCH:
 171         retval = "Pack data mismatch";
 172         break;
 173     case OPAL_ERR_PACK_FAILURE:
 174         retval = "Data pack failed";
 175         break;
 176     case OPAL_ERR_UNPACK_FAILURE:
 177         retval = "Data unpack failed";
 178         break;
 179     case OPAL_ERR_UNPACK_INADEQUATE_SPACE:
 180         retval = "Data unpack had inadequate space";
 181         break;
 182     case OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER:
 183         retval = "Data unpack would read past end of buffer";
 184         break;
 185     case OPAL_ERR_OPERATION_UNSUPPORTED:
 186         retval = "Requested operation is not supported on referenced data type";
 187         break;
 188     case OPAL_ERR_UNKNOWN_DATA_TYPE:
 189         retval = "Unknown data type";
 190         break;
 191     case OPAL_ERR_BUFFER:
 192         retval = "Buffer type (described vs non-described) mismatch - operation not allowed";
 193         break;
 194     case OPAL_ERR_DATA_TYPE_REDEF:
 195         retval = "Attempt to redefine an existing data type";
 196         break;
 197     case OPAL_ERR_DATA_OVERWRITE_ATTEMPT:
 198         retval = "Attempt to overwrite a data value";
 199         break;
 200     case OPAL_ERR_MODULE_NOT_FOUND:
 201         retval = "Framework requires at least one active module, but none found";
 202         break;
 203     case OPAL_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
 204         retval = "OS topology does not support slot_list process affinity";
 205         break;
 206     case OPAL_ERR_TOPO_SOCKET_NOT_SUPPORTED:
 207         retval = "Could not obtain socket topology information";
 208         break;
 209     case OPAL_ERR_TOPO_CORE_NOT_SUPPORTED:
 210         retval = "Could not obtain core topology information";
 211         break;
 212     case OPAL_ERR_NOT_ENOUGH_SOCKETS:
 213         retval = "Not enough sockets to meet request";
 214         break;
 215     case OPAL_ERR_NOT_ENOUGH_CORES:
 216         retval = "Not enough cores to meet request";
 217         break;
 218     case OPAL_ERR_INVALID_PHYS_CPU:
 219         retval = "Invalid physical cpu number returned";
 220         break;
 221     case OPAL_ERR_MULTIPLE_AFFINITIES:
 222         retval = "Multiple methods for assigning process affinity were specified";
 223         break;
 224     case OPAL_ERR_SLOT_LIST_RANGE:
 225         retval = "Provided slot_list range is invalid";
 226         break;
 227     case OPAL_ERR_NETWORK_NOT_PARSEABLE:
 228         retval = "Provided network specification is not parseable";
 229         break;
 230     case OPAL_ERR_SILENT:
 231         retval = NULL;
 232         break;
 233     case OPAL_ERR_NOT_INITIALIZED:
 234         retval = "Not initialized";
 235         break;
 236     case OPAL_ERR_NOT_BOUND:
 237         retval = "Not bound";
 238         break;
 239     case OPAL_ERR_TAKE_NEXT_OPTION:
 240         retval = "Take next option";
 241         break;
 242     case OPAL_ERR_PROC_ENTRY_NOT_FOUND:
 243         retval = "Database entry not found";
 244         break;
 245     case OPAL_ERR_DATA_VALUE_NOT_FOUND:
 246         retval = "Data for specified key not found";
 247         break;
 248     case OPAL_ERR_CONNECTION_FAILED:
 249         retval = "Connection failed";
 250         break;
 251     case OPAL_ERR_AUTHENTICATION_FAILED:
 252         retval = "Authentication failed";
 253         break;
 254     case OPAL_ERR_COMM_FAILURE:
 255         retval = "Comm failure";
 256         break;
 257     case OPAL_ERR_SERVER_NOT_AVAIL:
 258         retval = "Server not available";
 259         break;
 260     case OPAL_ERR_IN_PROCESS:
 261         retval = "Operation in process";
 262         break;
 263     case OPAL_ERR_DEBUGGER_RELEASE:
 264         retval = "Release debugger";
 265         break;
 266     case OPAL_ERR_HANDLERS_COMPLETE:
 267         retval = "Event handlers complete";
 268         break;
 269     case OPAL_ERR_PARTIAL_SUCCESS:
 270         retval = "Partial success";
 271         break;
 272     case OPAL_ERR_PROC_ABORTED:
 273         retval = "Process abnormally terminated";
 274         break;
 275     case OPAL_ERR_PROC_REQUESTED_ABORT:
 276         retval = "Process requested abort";
 277         break;
 278     case OPAL_ERR_PROC_ABORTING:
 279         retval = "Process is aborting";
 280         break;
 281     case OPAL_ERR_NODE_DOWN:
 282         retval = "Node has gone down";
 283         break;
 284     case OPAL_ERR_NODE_OFFLINE:
 285         retval = "Node has gone offline";
 286         break;
 287     case OPAL_ERR_JOB_TERMINATED:
 288         retval = "Job terminated";
 289         break;
 290     case OPAL_ERR_PROC_RESTART:
 291         retval = "Process restarted";
 292         break;
 293     case OPAL_ERR_PROC_CHECKPOINT:
 294         retval = "Process checkpoint";
 295         break;
 296     case OPAL_ERR_PROC_MIGRATE:
 297         retval = "Process migrate";
 298         break;
 299     case OPAL_ERR_EVENT_REGISTRATION:
 300         retval = "Event registration";
 301         break;
 302     case OPAL_ERR_HEARTBEAT_ALERT:
 303         retval = "Heartbeat not received";
 304         break;
 305     case OPAL_ERR_FILE_ALERT:
 306         retval = "File alert - proc may have stalled";
 307         break;
 308     case OPAL_ERR_MODEL_DECLARED:
 309         retval = "Model declared";
 310         break;
 311     case OPAL_PMIX_LAUNCH_DIRECTIVE:
 312         retval = "Launch directive";
 313         break;
 314 
 315     default:
 316         retval = "UNRECOGNIZED";
 317     }
 318 
 319     *errmsg = retval;
 320     return OPAL_SUCCESS;
 321 }
 322 
 323 
 324 int opal_init_psm(void)
 325 {
 326     /* Very early in the init sequence -- before *ANY* MCA components
 327        are opened -- we need to disable some behavior from the PSM and
 328        PSM2 libraries (by default): at least some old versions of
 329        these libraries hijack signal handlers during their library
 330        constructors and then do not un-hijack them when the libraries
 331        are unloaded.
 332 
 333        It is a bit of an abstraction break that we have to put
 334        vendor/transport-specific code in the OPAL core, but we're
 335        out of options, unfortunately.
 336 
 337        NOTE: We only disable this behavior if the corresponding
 338        environment variables are not already set (i.e., if the
 339        user/environment has indicated a preference for this behavior,
 340        we won't override it). */
 341     if (NULL == getenv("IPATH_NO_BACKTRACE")) {
 342         opal_setenv("IPATH_NO_BACKTRACE", "1", true, &environ);
 343     }
 344     if (NULL == getenv("HFI_NO_BACKTRACE")) {
 345         opal_setenv("HFI_NO_BACKTRACE", "1", true, &environ);
 346     }
 347 
 348     return OPAL_SUCCESS;
 349 }
 350 
 351 static int opal_init_error (const char *error, int ret)
 352 {
 353     if (OPAL_ERR_SILENT != ret) {
 354         opal_show_help( "help-opal-runtime.txt",
 355                         "opal_init:startup:internal-failure", true,
 356                         error, ret );
 357     }
 358     return ret;
 359 }
 360 
 361 static mca_base_framework_t *opal_init_util_frameworks[] = {
 362     &opal_installdirs_base_framework, &opal_if_base_framework, NULL,
 363 };
 364 
 365 int
 366 opal_init_util(int* pargc, char*** pargv)
 367 {
 368     int ret;
 369     char *error = NULL;
 370     char hostname[OPAL_MAXHOSTNAMELEN];
 371     OPAL_TIMING_ENV_INIT(otmng);
 372 
 373     if( ++opal_util_initialized != 1 ) {
 374         if( opal_util_initialized < 1 ) {
 375             return OPAL_ERROR;
 376         }
 377         return OPAL_SUCCESS;
 378     }
 379 
 380 
 381     OBJ_CONSTRUCT(&opal_init_util_domain, opal_finalize_domain_t);
 382     (void) opal_finalize_domain_init (&opal_init_util_domain, "opal_init_util");
 383     opal_finalize_set_domain (&opal_init_util_domain);
 384 
 385     opal_thread_set_main();
 386 
 387     opal_init_called = true;
 388 
 389     /* set the nodename right away so anyone who needs it has it. Note
 390      * that we don't bother with fqdn and prefix issues here - we let
 391      * the RTE later replace this with a modified name if the user
 392      * requests it */
 393     gethostname(hostname, sizeof(hostname));
 394     opal_process_info.nodename = strdup(hostname);
 395 
 396     /* initialize the memory allocator */
 397     opal_malloc_init();
 398 
 399     OPAL_TIMING_ENV_NEXT(otmng, "opal_malloc_init");
 400 
 401     /* initialize the output system */
 402     opal_output_init();
 403 
 404     /* initialize install dirs code */
 405     if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_installdirs_base_framework, 0))) {
 406         fprintf(stderr, "opal_installdirs_base_open() failed -- process will likely abort (%s:%d, returned %d instead of OPAL_SUCCESS)\n",
 407                 __FILE__, __LINE__, ret);
 408         return ret;
 409     }
 410 
 411     /* initialize the help system */
 412     opal_show_help_init();
 413 
 414     OPAL_TIMING_ENV_NEXT(otmng, "opal_show_help_init");
 415 
 416     /* register handler for errnum -> string converstion */
 417     if (OPAL_SUCCESS !=
 418         (ret = opal_error_register("OPAL",
 419                                    OPAL_ERR_BASE, OPAL_ERR_MAX, opal_err2str))) {
 420         return opal_init_error ("opal_error_register", ret);
 421     }
 422 
 423     /* keyval lex-based parser */
 424     if (OPAL_SUCCESS != (ret = opal_util_keyval_parse_init())) {
 425         return opal_init_error ("opal_util_keyval_parse_init", ret);
 426     }
 427 
 428     // Disable PSM signal hijacking (see comment in function for more
 429     // details)
 430     opal_init_psm();
 431 
 432     OPAL_TIMING_ENV_NEXT(otmng, "opal_init_psm");
 433 
 434     /* Setup the parameter system */
 435     if (OPAL_SUCCESS != (ret = mca_base_var_init())) {
 436         return opal_init_error ("mca_base_var_init", ret);
 437     }
 438     OPAL_TIMING_ENV_NEXT(otmng, "opal_var_init");
 439 
 440     /* read any param files that were provided */
 441     if (OPAL_SUCCESS != (ret = mca_base_var_cache_files(false))) {
 442         return opal_init_error ("failed to cache files", ret);
 443     }
 444 
 445     OPAL_TIMING_ENV_NEXT(otmng, "opal_var_cache");
 446 
 447 
 448     /* register params for opal */
 449     if (OPAL_SUCCESS != (ret = opal_register_params())) {
 450         return opal_init_error ("opal_register_params", ret);
 451     }
 452 
 453     if (OPAL_SUCCESS != (ret = opal_net_init())) {
 454         return opal_init_error ("opal_net_init", ret);
 455     }
 456 
 457     OPAL_TIMING_ENV_NEXT(otmng, "opal_net_init");
 458 
 459     /* pretty-print stack handlers */
 460     if (OPAL_SUCCESS != (ret = opal_util_register_stackhandlers())) {
 461         return opal_init_error ("opal_util_register_stackhandlers", ret);
 462     }
 463 
 464     /* set system resource limits - internally protected against
 465      * doing so twice in cases where the launch agent did it for us
 466      */
 467     if (OPAL_SUCCESS != (ret = opal_util_init_sys_limits(&error))) {
 468         opal_show_help("help-opal-runtime.txt",
 469                         "opal_init:syslimit", false,
 470                         error);
 471         return OPAL_ERR_SILENT;
 472     }
 473 
 474     /* initialize the arch string */
 475     if (OPAL_SUCCESS != (ret = opal_arch_init ())) {
 476         return opal_init_error ("opal_arch_init", ret);
 477     }
 478 
 479     OPAL_TIMING_ENV_NEXT(otmng, "opal_arch_init");
 480 
 481     /* initialize the datatype engine */
 482     if (OPAL_SUCCESS != (ret = opal_datatype_init ())) {
 483         return opal_init_error ("opal_datatype_init", ret);
 484     }
 485 
 486     OPAL_TIMING_ENV_NEXT(otmng, "opal_datatype_init");
 487 
 488     /* Initialize the data storage service. */
 489     if (OPAL_SUCCESS != (ret = opal_dss_open())) {
 490         return opal_init_error ("opal_dss_open", ret);
 491     }
 492 
 493     OPAL_TIMING_ENV_NEXT(otmng, "opal_dss_open");
 494 
 495     /* initialize the mca */
 496     if (OPAL_SUCCESS != (ret = mca_base_open())) {
 497         return opal_init_error ("mca_base_open", ret);
 498     }
 499 
 500     OPAL_TIMING_ENV_NEXT(otmng, "mca_base_open");
 501 
 502     /* initialize if framework */
 503     if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_if_base_framework, 0))) {
 504         fprintf(stderr, "opal_if_base_open() failed -- process will likely abort (%s:%d, returned %d instead of OPAL_SUCCESS)\n",
 505                 __FILE__, __LINE__, ret);
 506         return ret;
 507     }
 508 
 509     /* register for */
 510     opal_finalize_register_cleanup_arg (mca_base_framework_close_list, opal_init_util_frameworks);
 511 
 512     OPAL_TIMING_ENV_NEXT(otmng, "opal_if_init");
 513 
 514     return OPAL_SUCCESS;
 515 }
 516 
 517 
 518 /* the memcpy component should be one of the first who get
 519  * loaded in order to make sure we have all the available
 520  * versions of memcpy correctly configured.
 521  */
 522 static mca_base_framework_t *opal_init_frameworks[] = {
 523     &opal_hwloc_base_framework, &opal_memcpy_base_framework, &opal_memchecker_base_framework,
 524     &opal_backtrace_base_framework, &opal_timer_base_framework, &opal_event_base_framework,
 525     &opal_shmem_base_framework, &opal_reachable_base_framework, &opal_compress_base_framework,
 526     NULL,
 527 };
 528 
 529 int
 530 opal_init(int* pargc, char*** pargv)
 531 {
 532     int ret;
 533 
 534     if( ++opal_initialized != 1 ) {
 535         if( opal_initialized < 1 ) {
 536             return OPAL_ERROR;
 537         }
 538         return OPAL_SUCCESS;
 539     }
 540 
 541     /* initialize util code */
 542     if (OPAL_SUCCESS != (ret = opal_init_util(pargc, pargv))) {
 543         return ret;
 544     }
 545 
 546     OBJ_CONSTRUCT(&opal_init_domain, opal_finalize_domain_t);
 547     (void) opal_finalize_domain_init (&opal_init_domain, "opal_init");
 548     opal_finalize_set_domain (&opal_init_domain);
 549 
 550     opal_finalize_register_cleanup_arg (mca_base_framework_close_list, opal_init_frameworks);
 551     opal_finalize_register_cleanup (opal_tsd_keys_destruct);
 552 
 553     ret = mca_base_framework_open_list (opal_init_frameworks, 0);
 554     if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
 555         return opal_init_error ("opal_init framework open", ret);
 556     }
 557 
 558     /* initialize the memory manager / tracker */
 559     if (OPAL_SUCCESS != (ret = opal_mem_hooks_init())) {
 560         return opal_init_error ("opal_mem_hooks_init", ret);
 561     }
 562 
 563     /* select the memory checker */
 564     if (OPAL_SUCCESS != (ret = opal_memchecker_base_select())) {
 565         return opal_init_error ("opal_memchecker_base_select", ret);
 566     }
 567 
 568     /*
 569      * Initialize the general progress engine
 570      */
 571     if (OPAL_SUCCESS != (ret = opal_progress_init())) {
 572         return opal_init_error ("opal_progress_init", ret);
 573     }
 574     /* we want to tick the event library whenever possible */
 575     opal_progress_event_users_increment();
 576 
 577     /* setup the shmem framework */
 578     if (OPAL_SUCCESS != (ret = opal_shmem_base_select())) {
 579         return opal_init_error ("opal_shmem_base_select", ret);
 580     }
 581 
 582     /* Intitialize reachable framework */
 583     if (OPAL_SUCCESS != (ret = opal_reachable_base_select())) {
 584         return opal_init_error ("opal_reachable_base_select", ret);
 585     }
 586 
 587     /* Intitialize compress framework */
 588     if (OPAL_SUCCESS != (ret = opal_compress_base_select())) {
 589         return opal_init_error ("opal_compress_base_select", ret);
 590     }
 591 
 592     return OPAL_SUCCESS;
 593 }

/* [<][>][^][v][top][bottom][index][help] */