root/ompi/mca/rte/orte/rte_orte_module.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. ompi_rte_abort
  2. _release_fn
  3. _register_fn
  4. ompi_rte_wait_for_debugger
  5. ompi_rte_connect_accept_support

   1 /*
   2  * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
   3  *                         All rights reserved.
   4  * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
   5  * Copyright (c) 2012-2014 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2014      Cisco Systems, Inc.  All rights reserved.
   9  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
  10  * $COPYRIGHT$
  11  */
  12 #include "ompi_config.h"
  13 #include "ompi/constants.h"
  14 
  15 #include <string.h>
  16 #include <stdio.h>
  17 #include <ctype.h>
  18 
  19 #include "opal/dss/dss.h"
  20 #include "opal/util/argv.h"
  21 #include "opal/util/proc.h"
  22 #include "opal/util/opal_getcwd.h"
  23 #include "opal/util/printf.h"
  24 #include "opal/mca/pmix/pmix.h"
  25 #include "opal/threads/threads.h"
  26 #include "opal/class/opal_list.h"
  27 #include "opal/dss/dss.h"
  28 
  29 #include "orte/mca/errmgr/errmgr.h"
  30 #include "orte/mca/ess/ess.h"
  31 #include "orte/mca/grpcomm/base/base.h"
  32 #include "orte/mca/odls/odls.h"
  33 #include "orte/mca/plm/plm.h"
  34 #include "orte/mca/rml/rml.h"
  35 #include "orte/mca/rml/rml_types.h"
  36 #include "orte/mca/rmaps/rmaps.h"
  37 #include "orte/mca/rmaps/rmaps_types.h"
  38 #include "orte/mca/rmaps/base/base.h"
  39 #include "orte/mca/rml/base/rml_contact.h"
  40 #include "orte/mca/state/state.h"
  41 #include "orte/mca/routed/routed.h"
  42 #include "orte/util/name_fns.h"
  43 #include "orte/util/session_dir.h"
  44 #include "orte/util/show_help.h"
  45 #include "orte/runtime/orte_globals.h"
  46 #include "orte/runtime/orte_wait.h"
  47 #include "orte/runtime/orte_data_server.h"
  48 
  49 #include "ompi/mca/rte/base/base.h"
  50 #include "ompi/mca/rte/rte.h"
  51 #include "ompi/debuggers/debuggers.h"
  52 #include "ompi/proc/proc.h"
  53 #include "ompi/runtime/params.h"
  54 #include "ompi/communicator/communicator.h"
  55 
  56 extern ompi_rte_component_t mca_rte_orte_component;
  57 
  58 void ompi_rte_abort(int error_code, char *fmt, ...)
  59 {
  60     va_list arglist;
  61 
  62     /* If there was a message, output it */
  63     va_start(arglist, fmt);
  64     if( NULL != fmt ) {
  65         char* buffer = NULL;
  66         opal_vasprintf( &buffer, fmt, arglist );
  67         opal_output( 0, "%s", buffer );
  68         free( buffer );
  69     }
  70     va_end(arglist);
  71 
  72     /* if I am a daemon or the HNP... */
  73     if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
  74         /* whack my local procs */
  75         orte_odls.kill_local_procs(NULL);
  76         /* whack any session directories */
  77         orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
  78     } else {
  79         /* cleanup my session directory */
  80         orte_session_dir_finalize(ORTE_PROC_MY_NAME);
  81     }
  82 
  83     /* if a critical connection failed, or a sensor limit was exceeded, exit without dropping a core */
  84     if (ORTE_ERR_CONNECTION_FAILED == error_code ||
  85         ORTE_ERR_SENSOR_LIMIT_EXCEEDED == error_code) {
  86         orte_ess.abort(error_code, false);
  87     } else {
  88         orte_ess.abort(error_code, true);
  89     }
  90 
  91     /*
  92      * We must exit in orte_ess.abort; all implementations of orte_ess.abort
  93      * contain __opal_attribute_noreturn__
  94      */
  95     /* No way to reach here, but put an exit() here a) just to cover
  96        for bugs, and b) to let the compiler know we're honoring the
  97        __opal_attribute_noreturn__. */
  98     exit(-1);
  99 }
 100 
 101 static size_t handler = SIZE_MAX;
 102 static bool debugger_register_active = true;
 103 static bool debugger_event_active = true;
 104 
 105 static void _release_fn(int status,
 106                         const opal_process_name_t *source,
 107                         opal_list_t *info, opal_list_t *results,
 108                         opal_pmix_notification_complete_fn_t cbfunc,
 109                         void *cbdata)
 110 {
 111     /* must let the notifier know we are done */
 112     if (NULL != cbfunc) {
 113         cbfunc(ORTE_SUCCESS, NULL, NULL, NULL, cbdata);
 114     }
 115     debugger_event_active = false;
 116 }
 117 
 118 static void _register_fn(int status,
 119                          size_t evhandler_ref,
 120                          void *cbdata)
 121 {
 122     opal_list_t *codes = (opal_list_t*)cbdata;
 123 
 124     handler = evhandler_ref;
 125     OPAL_LIST_RELEASE(codes);
 126     debugger_register_active = false;
 127 }
 128 
 129 /*
 130  * Wait for a debugger if asked.  We support two ways of waiting for
 131  * attaching debuggers -- see big comment in
 132  * orte/tools/orterun/debuggers.c explaining the two scenarios.
 133  */
 134 void ompi_rte_wait_for_debugger(void)
 135 {
 136     int debugger;
 137     opal_list_t *codes, directives;
 138     opal_value_t *kv;
 139     char *evar;
 140     int time;
 141 
 142     /* See lengthy comment in orte/tools/orterun/debuggers.c about
 143        orte_in_parallel_debugger */
 144     debugger = orte_in_parallel_debugger;
 145 
 146     if (1 == MPIR_being_debugged) {
 147         debugger = 1;
 148     }
 149 
 150     if (!debugger && NULL == getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
 151         /* if not, just return */
 152         return;
 153     }
 154 
 155     /* if we are being debugged, then we need to find
 156      * the correct plug-ins
 157      */
 158     ompi_debugger_setup_dlls();
 159 
 160     if (NULL != (evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"))) {
 161         time = strtol(evar, NULL, 10);
 162         sleep(time);
 163         return;
 164     }
 165 
 166     if (orte_standalone_operation) {
 167         /* spin until debugger attaches and releases us */
 168         while (MPIR_debug_gate == 0) {
 169 #if defined(HAVE_USLEEP)
 170             usleep(100000); /* microseconds */
 171 #else
 172             sleep(1);       /* seconds */
 173 #endif
 174         }
 175     } else {
 176 
 177         /* register an event handler for the ORTE_ERR_DEBUGGER_RELEASE event */
 178         codes = OBJ_NEW(opal_list_t);
 179         kv = OBJ_NEW(opal_value_t);
 180         kv->key = strdup("errorcode");
 181         kv->type = OPAL_INT;
 182         kv->data.integer = ORTE_ERR_DEBUGGER_RELEASE;
 183         opal_list_append(codes, &kv->super);
 184 
 185         OBJ_CONSTRUCT(&directives, opal_list_t);
 186         kv = OBJ_NEW(opal_value_t);
 187         kv->key = strdup(OPAL_PMIX_EVENT_HDLR_NAME);
 188         kv->type = OPAL_STRING;
 189         kv->data.string = strdup("MPI-DEBUGGER-ATTACH");
 190         opal_list_append(&directives, &kv->super);
 191 
 192         opal_pmix.register_evhandler(codes, &directives, _release_fn, _register_fn, codes);
 193         /* let the MPI progress engine run while we wait for registration to complete */
 194         OMPI_WAIT_FOR_COMPLETION(debugger_register_active);
 195         OPAL_LIST_DESTRUCT(&directives);
 196 
 197         /* let the MPI progress engine run while we wait for debugger release */
 198         OMPI_WAIT_FOR_COMPLETION(debugger_event_active);
 199 
 200         /* deregister the event handler */
 201         opal_pmix.deregister_evhandler(handler, NULL, NULL);
 202     }
 203 }
 204 
 205 bool ompi_rte_connect_accept_support(const char *port)
 206 {
 207     char *ptr, *tmp;
 208     orte_process_name_t name;
 209 
 210     /* were we launched by mpirun, or are we calling
 211      * without a defined port? */
 212     if (NULL == orte_process_info.my_hnp_uri ||
 213         NULL == port || 0 == strlen(port)) {
 214         return true;
 215     }
 216 
 217     /* is the job family in the port different than my own? */
 218     tmp = strdup(port);  // protect input
 219     if (NULL == (ptr = strchr(tmp, ':'))) {
 220         /* this port didn't come from us! */
 221         orte_show_help("help-orterun.txt", "orterun:malformedport", true);
 222         free(tmp);
 223         return false;
 224     }
 225     *ptr = '\0';
 226     if (ORTE_SUCCESS != orte_util_convert_string_to_process_name(&name, tmp)) {
 227         free(tmp);
 228         orte_show_help("help-orterun.txt", "orterun:malformedport", true);
 229         return false;
 230     }
 231     free(tmp);
 232     if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) == ORTE_JOB_FAMILY(name.jobid)) {
 233         /* same job family, so our infrastructure is adequate */
 234         return true;
 235     }
 236 
 237     /* if the job family of the port is different than our own
 238      * and we were launched by mpirun, then we require ompi-server
 239      * support */
 240     if (NULL == orte_data_server_uri) {
 241         /* print a pretty help message */
 242         orte_show_help("help-orterun.txt", "orterun:server-unavailable", true);
 243         return false;
 244     }
 245 
 246     return true;
 247 }

/* [<][>][^][v][top][bottom][index][help] */