root/orte/mca/oob/base/oob_base_stubs.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. orte_oob_base_send_nb
  2. orte_oob_base_get_addr
  3. process_uri
  4. orte_oob_base_ft_event

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
   4  *                         reserved.
   5  * Copyright (c) 2013-2019 Intel, Inc.  All rights reserved.
   6  * $COPYRIGHT$
   7  *
   8  * Additional copyrights may follow
   9  *
  10  * $HEADER$
  11  */
  12 
  13 
  14 #include "orte_config.h"
  15 #include "orte/constants.h"
  16 
  17 #include "opal/util/output.h"
  18 #include "opal/mca/pmix/pmix.h"
  19 #include "opal/util/argv.h"
  20 #include "opal/util/printf.h"
  21 
  22 #include "orte/mca/errmgr/errmgr.h"
  23 #include "orte/mca/state/state.h"
  24 #include "orte/mca/rml/rml.h"
  25 #include "orte/util/threads.h"
  26 #include "orte/mca/oob/base/base.h"
  27 #if OPAL_ENABLE_FT_CR == 1
  28 #include "orte/mca/state/base/base.h"
  29 #endif
  30 
  31 static void process_uri(char *uri);
  32 
  33 void orte_oob_base_send_nb(int fd, short args, void *cbdata)
  34 {
  35     orte_oob_send_t *cd = (orte_oob_send_t*)cbdata;
  36     orte_rml_send_t *msg;
  37     mca_base_component_list_item_t *cli;
  38     orte_oob_base_peer_t *pr;
  39     int rc;
  40     uint64_t ui64;
  41     bool msg_sent;
  42     mca_oob_base_component_t *component;
  43     bool reachable;
  44     char *uri;
  45 
  46     ORTE_ACQUIRE_OBJECT(cd);
  47 
  48     /* done with this. release it now */
  49     msg = cd->msg;
  50     OBJ_RELEASE(cd);
  51 
  52     opal_output_verbose(5, orte_oob_base_framework.framework_output,
  53                         "%s oob:base:send to target %s - attempt %u",
  54                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
  55                         ORTE_NAME_PRINT(&msg->dst), msg->retries);
  56 
  57     /* don't try forever - if we have exceeded the number of retries,
  58      * then report this message as undeliverable even if someone continues
  59      * to think they could reach it */
  60     if (orte_rml_base.max_retries <= msg->retries) {
  61         msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
  62         ORTE_RML_SEND_COMPLETE(msg);
  63         return;
  64     }
  65 
  66     /* check if we have this peer in our hash table */
  67     memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t));
  68     if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
  69                                                          ui64, (void**)&pr) ||
  70         NULL == pr) {
  71         opal_output_verbose(5, orte_oob_base_framework.framework_output,
  72                             "%s oob:base:send unknown peer %s",
  73                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
  74                             ORTE_NAME_PRINT(&msg->dst));
  75         /* for direct launched procs, the URI might be in the database,
  76          * so check there next - if it is, the peer object will be added
  77          * to our hash table. However, we don't want to chase up to the
  78          * server after it, so indicate it is optional
  79          */
  80         OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_PROC_URI, &msg->dst,
  81                                       (char**)&uri, OPAL_STRING);
  82         if (OPAL_SUCCESS == rc ) {
  83             if (NULL != uri) {
  84                 process_uri(uri);
  85                 if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
  86                                                                      ui64, (void**)&pr) ||
  87                     NULL == pr) {
  88                     /* that is just plain wrong */
  89                     ORTE_ERROR_LOG(ORTE_ERR_ADDRESSEE_UNKNOWN);
  90                     msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
  91                     ORTE_RML_SEND_COMPLETE(msg);
  92                     return;
  93                 }
  94             } else {
  95                 ORTE_ERROR_LOG(ORTE_ERR_ADDRESSEE_UNKNOWN);
  96                 msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
  97                 ORTE_RML_SEND_COMPLETE(msg);
  98                 return;
  99             }
 100         } else {
 101             /* even though we don't know about this peer yet, we still might
 102              * be able to get to it via routing, so ask each component if
 103              * it can reach it
 104              */
 105             reachable = false;
 106             pr = NULL;
 107             OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
 108                 component = (mca_oob_base_component_t*)cli->cli_component;
 109                 if (NULL != component->is_reachable) {
 110                     if (component->is_reachable(&msg->dst)) {
 111                         /* there is a way to reach this peer - record it
 112                          * so we don't waste this time again
 113                          */
 114                         if (NULL == pr) {
 115                             pr = OBJ_NEW(orte_oob_base_peer_t);
 116                             if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, ui64, (void*)pr))) {
 117                                 ORTE_ERROR_LOG(rc);
 118                                 msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
 119                                 ORTE_RML_SEND_COMPLETE(msg);
 120                                 return;
 121                             }
 122                         }
 123                         /* mark that this component can reach the peer */
 124                         opal_bitmap_set_bit(&pr->addressable, component->idx);
 125                         /* flag that at least one component can reach this peer */
 126                         reachable = true;
 127                     }
 128                 }
 129             }
 130             /* if nobody could reach it, then that's an error */
 131             if (!reachable) {
 132                 /* if we are a daemon or HNP, then it could be that
 133                  * this is a local proc we just haven't heard from
 134                  * yet due to a race condition. Check that situation */
 135                 if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
 136                     ++msg->retries;
 137                     if (msg->retries < orte_rml_base.max_retries) {
 138                         ORTE_OOB_SEND(msg);
 139                         return;
 140                     }
 141                 }
 142                 msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
 143                 ORTE_RML_SEND_COMPLETE(msg);
 144                 return;
 145             }
 146         }
 147     }
 148 
 149 
 150     /* if we already have a connection to this peer, use it */
 151     if (NULL != pr->component) {
 152         /* post this msg for send by this transport - the component
 153          * runs on our event base, so we can just call their function
 154          */
 155         opal_output_verbose(5, orte_oob_base_framework.framework_output,
 156                             "%s oob:base:send known transport for peer %s",
 157                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 158                             ORTE_NAME_PRINT(&msg->dst));
 159         if (ORTE_SUCCESS == (rc = pr->component->send_nb(msg))) {
 160             return;
 161         }
 162     }
 163 
 164     /* if we haven't identified a transport to this peer,
 165      * loop across all available components in priority order until
 166      * one replies that it has a module that can reach this peer.
 167      * Let it try to make the connection
 168      */
 169     msg_sent = false;
 170     OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
 171         component = (mca_oob_base_component_t*)cli->cli_component;
 172         /* is this peer reachable via this component? */
 173         if (!component->is_reachable(&msg->dst)) {
 174             continue;
 175         }
 176         /* it is addressable, so attempt to send via that transport */
 177         if (ORTE_SUCCESS == (rc = component->send_nb(msg))) {
 178             /* the msg status will be set upon send completion/failure */
 179             msg_sent = true;
 180             /* point to this transport for any future messages */
 181             pr->component = component;
 182             break;
 183         } else if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
 184             /* components return "next option" if they can't connect
 185              * to this peer. anything else is a true error.
 186              */
 187             ORTE_ERROR_LOG(rc);
 188             msg->status = rc;
 189             ORTE_RML_SEND_COMPLETE(msg);
 190             return;
 191         }
 192     }
 193 
 194     /* if no component can reach this peer, that's an error - post
 195      * it back to the RML for handling
 196      */
 197     if (!msg_sent) {
 198         opal_output_verbose(5, orte_oob_base_framework.framework_output,
 199                             "%s oob:base:send no path to target %s",
 200                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 201                             ORTE_NAME_PRINT(&msg->dst));
 202         msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
 203         ORTE_RML_SEND_COMPLETE(msg);
 204     }
 205 }
 206 
 207 /**
 208  * Obtain a uri for initial connection purposes
 209  *
 210  * During initial wireup, we can only transfer contact info on the daemon
 211  * command line. This limits what we can send to a string representation of
 212  * the actual contact info, which gets sent in a uri-like form. Not every
 213  * oob module can support this transaction, so this function will loop
 214  * across all oob components/modules, letting each add to the uri string if
 215  * it supports bootstrap operations. An error will be returned in the cbfunc
 216  * if NO component can successfully provide a contact.
 217  *
 218  * Note: since there is a limit to what an OS will allow on a cmd line, we
 219  * impose a limit on the length of the resulting uri via an MCA param. The
 220  * default value of -1 implies unlimited - however, users with large numbers
 221  * of interfaces on their nodes may wish to restrict the size.
 222  */
 223 void orte_oob_base_get_addr(char **uri)
 224 {
 225     char *turi, *final=NULL, *tmp;
 226     size_t len = 0;
 227     int rc=ORTE_SUCCESS;
 228     bool one_added = false;
 229     mca_base_component_list_item_t *cli;
 230     mca_oob_base_component_t *component;
 231     opal_value_t val;
 232 
 233     /* start with our process name */
 234     if (ORTE_SUCCESS != (rc = orte_util_convert_process_name_to_string(&final, ORTE_PROC_MY_NAME))) {
 235         ORTE_ERROR_LOG(rc);
 236         *uri = NULL;
 237         return;
 238     }
 239     len = strlen(final);
 240 
 241     /* loop across all available modules to get their input
 242      * up to the max length
 243      */
 244     OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
 245         component = (mca_oob_base_component_t*)cli->cli_component;
 246         /* ask the component for its input, obtained when it
 247          * opened its modules
 248          */
 249         if (NULL == component->get_addr) {
 250             /* doesn't support this ability */
 251             continue;
 252         }
 253         /* the components operate within our event base, so we
 254          * can directly call their get_uri function to get the
 255          * pointer to the uri - this is not a copy, so
 256          * do NOT free it!
 257          */
 258         turi = component->get_addr();
 259         if (NULL != turi) {
 260             /* check overall length for limits */
 261             if (0 < orte_oob_base.max_uri_length &&
 262                 orte_oob_base.max_uri_length < (int)(len + strlen(turi))) {
 263                 /* cannot accept the payload */
 264                 continue;
 265             }
 266             /* add new value to final one */
 267             opal_asprintf(&tmp, "%s;%s", final, turi);
 268             free(turi);
 269             free(final);
 270             final = tmp;
 271             len = strlen(final);
 272             /* flag that at least one contributed */
 273             one_added = true;
 274         }
 275     }
 276 
 277     if (!one_added) {
 278         /* nobody could contribute */
 279         if (NULL != final) {
 280             free(final);
 281             final = NULL;
 282         }
 283     }
 284 
 285     *uri = final;
 286     /* push this into our modex storage */
 287     OBJ_CONSTRUCT(&val, opal_value_t);
 288     val.key = OPAL_PMIX_PROC_URI;
 289     val.type = OPAL_STRING;
 290     val.data.string = final;
 291     if (OPAL_SUCCESS != (rc = opal_pmix.store_local(ORTE_PROC_MY_NAME, &val))) {
 292         ORTE_ERROR_LOG(rc);
 293     }
 294     val.key = NULL;
 295     val.data.string = NULL;
 296     OBJ_DESTRUCT(&val);
 297 }
 298 
 299 static void process_uri(char *uri)
 300 {
 301     orte_process_name_t peer;
 302     char *cptr;
 303     mca_base_component_list_item_t *cli;
 304     mca_oob_base_component_t *component;
 305     char **uris=NULL;
 306     int rc;
 307     uint64_t ui64;
 308     orte_oob_base_peer_t *pr;
 309 
 310     /* find the first semi-colon in the string */
 311     cptr = strchr(uri, ';');
 312     if (NULL == cptr) {
 313         /* got a problem - there must be at least two fields,
 314          * the first containing the process name of our peer
 315          * and all others containing the OOB contact info
 316          */
 317         ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
 318         return;
 319     }
 320     *cptr = '\0';
 321     cptr++;
 322 
 323     /* the first field is the process name, so convert it */
 324     orte_util_convert_string_to_process_name(&peer, uri);
 325 
 326     /* if the peer is us, no need to go further as we already
 327      * know our own contact info
 328      */
 329     if (peer.jobid == ORTE_PROC_MY_NAME->jobid &&
 330         peer.vpid == ORTE_PROC_MY_NAME->vpid) {
 331         opal_output_verbose(5, orte_oob_base_framework.framework_output,
 332                             "%s:set_addr peer %s is me",
 333                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 334                             ORTE_NAME_PRINT(&peer));
 335         return;
 336     }
 337 
 338     /* split the rest of the uri into component parts */
 339     uris = opal_argv_split(cptr, ';');
 340 
 341     /* get the peer object for this process */
 342     memcpy(&ui64, (char*)&peer, sizeof(uint64_t));
 343     if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
 344                                                          ui64, (void**)&pr) ||
 345         NULL == pr) {
 346         pr = OBJ_NEW(orte_oob_base_peer_t);
 347         if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, ui64, (void*)pr))) {
 348             ORTE_ERROR_LOG(rc);
 349             opal_argv_free(uris);
 350             return;
 351         }
 352     }
 353 
 354     /* loop across all available components and let them extract
 355      * whatever piece(s) of the uri they find relevant - they
 356      * are all operating on our event base, so we can just
 357      * directly call their functions
 358      */
 359     rc = ORTE_ERR_UNREACH;
 360     OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
 361         component = (mca_oob_base_component_t*)cli->cli_component;
 362         opal_output_verbose(5, orte_oob_base_framework.framework_output,
 363                             "%s:set_addr checking if peer %s is reachable via component %s",
 364                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 365                             ORTE_NAME_PRINT(&peer), component->oob_base.mca_component_name);
 366         if (NULL != component->set_addr) {
 367             if (ORTE_SUCCESS == component->set_addr(&peer, uris)) {
 368                 /* this component found reachable addresses
 369                  * in the uris
 370                  */
 371                 opal_output_verbose(5, orte_oob_base_framework.framework_output,
 372                                     "%s: peer %s is reachable via component %s",
 373                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 374                                     ORTE_NAME_PRINT(&peer), component->oob_base.mca_component_name);
 375                 opal_bitmap_set_bit(&pr->addressable, component->idx);
 376             } else {
 377                 opal_output_verbose(5, orte_oob_base_framework.framework_output,
 378                                     "%s: peer %s is NOT reachable via component %s",
 379                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 380                                     ORTE_NAME_PRINT(&peer), component->oob_base.mca_component_name);
 381             }
 382         }
 383     }
 384     opal_argv_free(uris);
 385 }
 386 
 387 #if OPAL_ENABLE_FT_CR == 1
 388 void orte_oob_base_ft_event(int sd, short argc, void *cbdata)
 389 {
 390     int rc;
 391     mca_base_component_list_item_t *cli;
 392     mca_oob_base_component_t *component;
 393     orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
 394 
 395     opal_output_verbose(5, orte_oob_base_framework.framework_output,
 396                         "%s oob:base:ft_event %s(%d)",
 397                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 398                         orte_job_state_to_str(state->job_state),
 399                         state->job_state);
 400 
 401     /* loop across all available modules in priority order
 402      * and call each one's ft_event handler
 403      */
 404     OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
 405         component = (mca_oob_base_component_t*)cli->cli_component;
 406         if (NULL == component->ft_event) {
 407             /* doesn't support this ability */
 408             continue;
 409         }
 410 
 411         if (ORTE_SUCCESS != (rc = component->ft_event(state->job_state))) {
 412             ORTE_ERROR_LOG(rc);
 413         }
 414     }
 415     OBJ_RELEASE(state);
 416 }
 417 
 418 #endif

/* [<][>][^][v][top][bottom][index][help] */