This source file includes the following definitions.
- orte_oob_base_send_nb
- orte_oob_base_get_addr
- process_uri
- orte_oob_base_ft_event
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 #include "orte_config.h"
  15 #include "orte/constants.h"
  16 
  17 #include "opal/util/output.h"
  18 #include "opal/mca/pmix/pmix.h"
  19 #include "opal/util/argv.h"
  20 #include "opal/util/printf.h"
  21 
  22 #include "orte/mca/errmgr/errmgr.h"
  23 #include "orte/mca/state/state.h"
  24 #include "orte/mca/rml/rml.h"
  25 #include "orte/util/threads.h"
  26 #include "orte/mca/oob/base/base.h"
  27 #if OPAL_ENABLE_FT_CR == 1
  28 #include "orte/mca/state/base/base.h"
  29 #endif
  30 
  31 static void process_uri(char *uri);
  32 
  33 void orte_oob_base_send_nb(int fd, short args, void *cbdata)
  34 {
  35     orte_oob_send_t *cd = (orte_oob_send_t*)cbdata;
  36     orte_rml_send_t *msg;
  37     mca_base_component_list_item_t *cli;
  38     orte_oob_base_peer_t *pr;
  39     int rc;
  40     uint64_t ui64;
  41     bool msg_sent;
  42     mca_oob_base_component_t *component;
  43     bool reachable;
  44     char *uri;
  45 
  46     ORTE_ACQUIRE_OBJECT(cd);
  47 
  48     
  49     msg = cd->msg;
  50     OBJ_RELEASE(cd);
  51 
  52     opal_output_verbose(5, orte_oob_base_framework.framework_output,
  53                         "%s oob:base:send to target %s - attempt %u",
  54                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
  55                         ORTE_NAME_PRINT(&msg->dst), msg->retries);
  56 
  57     
  58 
  59 
  60     if (orte_rml_base.max_retries <= msg->retries) {
  61         msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
  62         ORTE_RML_SEND_COMPLETE(msg);
  63         return;
  64     }
  65 
  66     
  67     memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t));
  68     if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
  69                                                          ui64, (void**)&pr) ||
  70         NULL == pr) {
  71         opal_output_verbose(5, orte_oob_base_framework.framework_output,
  72                             "%s oob:base:send unknown peer %s",
  73                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
  74                             ORTE_NAME_PRINT(&msg->dst));
  75         
  76 
  77 
  78 
  79 
  80         OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_PROC_URI, &msg->dst,
  81                                       (char**)&uri, OPAL_STRING);
  82         if (OPAL_SUCCESS == rc ) {
  83             if (NULL != uri) {
  84                 process_uri(uri);
  85                 if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
  86                                                                      ui64, (void**)&pr) ||
  87                     NULL == pr) {
  88                     
  89                     ORTE_ERROR_LOG(ORTE_ERR_ADDRESSEE_UNKNOWN);
  90                     msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
  91                     ORTE_RML_SEND_COMPLETE(msg);
  92                     return;
  93                 }
  94             } else {
  95                 ORTE_ERROR_LOG(ORTE_ERR_ADDRESSEE_UNKNOWN);
  96                 msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
  97                 ORTE_RML_SEND_COMPLETE(msg);
  98                 return;
  99             }
 100         } else {
 101             
 102 
 103 
 104 
 105             reachable = false;
 106             pr = NULL;
 107             OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
 108                 component = (mca_oob_base_component_t*)cli->cli_component;
 109                 if (NULL != component->is_reachable) {
 110                     if (component->is_reachable(&msg->dst)) {
 111                         
 112 
 113 
 114                         if (NULL == pr) {
 115                             pr = OBJ_NEW(orte_oob_base_peer_t);
 116                             if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, ui64, (void*)pr))) {
 117                                 ORTE_ERROR_LOG(rc);
 118                                 msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
 119                                 ORTE_RML_SEND_COMPLETE(msg);
 120                                 return;
 121                             }
 122                         }
 123                         
 124                         opal_bitmap_set_bit(&pr->addressable, component->idx);
 125                         
 126                         reachable = true;
 127                     }
 128                 }
 129             }
 130             
 131             if (!reachable) {
 132                 
 133 
 134 
 135                 if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
 136                     ++msg->retries;
 137                     if (msg->retries < orte_rml_base.max_retries) {
 138                         ORTE_OOB_SEND(msg);
 139                         return;
 140                     }
 141                 }
 142                 msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
 143                 ORTE_RML_SEND_COMPLETE(msg);
 144                 return;
 145             }
 146         }
 147     }
 148 
 149 
 150     
 151     if (NULL != pr->component) {
 152         
 153 
 154 
 155         opal_output_verbose(5, orte_oob_base_framework.framework_output,
 156                             "%s oob:base:send known transport for peer %s",
 157                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 158                             ORTE_NAME_PRINT(&msg->dst));
 159         if (ORTE_SUCCESS == (rc = pr->component->send_nb(msg))) {
 160             return;
 161         }
 162     }
 163 
 164     
 165 
 166 
 167 
 168 
 169     msg_sent = false;
 170     OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
 171         component = (mca_oob_base_component_t*)cli->cli_component;
 172         
 173         if (!component->is_reachable(&msg->dst)) {
 174             continue;
 175         }
 176         
 177         if (ORTE_SUCCESS == (rc = component->send_nb(msg))) {
 178             
 179             msg_sent = true;
 180             
 181             pr->component = component;
 182             break;
 183         } else if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
 184             
 185 
 186 
 187             ORTE_ERROR_LOG(rc);
 188             msg->status = rc;
 189             ORTE_RML_SEND_COMPLETE(msg);
 190             return;
 191         }
 192     }
 193 
 194     
 195 
 196 
 197     if (!msg_sent) {
 198         opal_output_verbose(5, orte_oob_base_framework.framework_output,
 199                             "%s oob:base:send no path to target %s",
 200                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 201                             ORTE_NAME_PRINT(&msg->dst));
 202         msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
 203         ORTE_RML_SEND_COMPLETE(msg);
 204     }
 205 }
 206 
 207 
 208 
 209 
 210 
 211 
 212 
 213 
 214 
 215 
 216 
 217 
 218 
 219 
 220 
 221 
 222 
 223 void orte_oob_base_get_addr(char **uri)
 224 {
 225     char *turi, *final=NULL, *tmp;
 226     size_t len = 0;
 227     int rc=ORTE_SUCCESS;
 228     bool one_added = false;
 229     mca_base_component_list_item_t *cli;
 230     mca_oob_base_component_t *component;
 231     opal_value_t val;
 232 
 233     
 234     if (ORTE_SUCCESS != (rc = orte_util_convert_process_name_to_string(&final, ORTE_PROC_MY_NAME))) {
 235         ORTE_ERROR_LOG(rc);
 236         *uri = NULL;
 237         return;
 238     }
 239     len = strlen(final);
 240 
 241     
 242 
 243 
 244     OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
 245         component = (mca_oob_base_component_t*)cli->cli_component;
 246         
 247 
 248 
 249         if (NULL == component->get_addr) {
 250             
 251             continue;
 252         }
 253         
 254 
 255 
 256 
 257 
 258         turi = component->get_addr();
 259         if (NULL != turi) {
 260             
 261             if (0 < orte_oob_base.max_uri_length &&
 262                 orte_oob_base.max_uri_length < (int)(len + strlen(turi))) {
 263                 
 264                 continue;
 265             }
 266             
 267             opal_asprintf(&tmp, "%s;%s", final, turi);
 268             free(turi);
 269             free(final);
 270             final = tmp;
 271             len = strlen(final);
 272             
 273             one_added = true;
 274         }
 275     }
 276 
 277     if (!one_added) {
 278         
 279         if (NULL != final) {
 280             free(final);
 281             final = NULL;
 282         }
 283     }
 284 
 285     *uri = final;
 286     
 287     OBJ_CONSTRUCT(&val, opal_value_t);
 288     val.key = OPAL_PMIX_PROC_URI;
 289     val.type = OPAL_STRING;
 290     val.data.string = final;
 291     if (OPAL_SUCCESS != (rc = opal_pmix.store_local(ORTE_PROC_MY_NAME, &val))) {
 292         ORTE_ERROR_LOG(rc);
 293     }
 294     val.key = NULL;
 295     val.data.string = NULL;
 296     OBJ_DESTRUCT(&val);
 297 }
 298 
 299 static void process_uri(char *uri)
 300 {
 301     orte_process_name_t peer;
 302     char *cptr;
 303     mca_base_component_list_item_t *cli;
 304     mca_oob_base_component_t *component;
 305     char **uris=NULL;
 306     int rc;
 307     uint64_t ui64;
 308     orte_oob_base_peer_t *pr;
 309 
 310     
 311     cptr = strchr(uri, ';');
 312     if (NULL == cptr) {
 313         
 314 
 315 
 316 
 317         ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
 318         return;
 319     }
 320     *cptr = '\0';
 321     cptr++;
 322 
 323     
 324     orte_util_convert_string_to_process_name(&peer, uri);
 325 
 326     
 327 
 328 
 329     if (peer.jobid == ORTE_PROC_MY_NAME->jobid &&
 330         peer.vpid == ORTE_PROC_MY_NAME->vpid) {
 331         opal_output_verbose(5, orte_oob_base_framework.framework_output,
 332                             "%s:set_addr peer %s is me",
 333                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 334                             ORTE_NAME_PRINT(&peer));
 335         return;
 336     }
 337 
 338     
 339     uris = opal_argv_split(cptr, ';');
 340 
 341     
 342     memcpy(&ui64, (char*)&peer, sizeof(uint64_t));
 343     if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
 344                                                          ui64, (void**)&pr) ||
 345         NULL == pr) {
 346         pr = OBJ_NEW(orte_oob_base_peer_t);
 347         if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, ui64, (void*)pr))) {
 348             ORTE_ERROR_LOG(rc);
 349             opal_argv_free(uris);
 350             return;
 351         }
 352     }
 353 
 354     
 355 
 356 
 357 
 358 
 359     rc = ORTE_ERR_UNREACH;
 360     OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
 361         component = (mca_oob_base_component_t*)cli->cli_component;
 362         opal_output_verbose(5, orte_oob_base_framework.framework_output,
 363                             "%s:set_addr checking if peer %s is reachable via component %s",
 364                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 365                             ORTE_NAME_PRINT(&peer), component->oob_base.mca_component_name);
 366         if (NULL != component->set_addr) {
 367             if (ORTE_SUCCESS == component->set_addr(&peer, uris)) {
 368                 
 369 
 370 
 371                 opal_output_verbose(5, orte_oob_base_framework.framework_output,
 372                                     "%s: peer %s is reachable via component %s",
 373                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 374                                     ORTE_NAME_PRINT(&peer), component->oob_base.mca_component_name);
 375                 opal_bitmap_set_bit(&pr->addressable, component->idx);
 376             } else {
 377                 opal_output_verbose(5, orte_oob_base_framework.framework_output,
 378                                     "%s: peer %s is NOT reachable via component %s",
 379                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 380                                     ORTE_NAME_PRINT(&peer), component->oob_base.mca_component_name);
 381             }
 382         }
 383     }
 384     opal_argv_free(uris);
 385 }
 386 
 387 #if OPAL_ENABLE_FT_CR == 1
 388 void orte_oob_base_ft_event(int sd, short argc, void *cbdata)
 389 {
 390     int rc;
 391     mca_base_component_list_item_t *cli;
 392     mca_oob_base_component_t *component;
 393     orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
 394 
 395     opal_output_verbose(5, orte_oob_base_framework.framework_output,
 396                         "%s oob:base:ft_event %s(%d)",
 397                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 398                         orte_job_state_to_str(state->job_state),
 399                         state->job_state);
 400 
 401     
 402 
 403 
 404     OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
 405         component = (mca_oob_base_component_t*)cli->cli_component;
 406         if (NULL == component->ft_event) {
 407             
 408             continue;
 409         }
 410 
 411         if (ORTE_SUCCESS != (rc = component->ft_event(state->job_state))) {
 412             ORTE_ERROR_LOG(rc);
 413         }
 414     }
 415     OBJ_RELEASE(state);
 416 }
 417 
 418 #endif