root/opal/mca/btl/usnic/btl_usnic_proc.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. proc_construct
  2. proc_destruct
  3. opal_btl_usnic_proc_lookup_ompi
  4. opal_btl_usnic_proc_lookup_endpoint
  5. create_proc
  6. compute_weight
  7. edge_pairs_to_match_table
  8. create_proc_module_graph
  9. match_modex
  10. start_av_insert
  11. opal_btl_usnic_create_endpoint
  12. opal_btl_usnic_proc_match

   1 /*
   2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2011 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2006      Sandia National Laboratories. All rights
  13  *                         reserved.
  14  * Copyright (c) 2013-2016 Cisco Systems, Inc.  All rights reserved.
  15  * Copyright (c) 2013-2014 Intel, Inc. All rights reserved
  16  * $COPYRIGHT$
  17  *
  18  * Additional copyrights may follow
  19  *
  20  * $HEADER$
  21  */
  22 #include <netinet/in.h>
  23 
  24 #include "opal_config.h"
  25 
  26 #include "opal_stdint.h"
  27 #include "opal/util/arch.h"
  28 #include "opal/util/show_help.h"
  29 #include "opal/constants.h"
  30 #include "opal/util/bipartite_graph.h"
  31 #include "opal/util/string_copy.h"
  32 
  33 #include "btl_usnic_compat.h"
  34 #include "btl_usnic.h"
  35 #include "btl_usnic_proc.h"
  36 #include "btl_usnic_endpoint.h"
  37 #include "btl_usnic_module.h"
  38 #include "btl_usnic_util.h"
  39 
  40 /* larger weight values are more desirable (i.e., worth, not cost) */
  41 enum {
  42     WEIGHT_UNREACHABLE = -1
  43 };
  44 
  45 /* Helper macros for "match_modex" and friends for translating between array
  46  * indices and vertex IDs.  Module vertices always come first in the graph,
  47  * followed by proc (endpoint) vertices. */
  48 #define PROC_VERTEX(modex_idx) (mca_btl_usnic_component.num_modules + modex_idx)
  49 #define MODULE_VERTEX(module_idx) (module_idx)
  50 #define PROC_INDEX(proc_vertex) ((proc_vertex) - mca_btl_usnic_component.num_modules)
  51 #define MODULE_INDEX(module_vertex) (module_vertex)
  52 
  53 static void proc_construct(opal_btl_usnic_proc_t* proc)
  54 {
  55     proc->proc_opal = 0;
  56     proc->proc_modex = NULL;
  57     proc->proc_modex_count = 0;
  58     proc->proc_modex_claimed = NULL;
  59     proc->proc_endpoints = NULL;
  60     proc->proc_endpoint_count = 0;
  61     proc->proc_ep_match_table = NULL;
  62     proc->proc_match_exists = false;
  63 
  64     /* add to list of all proc instance */
  65     opal_list_append(&mca_btl_usnic_component.usnic_procs, &proc->super);
  66 }
  67 
  68 
  69 static void proc_destruct(opal_btl_usnic_proc_t* proc)
  70 {
  71     /* remove from list of all proc instances */
  72     opal_list_remove_item(&mca_btl_usnic_component.usnic_procs, &proc->super);
  73 
  74     /* release resources */
  75     if (NULL != proc->proc_modex) {
  76         free(proc->proc_modex);
  77         proc->proc_modex = NULL;
  78     }
  79 
  80     if (NULL != proc->proc_modex_claimed) {
  81         free(proc->proc_modex_claimed);
  82         proc->proc_modex_claimed = NULL;
  83     }
  84 
  85     if (NULL != proc->proc_ep_match_table) {
  86         free(proc->proc_ep_match_table);
  87         proc->proc_ep_match_table = NULL;
  88     }
  89 
  90     /* Release all endpoints associated with this proc */
  91     if (NULL != proc->proc_endpoints) {
  92         free(proc->proc_endpoints);
  93         proc->proc_endpoints = NULL;
  94     }
  95 }
  96 
  97 
  98 OBJ_CLASS_INSTANCE(opal_btl_usnic_proc_t,
  99                    opal_list_item_t,
 100                    proc_construct,
 101                    proc_destruct);
 102 
 103 /*
 104  * Look for an existing usnic process instance based on the
 105  * associated opal_proc_t instance.
 106  */
 107 opal_btl_usnic_proc_t *
 108 opal_btl_usnic_proc_lookup_ompi(opal_proc_t* opal_proc)
 109 {
 110     opal_btl_usnic_proc_t* usnic_proc;
 111 
 112     for (usnic_proc = (opal_btl_usnic_proc_t*)
 113              opal_list_get_first(&mca_btl_usnic_component.usnic_procs);
 114          usnic_proc != (opal_btl_usnic_proc_t*)
 115              opal_list_get_end(&mca_btl_usnic_component.usnic_procs);
 116          usnic_proc  = (opal_btl_usnic_proc_t*)
 117              opal_list_get_next(usnic_proc)) {
 118         if (usnic_proc->proc_opal == opal_proc) {
 119             return usnic_proc;
 120         }
 121     }
 122 
 123     return NULL;
 124 }
 125 
 126 
 127 /*
 128  * Look for an existing usnic proc based on a hashed RTE process
 129  * name.
 130  */
 131 opal_btl_usnic_endpoint_t *
 132 opal_btl_usnic_proc_lookup_endpoint(opal_btl_usnic_module_t *receiver,
 133                                     uint64_t sender_proc_name)
 134 {
 135     opal_btl_usnic_proc_t *proc;
 136     opal_btl_usnic_endpoint_t *endpoint;
 137     opal_list_item_t *item;
 138 
 139     MSGDEBUG1_OUT("lookup_endpoint: recvmodule=%p sendhash=0x%" PRIx64,
 140                   (void *)receiver, sender_proc_name);
 141 
 142     opal_mutex_lock(&receiver->all_endpoints_lock);
 143     for (item = opal_list_get_first(&receiver->all_endpoints);
 144          item != opal_list_get_end(&receiver->all_endpoints);
 145          item = opal_list_get_next(item)) {
 146         endpoint = container_of(item, opal_btl_usnic_endpoint_t,
 147                                 endpoint_endpoint_li);
 148         proc = endpoint->endpoint_proc;
 149         /* Note that this works today because opal_proc_t->proc_name
 150            is unique across the universe.  George is potentially
 151            working to give handles instead of proc names, and then
 152            have a function pointer to perform comparisons.  This would
 153            be bad here in the critical path, though... */
 154         if (usnic_compat_rte_hash_name(&(proc->proc_opal->proc_name)) ==
 155             sender_proc_name) {
 156             MSGDEBUG1_OUT("lookup_endpoint: matched endpoint=%p",
 157                           (void *)endpoint);
 158             opal_mutex_unlock(&receiver->all_endpoints_lock);
 159             return endpoint;
 160         }
 161     }
 162     opal_mutex_unlock(&receiver->all_endpoints_lock);
 163 
 164     /* Didn't find it */
 165     return NULL;
 166 }
 167 
 168 /*
 169  * Create an opal_btl_usnic_proc_t and initialize it with modex info
 170  * and an empty array of endpoints.
 171  *
 172  * Returns OPAL_ERR_UNREACH if we can't reach the peer (i.e., we can't
 173  * find their modex data).
 174  */
 175 static int create_proc(opal_proc_t *opal_proc,
 176                        opal_btl_usnic_proc_t **usnic_proc)
 177 {
 178     opal_btl_usnic_proc_t *proc = NULL;
 179     size_t size;
 180     int rc;
 181 
 182     *usnic_proc = NULL;
 183 
 184     /* Create the proc if it doesn't already exist */
 185     proc = OBJ_NEW(opal_btl_usnic_proc_t);
 186     if (NULL == proc) {
 187         return OPAL_ERR_OUT_OF_RESOURCE;
 188     }
 189 
 190     /* Initialize number of peers */
 191     proc->proc_endpoint_count = 0;
 192     proc->proc_opal = opal_proc;
 193 
 194     /* query for the peer address info */
 195     usnic_compat_modex_recv(&rc, &mca_btl_usnic_component.super.btl_version,
 196                             opal_proc, &proc->proc_modex, &size);
 197 
 198     /* If this proc simply doesn't have this key, then they're not
 199        running the usnic BTL -- just ignore them.  Otherwise, show an
 200        error message. */
 201     if (OPAL_ERR_NOT_FOUND == rc) {
 202         OBJ_RELEASE(proc);
 203         return OPAL_ERR_UNREACH;
 204     } else if (OPAL_SUCCESS != rc) {
 205         opal_show_help("help-mpi-btl-usnic.txt",
 206                        "internal error during init",
 207                        true,
 208                        opal_process_info.nodename,
 209                        "<none>", "<none>",
 210                        "opal_modex_recv() failed", __FILE__, __LINE__,
 211                        opal_strerror(rc));
 212         OBJ_RELEASE(proc);
 213         return OPAL_ERROR;
 214     }
 215 
 216     if ((size % sizeof(opal_btl_usnic_modex_t)) != 0) {
 217         char msg[1024];
 218 
 219         snprintf(msg, sizeof(msg),
 220                  "sizeof(modex for peer %s data) == %d, expected multiple of %d",
 221                  usnic_compat_proc_name_print(&opal_proc->proc_name),
 222                  (int) size, (int) sizeof(opal_btl_usnic_modex_t));
 223         opal_show_help("help-mpi-btl-usnic.txt", "internal error during init",
 224                        true,
 225                        opal_process_info.nodename,
 226                        "<none>", 0,
 227                        "invalid modex data", __FILE__, __LINE__,
 228                        msg);
 229 
 230         OBJ_RELEASE(proc);
 231         return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
 232     }
 233 
 234     /* See if the peer has the same underlying wire protocol as me.
 235        If not, then print an error and ignore this peer. */
 236 // RFXXX - things are weird when i force this to fail
 237     if (mca_btl_usnic_component.transport_protocol !=
 238         proc->proc_modex->protocol) {
 239         uint64_t proto;
 240         char protostr[32];
 241         proto = mca_btl_usnic_component.transport_protocol;
 242         memset(protostr, 0, sizeof(protostr));
 243         opal_string_copy(protostr, fi_tostr(&proto, FI_TYPE_PROTOCOL),
 244                 sizeof(protostr));
 245         proto = proc->proc_modex->protocol;
 246         opal_show_help("help-mpi-btl-usnic.txt",
 247                        "transport mismatch",
 248                        true,
 249                        opal_process_info.nodename,
 250                        protostr,
 251                        "peer",
 252                        fi_tostr(&proto, FI_TYPE_PROTOCOL));
 253 
 254         OBJ_RELEASE(proc);
 255         return OPAL_ERR_UNREACH;
 256     }
 257 
 258     proc->proc_modex_count = size / sizeof(opal_btl_usnic_modex_t);
 259     if (0 == proc->proc_modex_count) {
 260         proc->proc_endpoints = NULL;
 261         OBJ_RELEASE(proc);
 262         return OPAL_ERR_UNREACH;
 263     }
 264 
 265     proc->proc_modex_claimed = (bool*)
 266         calloc(proc->proc_modex_count, sizeof(bool));
 267     if (NULL == proc->proc_modex_claimed) {
 268         OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
 269         OBJ_RELEASE(proc);
 270         return OPAL_ERR_OUT_OF_RESOURCE;
 271     }
 272 
 273     proc->proc_endpoints = (mca_btl_base_endpoint_t**)
 274         calloc(proc->proc_modex_count, sizeof(mca_btl_base_endpoint_t*));
 275     if (NULL == proc->proc_endpoints) {
 276         OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
 277         OBJ_RELEASE(proc);
 278         return OPAL_ERR_OUT_OF_RESOURCE;
 279     }
 280 
 281     *usnic_proc = proc;
 282     return OPAL_SUCCESS;
 283 }
 284 
 285 /* Compare the addresses of the local interface corresponding to module and the
 286  * remote interface corresponding to proc_modex_addr.  Returns a weight value
 287  * (higher values indicate more desirable connections). */
 288 static uint64_t compute_weight(
 289     opal_btl_usnic_module_t *module,
 290     opal_btl_usnic_modex_t *proc_modex_addr)
 291 {
 292     char my_ip_string[INET_ADDRSTRLEN], peer_ip_string[INET_ADDRSTRLEN];
 293     struct sockaddr_in sin;
 294     struct sockaddr_in *sinp;
 295     struct fi_usnic_info *uip;
 296     uint32_t mynet, peernet;
 297     int err;
 298     int metric;
 299     uint32_t min_link_speed_gbps;
 300 
 301     uip = &module->usnic_info;
 302     sinp = module->fabric_info->src_addr;
 303     inet_ntop(AF_INET, &sinp->sin_addr,
 304               my_ip_string, sizeof(my_ip_string));
 305     inet_ntop(AF_INET, &proc_modex_addr->ipv4_addr,
 306               peer_ip_string, sizeof(peer_ip_string));
 307 
 308     /* Just compare the CIDR-masked IP address to see if they're on
 309        the same network.  If so, we're good. */
 310     mynet = sinp->sin_addr.s_addr & uip->ui.v1.ui_netmask_be;
 311     peernet = proc_modex_addr->ipv4_addr & proc_modex_addr->netmask;
 312     opal_output_verbose(5, USNIC_OUT,
 313                         "btl:usnic:%s: checking my IP address/subnet (%s/%d) vs. peer (%s/%d): %s",
 314                         __func__, my_ip_string,
 315                         usnic_netmask_to_cidrlen(uip->ui.v1.ui_netmask_be),
 316                         peer_ip_string,
 317                         usnic_netmask_to_cidrlen(proc_modex_addr->netmask),
 318                         (mynet == peernet ? "match" : "DO NOT match"));
 319 
 320     min_link_speed_gbps = MIN(module->super.btl_bandwidth,
 321                               proc_modex_addr->link_speed_mbps) / 1000;
 322 
 323     /* Returned metric is:
 324      *    0 - same VLAN
 325      *    1..MAXINT - relative distance metric
 326      *    -1 - unreachable
 327      */
 328     metric = 0;
 329     memset(&sin, 0, sizeof(sin));
 330     sin.sin_family = AF_INET;
 331     sin.sin_addr.s_addr = proc_modex_addr->ipv4_addr;
 332     err = module->usnic_av_ops->get_distance(module->av, &sin, &metric);
 333     if (0 != err || (0 == err && -1 == metric)) {
 334         return 0; /* no connectivity */
 335     }
 336     else {
 337         /* Format in binary    MSB                             LSB
 338          * most sig. 32-bits:  00000000 0000000A BBBBBBBB 00000001
 339          * least sig. 32-bits: CCCCCCCC CCCCCCCC CCCCCCCC CCCCCCCC
 340          *
 341          * A = 1 iff same subnet
 342          * B = min link speed (in Gbps) between iface pair
 343          * C = metric from routing table
 344          *
 345          * That is, this prioritizes interfaces in the same subnet first,
 346          * followed by having the same link speed.  The extra literal "1" is in
 347          * there to help prioritize over any zero-cost links that might
 348          * otherwise make their way into the graph.  It is not strictly
 349          * necessary and could be eliminated if the extra byte is needed.
 350          *
 351          * TODO add an MCA parameter to optionally swap the offsets of A and
 352          * B, thereby prioritizing link speed over same subnet reachability.
 353          */
 354         /* FIXME how can we check that the metric is the same before we have
 355          * communication with this host?  Mismatched metrics could cause the
 356          * remote peer to make a different pairing decision... */
 357         if (min_link_speed_gbps > 0xff) {
 358             opal_output_verbose(20, USNIC_OUT, "clamping min_link_speed_gbps=%u to 255",
 359                                 min_link_speed_gbps);
 360             min_link_speed_gbps = 0xff;
 361         }
 362         return ((uint64_t)(mynet == peernet) << 48) |
 363                ((uint64_t)(min_link_speed_gbps & 0xff) << 40) |
 364                ((uint64_t)0x1 << 32) |
 365                (/*metric=*/0);
 366     }
 367 }
 368 
 369 /* Populate the given proc's match table from an array of (u,v) edge pairs.
 370  *
 371  * (DJG: this unfortunately knows a bit too much about the internals of
 372  * "match_modex")
 373  */
 374 static void edge_pairs_to_match_table(
 375     opal_btl_usnic_proc_t *proc,
 376     bool proc_is_left,
 377     int nme,
 378     int *me)
 379 {
 380     int i;
 381     int left, right;
 382     int module_idx, proc_idx;
 383     int num_modules;
 384 
 385     num_modules = (int)mca_btl_usnic_component.num_modules;
 386 
 387     assert(nme >= 0);
 388     for (i = 0; i < nme; ++i) {
 389         left  = me[2*i+0];
 390         right = me[2*i+1];
 391 
 392         if (proc_is_left) {
 393             proc_idx = PROC_INDEX(left);
 394             module_idx = MODULE_INDEX(right);
 395         } else {
 396             module_idx = MODULE_INDEX(left);
 397             proc_idx = PROC_INDEX(right);
 398         }
 399         assert(module_idx >= 0 && module_idx < num_modules);
 400         assert(proc_idx >= 0 && proc_idx < (int)proc->proc_modex_count);
 401         proc->proc_ep_match_table[module_idx] = proc_idx;
 402         proc->proc_match_exists = true;
 403     }
 404 
 405     /* emit match summary for debugging purposes */
 406     for (i = 0; i < num_modules; ++i) {
 407         if (-1 != proc->proc_ep_match_table[i]) {
 408             opal_output_verbose(5, USNIC_OUT,
 409                                 "btl:usnic:%s: module[%d] (%p) should claim endpoint[%d] on proc %p",
 410                                 __func__, i,
 411                                 (void *)mca_btl_usnic_component.usnic_active_modules[i],
 412                                 proc->proc_ep_match_table[i], (void *)proc);
 413         } else {
 414             opal_output_verbose(5, USNIC_OUT,
 415                                 "btl:usnic:%s: module[%d] (%p) will NOT claim an endpoint on proc %p",
 416                                 __func__, i,
 417                                 (void *)mca_btl_usnic_component.usnic_active_modules[i],
 418                                 (void *)proc);
 419         }
 420     }
 421 }
 422 
 423 /**
 424  * Constructs an interface graph from all local modules and the given proc's
 425  * remote interfaces.  The resulting vertices will always have the module
 426  * vertices appear before the proc vertices.
 427  */
 428 static int create_proc_module_graph(
 429     opal_btl_usnic_proc_t *proc,
 430     bool proc_is_left,
 431     opal_bp_graph_t **g_out)
 432 {
 433     int err;
 434     int i, j;
 435     int u, v;
 436     int num_modules;
 437     opal_bp_graph_t *g = NULL;
 438 
 439     if (NULL == g_out) {
 440         return OPAL_ERR_BAD_PARAM;
 441     }
 442     *g_out = NULL;
 443 
 444     num_modules = (int)mca_btl_usnic_component.num_modules;
 445 
 446     /* Construct a bipartite graph with remote interfaces on the one side and
 447      * local interfaces (modules) on the other. */
 448     err = opal_bp_graph_create(NULL, NULL, &g);
 449     if (OPAL_SUCCESS != err) {
 450         OPAL_ERROR_LOG(err);
 451         goto out;
 452     }
 453 
 454     /* create vertices for each interface (local and remote) */
 455     for (i = 0; i < num_modules; ++i) {
 456         int idx = -1;
 457         err = opal_bp_graph_add_vertex(g,
 458                                        mca_btl_usnic_component.usnic_active_modules[i],
 459                                        &idx);
 460         if (OPAL_SUCCESS != err) {
 461             OPAL_ERROR_LOG(err);
 462             goto out_free_graph;
 463         }
 464         assert(idx == MODULE_VERTEX(i));
 465     }
 466     for (i = 0; i < (int)proc->proc_modex_count; ++i) {
 467         int idx = -1;
 468         err = opal_bp_graph_add_vertex(g, &proc->proc_modex[i], &idx);
 469         if (OPAL_SUCCESS != err) {
 470             OPAL_ERROR_LOG(err);
 471             goto out_free_graph;
 472         }
 473         assert(idx == (int)PROC_VERTEX(i));
 474     }
 475 
 476     /* now add edges between interfaces that can communicate */
 477     for (i = 0; i < num_modules; ++i) {
 478         for (j = 0; j < (int)proc->proc_modex_count; ++j) {
 479             int64_t weight, cost;
 480 
 481             /* assumption: compute_weight returns the same weight on the
 482              * remote process with these arguments (effectively) transposed */
 483             weight = compute_weight(mca_btl_usnic_component.usnic_active_modules[i],
 484                                     &proc->proc_modex[j]);
 485 
 486             opal_output_verbose(20, USNIC_OUT,
 487                                 "btl:usnic:%s: weight=0x%016" PRIx64 " for edge module[%d] (%p) <--> endpoint[%d] on proc %p",
 488                                 __func__,
 489                                 weight, i,
 490                                 (void *)mca_btl_usnic_component.usnic_active_modules[i],
 491                                 j, (void *)proc);
 492 
 493             if (WEIGHT_UNREACHABLE == weight) {
 494                 continue;
 495             } else {
 496                 /* the graph code optimizes for minimum *cost*, but we have
 497                  * been computing weights (negative costs) */
 498                 cost = -weight;
 499             }
 500             assert(INT64_MAX != cost);
 501             assert(INT64_MIN != cost);
 502 
 503             if (proc_is_left) {
 504                 u = PROC_VERTEX(j);
 505                 v = MODULE_VERTEX(i);
 506             } else {
 507                 u = MODULE_VERTEX(i);
 508                 v = PROC_VERTEX(j);
 509             }
 510             opal_output_verbose(20, USNIC_OUT,
 511                                 "btl:usnic:%s: adding edge (%d,%d) with cost=%" PRIi64 " for edge module[%d] <--> endpoint[%d]",
 512                                 __func__, u, v, cost, i, j);
 513             err = opal_bp_graph_add_edge(g, u, v, cost,
 514                                          /*capacity=*/1,
 515                                          /*e_data=*/NULL);
 516             if (OPAL_SUCCESS != err) {
 517                 OPAL_ERROR_LOG(err);
 518                 goto out_free_graph;
 519             }
 520         }
 521     }
 522 
 523     *g_out = g;
 524     return OPAL_SUCCESS;
 525 
 526 out_free_graph:
 527     opal_bp_graph_free(g);
 528 out:
 529     return err;
 530 }
 531 
 532 /*
 533  * For a specific module, see if this proc has matching address/modex
 534  * info.  If so, create an endpoint and return it.
 535  *
 536  * Implementation note: This code relies on the order of modules on a local
 537  * side matching the order of the modex entries that we send around, otherwise
 538  * both sides may not agree on a bidirectional connection.  It also assumes
 539  * that add_procs will be invoked on the local modules in that same order, for
 540  * the same reason.  If those assumptions do not hold, we will need to
 541  * canonicalize this match ordering somehow, probably by (jobid,vpid) pair or
 542  * by the interface MAC or IP address.
 543  */
 544 static int match_modex(opal_btl_usnic_module_t *module,
 545                        opal_btl_usnic_proc_t *proc,
 546                        int *index_out)
 547 {
 548     int err = OPAL_SUCCESS;
 549     size_t i;
 550     uint32_t num_modules;
 551     opal_bp_graph_t *g = NULL;
 552     bool proc_is_left;
 553 
 554     if (NULL == index_out) {
 555         return OPAL_ERR_BAD_PARAM;
 556     }
 557     *index_out = -1;
 558 
 559     num_modules = mca_btl_usnic_component.num_modules;
 560 
 561     opal_output_verbose(20, USNIC_OUT, "btl:usnic:%s: module=%p proc=%p with dimensions %d x %d",
 562                         __func__, (void *)module, (void *)proc,
 563                         num_modules, (int)proc->proc_modex_count);
 564 
 565     /* We compute an interface match-up table once for each (module,proc) pair
 566      * and cache it in the proc.  Store per-proc instead of per-module, since
 567      * MPI dynamic process routines can add procs but not new modules. */
 568     if (NULL == proc->proc_ep_match_table) {
 569         proc->proc_ep_match_table = malloc(num_modules *
 570                                        sizeof(*proc->proc_ep_match_table));
 571         if (NULL == proc->proc_ep_match_table) {
 572             OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
 573             return OPAL_ERR_OUT_OF_RESOURCE;
 574         }
 575 
 576         /* initialize to "no matches" */
 577         for (i = 0; i < num_modules; ++i) {
 578             proc->proc_ep_match_table[i] = -1;
 579         }
 580 
 581         /* For graphs where all edges are equal (and even for some other
 582          * graphs), two peers making matching calculations with "mirror image"
 583          * graphs might not end up with the same matching.  Ensure that both
 584          * sides are always setting up the exact same graph by always putting
 585          * the process with the lower (jobid,vpid) on the "left".
 586          */
 587 #if 0
 588         proc_is_left = (proc->proc_opal->proc_name <
 589                         opal_proc_local_get()->proc_name);
 590 #else
 591         proc_is_left =
 592             usnic_compat_proc_name_compare(proc->proc_opal->proc_name,
 593                                            opal_proc_local_get()->proc_name);
 594 #endif
 595 
 596         err = create_proc_module_graph(proc, proc_is_left, &g);
 597         if (OPAL_SUCCESS != err) {
 598             goto out_free_table;
 599         }
 600 
 601         int nme = 0;
 602         int *me = NULL;
 603         err = opal_bp_graph_solve_bipartite_assignment(g, &nme, &me);
 604         if (OPAL_SUCCESS != err) {
 605             OPAL_ERROR_LOG(err);
 606             goto out_free_graph;
 607         }
 608 
 609         edge_pairs_to_match_table(proc, proc_is_left, nme, me);
 610         free(me);
 611 
 612         err = opal_bp_graph_free(g);
 613         if (OPAL_SUCCESS != err) {
 614             OPAL_ERROR_LOG(err);
 615             return err;
 616         }
 617     }
 618 
 619 
 620     if (!proc->proc_match_exists) {
 621         opal_output_verbose(5, USNIC_OUT, "btl:usnic:%s: unable to find any valid interface pairs for proc %s",
 622                             __func__,
 623                             usnic_compat_proc_name_print(&proc->proc_opal->proc_name));
 624         return OPAL_ERR_NOT_FOUND;
 625     }
 626 
 627     /* assuming no strange failure cases, this should always be present */
 628     if (NULL != proc->proc_ep_match_table && proc->proc_match_exists) {
 629         for (i = 0; i < num_modules; ++i) {
 630             if (module == mca_btl_usnic_component.usnic_active_modules[i]) {
 631                 *index_out = proc->proc_ep_match_table[i];
 632                 break;
 633             }
 634         }
 635     }
 636 
 637     /* If MTU does not match, throw an error */
 638     /* TODO with UDP, do we still want to enforce this restriction or just take
 639      * the min of the two MTUs?  Another choice is to disqualify this pairing
 640      * before running the matching algorithm on it. */
 641     if (*index_out >= 0 &&
 642         proc->proc_modex[*index_out].max_msg_size !=
 643         (uint16_t) module->fabric_info->ep_attr->max_msg_size) {
 644         opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
 645                        true,
 646                        opal_process_info.nodename,
 647                        module->linux_device_name,
 648                        module->fabric_info->ep_attr->max_msg_size,
 649                        (NULL == proc->proc_opal->proc_hostname) ?
 650                        "unknown" : proc->proc_opal->proc_hostname,
 651                        proc->proc_modex[*index_out].max_msg_size);
 652         *index_out = -1;
 653         return OPAL_ERR_UNREACH;
 654     }
 655 
 656     return (*index_out == -1 ? OPAL_ERR_NOT_FOUND : OPAL_SUCCESS);
 657 
 658 out_free_graph:
 659     opal_bp_graph_free(g);
 660 out_free_table:
 661     free(proc->proc_ep_match_table);
 662     proc->proc_ep_match_table = NULL;
 663     proc->proc_match_exists = false;
 664     return err;
 665 }
 666 
 667 /*
 668  * Initiate the process to create a USD dest.
 669  * It will be polled for completion later.
 670  */
 671 static int start_av_insert(opal_btl_usnic_module_t *module,
 672                                   opal_btl_usnic_endpoint_t *endpoint,
 673                                   int channel)
 674 {
 675     int ret;
 676     opal_btl_usnic_modex_t *modex = &endpoint->endpoint_remote_modex;
 677     opal_btl_usnic_addr_context_t *context;
 678     struct sockaddr_in sin;
 679 
 680     context = calloc(1, sizeof(*context));
 681     context->endpoint = endpoint;
 682     context->channel_id = channel;
 683 
 684     char str[IPV4STRADDRLEN];
 685     opal_btl_usnic_snprintf_ipv4_addr(str, sizeof(str), modex->ipv4_addr,
 686                                       modex->netmask);
 687     opal_output_verbose(5, USNIC_OUT,
 688                         "btl:usnic:start_av_insert: to channel %d at %s:%d",
 689                         channel, str, modex->ports[channel]);
 690 
 691     /* build remote address */
 692     memset(&sin, 0, sizeof(sin));
 693     sin.sin_family = AF_INET;
 694     sin.sin_port = htons(modex->ports[channel]);
 695     sin.sin_addr.s_addr = modex->ipv4_addr;
 696 
 697     ret = fi_av_insert(module->av, &sin, 1,
 698             &endpoint->endpoint_remote_addrs[channel], 0, context);
 699     /* Did an error occur? */
 700     if (0 != ret) {
 701         opal_show_help("help-mpi-btl-usnic.txt", "libfabric API failed",
 702                        true,
 703                        opal_process_info.nodename,
 704                        module->linux_device_name,
 705                        "fi_av_insert()", __FILE__, __LINE__,
 706                        ret,
 707                        "Failed to initiate AV insert");
 708         free(context);
 709         return OPAL_ERROR;
 710     }
 711 
 712     return OPAL_SUCCESS;
 713 }
 714 
 715 /*
 716  * Create an endpoint and claim the matched modex slot
 717  */
 718 int
 719 opal_btl_usnic_create_endpoint(opal_btl_usnic_module_t *module,
 720                 opal_btl_usnic_proc_t *proc,
 721                 opal_btl_usnic_endpoint_t **endpoint_o)
 722 {
 723     int rc;
 724     int modex_index;
 725     opal_btl_usnic_endpoint_t *endpoint;
 726 
 727     /* look for matching modex info */
 728     rc = match_modex(module, proc, &modex_index);
 729     if (OPAL_SUCCESS != rc) {
 730         opal_output_verbose(5, USNIC_OUT,
 731                             "btl:usnic:create_endpoint: did not match usnic modex info for peer %s",
 732                             usnic_compat_proc_name_print(&proc->proc_opal->proc_name));
 733         return rc;
 734     }
 735 
 736     endpoint = OBJ_NEW(opal_btl_usnic_endpoint_t);
 737     if (NULL == endpoint) {
 738         return OPAL_ERR_OUT_OF_RESOURCE;
 739     }
 740 
 741     /* Initalize the endpoint */
 742     endpoint->endpoint_module = module;
 743     assert(modex_index >= 0 && modex_index < (int)proc->proc_modex_count);
 744     endpoint->endpoint_remote_modex = proc->proc_modex[modex_index];
 745     endpoint->endpoint_send_credits = module->sd_num;
 746 
 747     /* Start creating destinations; one for each channel.  These
 748        progress in the background.a */
 749     for (int i = 0; i < USNIC_NUM_CHANNELS; ++i)  {
 750         rc = start_av_insert(module, endpoint, i);
 751         if (OPAL_SUCCESS != rc) {
 752             OBJ_RELEASE(endpoint);
 753             return rc;
 754         }
 755     }
 756 
 757     /* Initialize endpoint sequence number info */
 758     endpoint->endpoint_next_seq_to_send = module->local_modex.isn;
 759     endpoint->endpoint_ack_seq_rcvd = endpoint->endpoint_next_seq_to_send - 1;
 760     endpoint->endpoint_next_contig_seq_to_recv =
 761         endpoint->endpoint_remote_modex.isn;
 762     endpoint->endpoint_highest_seq_rcvd =
 763         endpoint->endpoint_next_contig_seq_to_recv - 1;
 764     endpoint->endpoint_rfstart = WINDOW_SIZE_MOD(endpoint->endpoint_next_contig_seq_to_recv);
 765 
 766     /* Now claim that modex slot */
 767     proc->proc_modex_claimed[modex_index] = true;
 768     MSGDEBUG1_OUT("create_endpoint: module=%p claimed endpoint=%p on proc=%p (hash=0x%" PRIx64 ")\n",
 769                   (void *)module, (void *)endpoint, (void *)proc,
 770                   proc->proc_opal->proc_name);
 771 
 772     /* Save the endpoint on this proc's array of endpoints */
 773     proc->proc_endpoints[proc->proc_endpoint_count] = endpoint;
 774     endpoint->endpoint_proc_index = proc->proc_endpoint_count;
 775     endpoint->endpoint_proc = proc;
 776     ++proc->proc_endpoint_count;
 777     OBJ_RETAIN(proc);
 778 
 779     /* also add endpoint to module's list of endpoints (done here and
 780        not in the endpoint constructor because we aren't able to pass
 781        the module as a constructor argument -- doh!). */
 782     opal_mutex_lock(&module->all_endpoints_lock);
 783     opal_list_append(&(module->all_endpoints),
 784             &(endpoint->endpoint_endpoint_li));
 785     endpoint->endpoint_on_all_endpoints = true;
 786     opal_mutex_unlock(&module->all_endpoints_lock);
 787 
 788     *endpoint_o = endpoint;
 789     return OPAL_SUCCESS;
 790 }
 791 
 792 /*
 793  * If we haven't done so already, receive the modex info for the
 794  * specified opal_proc.  Search that proc's modex info; if we can find
 795  * matching address info, then create an endpoint.
 796  *
 797  * If we don't find a match, it's not an error: just return "not
 798  * found".
 799  *
 800  * This routine transfers ownership of an object reference to the caller, who
 801  * is eventually responsible for transferring or releasing that reference.
 802  *
 803  * There is a one-to-one correspondence between a opal_proc_t and a
 804  * opal_btl_usnic_proc_t instance.  We cache additional data on the
 805  * opal_btl_usnic_proc_t: specifically, the list of
 806  * opal_btl_usnic_endpoint_t instances, and published addresses/modex
 807  * info.
 808  */
 809 int opal_btl_usnic_proc_match(opal_proc_t *opal_proc,
 810                               opal_btl_usnic_module_t *module,
 811                               opal_btl_usnic_proc_t **proc)
 812 {
 813     /* Check if we have already created a proc structure for this peer
 814        ompi process */
 815     *proc = opal_btl_usnic_proc_lookup_ompi(opal_proc);
 816     if (*proc != NULL) {
 817         OBJ_RETAIN(*proc);
 818         return OPAL_SUCCESS;
 819     } else {
 820         /* If not, go make one */
 821         return create_proc(opal_proc, proc);
 822     }
 823 }

/* [<][>][^][v][top][bottom][index][help] */