root/opal/mca/btl/usnic/btl_usnic_module.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. channel_addr2str
  2. add_procs_block_create_endpoints
  3. add_procs_warn_unreachable
  4. add_procs_block_reap_fi_av_inserts
  5. add_procs_create_endpoints
  6. usnic_add_procs
  7. usnic_del_procs
  8. usnic_register_pml_err_cb
  9. usnic_alloc
  10. usnic_free
  11. pack_chunk_seg_from_frag
  12. usnic_finalize
  13. get_send_credits
  14. usnic_do_resends
  15. usnic_handle_large_send
  16. opal_btl_usnic_module_progress_sends
  17. usnic_send
  18. usnic_sendi
  19. usnic_reg_mr
  20. usnic_dereg_mr
  21. module_async_event_callback
  22. create_ep
  23. finalize_one_channel
  24. init_one_channel
  25. get_initial_seq_no
  26. init_module_globals
  27. init_local_modex_part1
  28. init_find_transport_header_len
  29. init_queue_lengths
  30. init_payload_lengths
  31. init_pml_values
  32. init_senders
  33. init_connectivity_checker
  34. init_hwloc
  35. init_procs
  36. init_mpool
  37. init_channels
  38. init_local_modex_part2
  39. init_async_event
  40. init_random_objects
  41. init_freelists
  42. opal_btl_usnic_module_init
  43. usnic_ft_event

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2008 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2006      Sandia National Laboratories. All rights
  14  *                         reserved.
  15  * Copyright (c) 2009-2019 Cisco Systems, Inc.  All rights reserved
  16  * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
  17  *                         reserved.
  18  * Copyright (c) 2014      Intel, Inc. All rights reserved
  19  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
  20  * $COPYRIGHT$
  21  *
  22  * Additional copyrights may follow
  23  *
  24  * $HEADER$
  25  */
  26 
  27 #include "opal_config.h"
  28 
  29 #include <errno.h>
  30 #include <string.h>
  31 #include <unistd.h>
  32 #include <stdlib.h>
  33 #include <time.h>
  34 
  35 #include "opal_stdint.h"
  36 #include "opal/class/opal_bitmap.h"
  37 #include "opal/prefetch.h"
  38 #include "opal/util/output.h"
  39 #include "opal/datatype/opal_convertor.h"
  40 #include "opal/util/show_help.h"
  41 #include "opal/util/printf.h"
  42 #include "opal/mca/memchecker/base/base.h"
  43 
  44 #include "opal/mca/btl/btl.h"
  45 #include "opal/mca/btl/base/btl_base_error.h"
  46 #include "opal/mca/mpool/base/base.h"
  47 #include "opal/mca/mpool/mpool.h"
  48 #include "opal/mca/rcache/base/base.h"
  49 #include "opal/mca/rcache/rcache.h"
  50 
  51 #include "btl_usnic_compat.h"
  52 #include "btl_usnic.h"
  53 #include "btl_usnic_connectivity.h"
  54 #include "btl_usnic_frag.h"
  55 #include "btl_usnic_proc.h"
  56 #include "btl_usnic_endpoint.h"
  57 #include "btl_usnic_module.h"
  58 #include "btl_usnic_util.h"
  59 #include "btl_usnic_send.h"
  60 #include "btl_usnic_ack.h"
  61 #include "btl_usnic_hwloc.h"
  62 #include "btl_usnic_stats.h"
  63 
  64 static void finalize_one_channel(opal_btl_usnic_module_t *module,
  65                                  struct opal_btl_usnic_channel_t *channel);
  66 
  67 static int channel_addr2str(opal_btl_usnic_module_t *module, int channel,
  68                             char *str, size_t len_param)
  69 {
  70     size_t len;
  71 
  72     len = len_param;
  73     fi_av_straddr(module->av, module->mod_channels[channel].info->src_addr,
  74                   str, &len);
  75     if (len > len_param) {
  76         opal_show_help("help-mpi-btl-usnic.txt",
  77                        "libfabric API failed",
  78                        true,
  79                        opal_process_info.nodename,
  80                        module->linux_device_name,
  81                        "fi_av_straddr", __FILE__, __LINE__,
  82                        FI_ENODATA,
  83                        "Failed to convert address to string: buffer too short");
  84 
  85         return OPAL_ERR_OUT_OF_RESOURCE;
  86     }
  87 
  88     return OPAL_SUCCESS;
  89 }
  90 
  91 
  92 /*
  93  * Loop over a block of procs sent to us in add_procs and see if we
  94  * want to add a proc/endpoint for them.
  95  */
  96 static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
  97                                             size_t block_offset,
  98                                             size_t block_len,
  99                                             opal_proc_t **procs,
 100                                             mca_btl_base_endpoint_t **endpoints)
 101 {
 102     int rc;
 103     opal_proc_t* my_proc;
 104     size_t num_created = 0;
 105 
 106     /* get pointer to my proc structure */
 107     my_proc = opal_proc_local_get();
 108     if (NULL == my_proc) {
 109         return OPAL_ERR_OUT_OF_RESOURCE;
 110     }
 111 
 112     /* Loop over a block in the procs we were given */
 113     for (size_t i = block_offset; i < (block_offset + block_len); i++) {
 114         struct opal_proc_t* opal_proc = procs[i];
 115         opal_btl_usnic_proc_t* usnic_proc;
 116         mca_btl_base_endpoint_t* usnic_endpoint;
 117 
 118         endpoints[i] = NULL;
 119 
 120         /* Do not create loopback usnic connections */
 121         if (opal_proc == my_proc) {
 122             opal_output_verbose(75, USNIC_OUT,
 123                                 "btl:usnic:add_procs:%s: not connecting to self",
 124                                 module->linux_device_name);
 125             continue;
 126         }
 127 
 128         /* usNIC does not support loopback to the same machine */
 129         if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) {
 130             opal_output_verbose(75, USNIC_OUT,
 131                                 "btl:usnic:add_procs:%s: not connecting to %s on same server",
 132                                 module->linux_device_name,
 133                                 usnic_compat_proc_name_print(&opal_proc->proc_name));
 134             continue;
 135         }
 136 
 137         /* Find (or create if it doesn't exist) this peer's proc.
 138            This will receive the modex info for that proc.  Note that
 139            the proc is shared by all usnic modules that are trying
 140            to reach this destination. */
 141         usnic_proc = NULL;
 142         rc = opal_btl_usnic_proc_match(opal_proc, module, &usnic_proc);
 143         if (OPAL_ERR_UNREACH == rc) {
 144             /* If the peer doesn't have usnic modex info, then we just
 145                skip it */
 146             opal_output_verbose(75, USNIC_OUT,
 147                                 "btl:usnic:add_procs:%s: peer %s on %s does not have usnic modex info; skipping",
 148                                 module->linux_device_name,
 149                                 usnic_compat_proc_name_print(&opal_proc->proc_name),
 150                                 opal_get_proc_hostname(opal_proc));
 151             continue;
 152         } else if (OPAL_SUCCESS != rc) {
 153             return OPAL_ERR_OUT_OF_RESOURCE;
 154         }
 155 
 156         /* Create the endpoint for this proc/module combination.  If we cannot
 157          * reach this proc via this module, move on to the next proc. */
 158         usnic_endpoint = NULL;
 159         rc = opal_btl_usnic_create_endpoint(module, usnic_proc,
 160                                             &usnic_endpoint);
 161         if (OPAL_SUCCESS != rc) {
 162             opal_output_verbose(5, USNIC_OUT,
 163                                 "btl:usnic:add_procs:%s: unable to create endpoint to peer %s on %s",
 164                                 module->linux_device_name,
 165                                 usnic_compat_proc_name_print(&opal_proc->proc_name),
 166                                 opal_get_proc_hostname(opal_proc));
 167             OBJ_RELEASE(usnic_proc);
 168             continue;
 169         }
 170 
 171         /* We like this new endpoint; save it */
 172         opal_pointer_array_add(&module->all_procs, usnic_proc);
 173 
 174         char str[IPV4STRADDRLEN];
 175         struct opal_btl_usnic_modex_t *modex =
 176             &usnic_endpoint->endpoint_remote_modex;
 177         opal_btl_usnic_snprintf_ipv4_addr(str, sizeof(str),
 178                                           modex->ipv4_addr,
 179                                           modex->netmask);
 180 
 181         char local_pri_addr[64] = {0};
 182         rc = channel_addr2str(module, USNIC_PRIORITY_CHANNEL,
 183                               local_pri_addr, sizeof(local_pri_addr));
 184         if (OPAL_SUCCESS != rc) {
 185             OBJ_RELEASE(usnic_proc);
 186             continue;
 187         }
 188 
 189         char local_data_addr[64] = {0};
 190         rc = channel_addr2str(module, USNIC_DATA_CHANNEL,
 191                               local_data_addr, sizeof(local_data_addr));
 192         if (OPAL_SUCCESS != rc) {
 193             OBJ_RELEASE(usnic_proc);
 194             continue;
 195         }
 196 
 197         opal_output_verbose(5, USNIC_OUT,
 198                             "btl:usnic:add_procs:%s: new usnic peer endpoint: pri=%s:%d, data=%s:%d (local: pri=%s, data=%s)",
 199                             module->linux_device_name,
 200                             str, modex->ports[USNIC_PRIORITY_CHANNEL],
 201                             str, modex->ports[USNIC_DATA_CHANNEL],
 202                             local_pri_addr,
 203                             local_data_addr);
 204 
 205         endpoints[i] = usnic_endpoint;
 206         ++num_created;
 207     }
 208 
 209     opal_output_verbose(5, USNIC_OUT,
 210                         "btl:usnic: made %" PRIsize_t " endpoints",
 211                         num_created);
 212     return OPAL_SUCCESS;
 213 }
 214 
 215 /*
 216  * Print a warning about how the remote peer was unreachable.
 217  *
 218  * This is a separate helper function simply because it's somewhat
 219  * bulky to put inline.
 220  */
 221 static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
 222                                        opal_btl_usnic_endpoint_t *endpoint)
 223 {
 224     /* Only show the warning if it is enabled */
 225     if (!mca_btl_usnic_component.show_route_failures) {
 226         return;
 227     }
 228 
 229     char remote[IPV4STRADDRLEN];
 230     opal_btl_usnic_snprintf_ipv4_addr(remote, sizeof(remote),
 231                                       endpoint->endpoint_remote_modex.ipv4_addr,
 232                                       endpoint->endpoint_remote_modex.netmask);
 233 
 234     opal_output_verbose(15, USNIC_OUT,
 235                         "btl:usnic: %s (which is %s) couldn't reach peer %s",
 236                         module->linux_device_name,
 237                         module->if_ipv4_addr_str,
 238                         remote);
 239     opal_show_help("help-mpi-btl-usnic.txt", "unreachable peer IP",
 240                    true,
 241                    opal_process_info.nodename,
 242                    module->if_ipv4_addr_str,
 243                    module->linux_device_name,
 244                    opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
 245                    remote);
 246 }
 247 
/*
 * Reap the completions of a block of previously-initiated
 * fi_av_insert() calls (the inserts themselves are started in
 * btl_usnic_proc.c).
 *
 * Blocks on the module's AV event queue until one event has been seen
 * for every fi_av_insert() issued for the non-NULL endpoints in
 * [block_offset, block_offset + block_len).  Endpoints whose
 * addresses failed to resolve (and all endpoints, if a fatal error
 * occurred) are OBJ_RELEASEd and their endpoints[] slot set to NULL.
 *
 * Returns OPAL_SUCCESS if everything was reaped without fatal error,
 * OPAL_ERR_OUT_OF_RESOURCE otherwise.  On catastrophic EQ failures
 * this calls opal_btl_usnic_exit() and does not return.
 */
static int
add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
                                   size_t block_offset,
                                   size_t block_len,
                                   struct mca_btl_base_endpoint_t **endpoints)
{
    int ret = OPAL_SUCCESS;
    int num_left;
    size_t i, channel;
    uint32_t event;
    struct fi_eq_entry entry;
    struct fi_eq_err_entry err_entry;
    bool error_occurred = false;

    /* compute num fi_av_insert completions we are waiting for:
       each live endpoint had one insert per channel */
    num_left = 0;
    for (i = block_offset; i < (block_offset + block_len); ++i) {
        if (NULL != endpoints[i]) {
            num_left += USNIC_NUM_CHANNELS;
        }
    }

    /* Loop polling for fi_av_insert completion (they were
       individually started in btl_usnic_proc.c) */
    while (num_left > 0) {
        opal_btl_usnic_addr_context_t *context;

        /* Blocking read (timeout -1) of the next AV event */
        ret = fi_eq_sread(module->av_eq, &event, &entry, sizeof(entry), -1, 0);

        /* fi_eq_sread() will return ret==sizeof(entry) if there are
           no entries on the error queue and the event read completes
           successfully.  We'll get a non-error event back for every
           fi_av_insert(), even if that insert errors out. */
        if (sizeof(entry) == ret) {
            context = entry.context;
            free(context);
            --num_left;
            ret = 0;
        }

        /* fi_eq_sread() will return -FI_EAVAIL if there's something
           on the error queue.

           Note that if an fi_av_insert() fails, it will *first*
           return an entry on the error queue (i.e., have
           fi_eq_sread() return -FI_EVAIL), and *second* it will
           return an entry on the normal queue.  Meaning: the failed
           fi_av_insert() context will show up twice.  So don't free
           the context (or anything associated with it) here in this
           error case, because the same context will show up in the
           non-error case (above). */
        else if (-FI_EAVAIL == ret) {
            ret = fi_eq_readerr(module->av_eq, &err_entry, 0);
            if (sizeof(err_entry) == ret) {
                context = err_entry.context;

                /* Got some kind of address failure.  This usually means
                   that we couldn't find a route to that peer (e.g., the
                   networking is hosed between us).  So just mark that we
                   can't reach this peer, and print a pretty warning. */
                if (EADDRNOTAVAIL == err_entry.err ||
                     EHOSTUNREACH == err_entry.err) {

                    /* Note that endpoint was passed in a context in
                       USNIC_NUM_CHANNELS fi_av_insert() calls.
                       Meaning: if that address fails to resolve,
                       we'll get USNIC_NUM_CHANNELS error events back
                       with a context containing that endpoint.

                       We therefore only want to print a pretty
                       warning about (and OBJ_RELEASE) that endpoint
                       the *first* time it is reported. */
                    for (i = block_offset; i < (block_offset + block_len); ++i) {
                        if (endpoints[i] == context->endpoint) {
                            add_procs_warn_unreachable(module,
                                                       context->endpoint);
                            OBJ_RELEASE(context->endpoint);
                            endpoints[i] = NULL;
                            break;
                        }
                    }
                    ret = 0;
                }

                /* Got some other kind of error -- give up on this
                   interface. */
                else {
                    opal_show_help("help-mpi-btl-usnic.txt",
                                   "libfabric API failed",
                                   true,
                                   opal_process_info.nodename,
                                   module->linux_device_name,
                                   "async insertion result", __FILE__, __LINE__,
                                   err_entry.err,
                                   "Failed to insert address to AV");
                    ret = OPAL_ERR_OUT_OF_RESOURCE;
                    error_occurred = true;

                    /* we can't break here, need to finish reaping all
                       inserts */
                }

                /* Don't free the context or the endpoint -- events
                   that come in as errors will *also* come in as real
                   events */

            } else {
                /* If we get here, it means fi_eq_readerr() failed
                   badly, which means something has gone tremendously
                   wrong.  Probably the only safe thing to do here is
                   exit. */
                opal_show_help("help-mpi-btl-usnic.txt",
                               "internal error during init",
                               true,
                               opal_process_info.nodename,
                               module->linux_device_name,
                               "fi_eq_readerr()", __FILE__, __LINE__,
                               ret,
                               "Returned != sizeof(err_entry)");
                ret = OPAL_ERR_OUT_OF_RESOURCE;
                error_occurred = true;

                /* Per above, there's really nothing sane left to do
                   but exit */
                opal_btl_usnic_exit(module);
            }
        } else {
            /* If we get here, it means fi_eq_sread() itself returned
               an unexpected value (neither sizeof(entry) nor
               -FI_EAVAIL), which means something has gone
               tremendously wrong.  Given that we're potentially not
               even all the way through MPI_INIT yet, the only sane
               thing to do here is exit. */
            opal_show_help("help-mpi-btl-usnic.txt",
                           "internal error during init",
                           true,
                           opal_process_info.nodename,
                           module->linux_device_name,
                           "fi_eq_sread()", __FILE__, __LINE__,
                           ret,
                           "Returned != (sizeof(entry) or -FI_EAVAIL)");
            ret = OPAL_ERR_OUT_OF_RESOURCE;
            error_occurred = true;

            /* Per above, there's really nothing sane left to do but
               exit */
            opal_btl_usnic_exit(module);
        }
    }

    /* Look through the list:
       - If something went wrong above, free all endpoints.
       - If an otherwise-valid endpoint has no dest, that means we timed
         out trying to resolve it, so just release that endpoint. */
    size_t num_endpoints_created = 0;
    for (i = block_offset; i < (block_offset + block_len); i++) {
        if (NULL != endpoints[i]) {
            bool happy;

            happy = true;
            if (error_occurred) {
                happy = false;
            } else {
                /* An endpoint is only usable if *every* channel
                   resolved to a valid fi_addr */
                for (channel = 0; channel < USNIC_NUM_CHANNELS; ++channel) {
                    if (FI_ADDR_NOTAVAIL ==
                            endpoints[i]->endpoint_remote_addrs[channel]) {
                        happy = false;
                        break;
                    }
                }
            }

            if (happy) {
                ++num_endpoints_created;
            } else {
                OBJ_RELEASE(endpoints[i]);
                endpoints[i] = NULL;
            }
        }
    }

    /* All done */
    opal_output_verbose(5, USNIC_OUT,
                        "btl:usnic: created destinations for %" PRIsize_t
                        " endpoints",
                        num_endpoints_created);
    return ret;
}
 438 
 439 /*
 440  * Create endpoints for the procs we were given in add_procs.
 441  */
 442 static int add_procs_create_endpoints(struct opal_btl_usnic_module_t* module,
 443                                       size_t nprocs,
 444                                       struct opal_proc_t **procs,
 445                                       struct mca_btl_base_endpoint_t** endpoints)
 446 {
 447     /* We need to ensure that we don't overrun the libfabric AV EQ.
 448        Divide up all the peer address resolutions we need to do into a
 449        series of blocks; insert and complete each block before moving
 450        to the next (note: if performance mandates it, we can move to a
 451        sliding window style of AV inserts to get better concurrency of
 452        AV resolution). */
 453 
 454     /* Leave a few empty slots in the AV EQ, just for good measure */
 455     if (module->av_eq_size < 8) {
 456         opal_show_help("help-mpi-btl-usnic.txt", "fi_av_eq too small",
 457                        true,
 458                        opal_process_info.nodename,
 459                        module->av_eq_size,
 460                        8);
 461         return OPAL_ERR_OUT_OF_RESOURCE;
 462     }
 463 
 464     size_t eq_size = module->av_eq_size - 8;
 465     size_t block_len = eq_size;
 466     size_t num_av_inserts = nprocs * USNIC_NUM_CHANNELS;
 467     size_t num_blocks = num_av_inserts / block_len;
 468     if (num_av_inserts % block_len != 0) {
 469         ++num_blocks;
 470     }
 471 
 472     /* Per above, the blocks are expressed in terms of number of AV
 473        inserts.  Convert them to be expressed in terms of number of
 474        procs. */
 475     block_len /= USNIC_NUM_CHANNELS;
 476 
 477     /* Per above, loop over creating the endpoints so that we do not
 478        overrun the libfabric AV EQ. */
 479     int rc;
 480     for (size_t block_offset = 0, block = 0; block < num_blocks;
 481          block_offset += block_len, ++block) {
 482         /* Adjust for the last block */
 483         if (block_len > (nprocs - block_offset)) {
 484             block_len = nprocs - block_offset;
 485         }
 486 
 487         /* First, create endpoints (and procs, if they're not already
 488            created) for the usnic-reachable procs we were given. */
 489         rc = add_procs_block_create_endpoints(module,
 490                                               block_offset, block_len,
 491                                               procs, endpoints);
 492         if (OPAL_SUCCESS != rc) {
 493             return rc;
 494         }
 495 
 496         /* For each endpoint that was created, we initiated the
 497            process to create NUM_CHANNELS fi_addrs.  Go finish all of
 498            those.  This will be the final determination of whether we
 499            can use the endpoint or not because we'll find out if each
 500            endpoint is reachable or not. */
 501         rc = add_procs_block_reap_fi_av_inserts(module,
 502                                                 block_offset, block_len,
 503                                                 endpoints);
 504         if (OPAL_SUCCESS != rc) {
 505             return rc;
 506         }
 507     }
 508 
 509     return OPAL_SUCCESS;
 510 }
 511 
 512 /*
 513  * Add procs to this BTL module, receiving endpoint information from
 514  * the modex.  This is done in 2 phases:
 515  *
 516  * 1. Find (or create) the remote proc, and create the associated
 517  *    endpoint.
 518  * 2. Resolve the address handles for all remote endpoints.
 519  *
 520  * The second part is a separate loop from the first part to allow the
 521  * address lookups to be done in parallel.  This comes at a cost,
 522  * however: we may determine during the 2nd part that we should tear
 523  * down some or all the endpoints that we created in the 1st part.
 524  * For example, fi_av_insert() may fail in a fatal way (i.e., we
 525  * should fail the entire add_procs()), or it may fail for one or more
 526  * peers (i.e., we should just mark those peers as unreachable and not
 527  * add a proc or endpoint for them).
 528  */
 529 static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
 530                              size_t nprocs,
 531                              struct opal_proc_t **procs,
 532                              struct mca_btl_base_endpoint_t** endpoints,
 533                              opal_bitmap_t* reachable)
 534 {
 535     opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*) base_module;
 536     int rc;
 537 
 538     /* Go create the endpoints (including all relevant address
 539        resolution) */
 540     rc = add_procs_create_endpoints(module, nprocs, procs, endpoints);
 541     if (OPAL_SUCCESS != rc) {
 542         goto fail;
 543     }
 544 
 545     /* Find all the endpoints with a complete set of USD destinations
 546        and mark them as reachable */
 547     for (size_t i = 0; NULL != reachable && i < nprocs; ++i) {
 548         if (NULL != endpoints[i]) {
 549             bool happy = true;
 550             for (int channel = 0; channel < USNIC_NUM_CHANNELS; ++channel) {
 551                 if (FI_ADDR_NOTAVAIL ==
 552                         endpoints[i]->endpoint_remote_addrs[channel]) {
 553                     happy = false;
 554                     break;
 555                 }
 556             }
 557 
 558             if (happy) {
 559                 opal_bitmap_set_bit(reachable, i);
 560             }
 561         }
 562     }
 563 
 564     /* This is fairly gross, but we need to output the connectivity
 565        map after add_procs() has been called on all existing usnic
 566        modules.  The only way I can think to do that is to count each
 567        time add_procs() is called, and when we're at a multiple of
 568        component.num_modules (i.e., add_procs() has been called on
 569        each module -- both during MPI_INIT and dynamic process cases),
 570        call the function to output the map. */
 571     static int num_times_add_procs_called = 0;
 572     ++num_times_add_procs_called;
 573     if (0 == (num_times_add_procs_called %
 574               mca_btl_usnic_component.num_modules)) {
 575         opal_btl_usnic_connectivity_map();
 576     }
 577 
 578     return OPAL_SUCCESS;
 579 
 580  fail:
 581     /* If we get here, it means something went terribly wrong.  Scorch
 582        the earth: destroy all endpoints and say that nothing was
 583        reachable. */
 584     for (size_t i = 0; i < nprocs; ++i) {
 585         if (NULL != endpoints[i]) {
 586             OBJ_RELEASE(endpoints[i]);
 587             endpoints[i] = NULL;
 588         }
 589     }
 590 
 591     return rc;
 592 }
 593 
 594 /*
 595  * Delete the proc as reachable from this module.  If there are
 596  * multiple usnic modules in a process, we'll come through here
 597  * multiple times to remove each proc.  The OBJ reference counts
 598  * will make all the details work out.
 599  */
 600 static int usnic_del_procs(struct mca_btl_base_module_t *base_module,
 601                              size_t nprocs,
 602                              struct opal_proc_t **procs,
 603                              struct mca_btl_base_endpoint_t **peers)
 604 {
 605     size_t i, j;
 606     opal_btl_usnic_module_t *module;
 607     opal_btl_usnic_endpoint_t *endpoint;
 608     int index;
 609 
 610     module = (struct opal_btl_usnic_module_t *)base_module;
 611 
 612     for (i = 0; i < nprocs; i++) {
 613         opal_btl_usnic_proc_t* proc =
 614             opal_btl_usnic_proc_lookup_ompi(procs[i]);
 615         if (NULL != proc) {
 616 
 617             /* find endpoint for this module */
 618             for (j = 0; j < proc->proc_endpoint_count; ++j) {
 619                 endpoint = proc->proc_endpoints[j];
 620                 if (NULL != endpoint && endpoint->endpoint_module == module) {
 621 
 622                     /* This call to usnic_del_procs is actually an
 623                      * implicit ACK of every packet we have ever sent
 624                      * ***because it is only ever invoked after an
 625                      * OOB/grpcomm barrier (in MPI_COMM_DISCONNECT and
 626                      * MPI_FINALIZE)***, so call handle_ack (via
 627                      * flush_endpoint) to do all the ACK processing
 628                      * and release all the data that needs
 629                      * releasing. */
 630                     if (!ENDPOINT_DRAINED(endpoint)) {
 631                         opal_btl_usnic_flush_endpoint(endpoint);
 632                     }
 633 
 634                     /* We're all done with this endpoint */
 635                     OBJ_RELEASE(endpoint);
 636 
 637                     break;  /* done once we found match */
 638                 }
 639             }
 640 
 641             /* remove proc from this module, and decrement its refcount */
 642             for (index = 0; index < module->all_procs.size; ++index) {
 643                 if (opal_pointer_array_get_item(&module->all_procs, index) ==
 644                         proc) {
 645                     OBJ_RELEASE(proc);
 646                     opal_pointer_array_set_item(&module->all_procs, index,
 647                             NULL);
 648                     break;
 649                 }
 650             }
 651         }
 652     }
 653 
 654     return OPAL_SUCCESS;
 655 }
 656 
 657 
 658 /*
 659  * Let the PML register a callback function with me
 660  */
 661 static int usnic_register_pml_err_cb(struct mca_btl_base_module_t* btl,
 662                                      mca_btl_base_module_error_cb_fn_t cbfunc)
 663 {
 664     opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) btl;
 665 
 666     module->pml_error_callback = cbfunc;
 667 
 668     return OPAL_SUCCESS;
 669 }
 670 
 671 /**
 672  * Allocate control messages or eager frags if BTL does not have
 673  * INPLACE flag.  To be clear: max it will ever alloc is eager_limit.
 674  * THEREFORE: eager_limit is the max that ALLOC must always be able to
 675  * alloc.
  676  *  --> Contradiction in the btl.h documentation.
 677  */
static mca_btl_base_descriptor_t*
usnic_alloc(struct mca_btl_base_module_t* btl,
              struct mca_btl_base_endpoint_t* endpoint,
              uint8_t order,
              size_t size,
              uint32_t flags)
{
    opal_btl_usnic_send_frag_t *frag;
    opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) btl;
    mca_btl_base_descriptor_t *desc;


    /* small is easy, just allocate a small segment */
    if (OPAL_LIKELY(size <= module->max_frag_payload)) {
        opal_btl_usnic_small_send_frag_t *sfrag;

        sfrag = opal_btl_usnic_small_send_frag_alloc(module);
        if (NULL == sfrag) {
            /* freelist exhausted; caller must retry later */
            return NULL;
        }
        frag = &sfrag->ssf_base;

    /* between MTU and eager limit, we need to allocate a buffer
     * which can hold the data.  We will allocate a
     * large fragment, and attach the buffer to it.
     */
    } else {
        opal_btl_usnic_large_send_frag_t *lfrag;

        /* truncate to eager_limit -- per the btl.h contract,
           eager_limit is the most alloc() ever needs to provide */
        if (OPAL_UNLIKELY(size > module->super.btl_eager_limit)) {
            size = module->super.btl_eager_limit;
        }

        lfrag = opal_btl_usnic_large_send_frag_alloc(module);
        if (OPAL_UNLIKELY(NULL == lfrag)) {
            return NULL;
        }
        frag = &lfrag->lsf_base;

        assert(size > 0);
        lfrag->lsf_buffer = malloc(size);
        if (OPAL_UNLIKELY(NULL == lfrag->lsf_buffer)) {
            /* malloc failed: return the frag to its freelist rather
               than leaking it */
            opal_btl_usnic_frag_return(module, &lfrag->lsf_base.sf_base);
            return NULL;
        }

        /* pointer to buffer for caller */
        frag->sf_base.uf_base.USNIC_SEND_LOCAL[0].seg_addr.pval =
            lfrag->lsf_buffer;

        MSGDEBUG1_OUT("usnic_alloc: packing frag %p on the fly", (void *)frag);
        lfrag->lsf_pack_on_the_fly = true;
    }

#if MSGDEBUG2
        opal_output(0, "usnic_alloc: %s frag=%p, size=%d, flags=0x%x\n",
                (size <= module->max_frag_payload)?"small":"large",
                (void *)frag, (int)size, flags);
#endif

    /* set endpoint */
    frag->sf_endpoint = endpoint;

    /* set up descriptor: a single local segment of the (possibly
       truncated) requested size */
    desc = &frag->sf_base.uf_base;
    desc->des_flags = flags;
    desc->USNIC_SEND_LOCAL[0].seg_len = size;
    desc->USNIC_SEND_LOCAL_COUNT = 1;

    return desc;
}
 750 
 751 
 752 /**
 753  * Return an allocated fragment
 754  *
 755  * Return the send fragment to the appropriate list
 756  */
 757 static int usnic_free(struct mca_btl_base_module_t* btl,
 758                         mca_btl_base_descriptor_t* des)
 759 {
 760     opal_btl_usnic_frag_t* frag = (opal_btl_usnic_frag_t*)des;
 761 
 762 #if MSGDEBUG2
 763     opal_output(0, "usnic_free: %p (%s)\n", (void*)frag,
 764             usnic_frag_type(frag->uf_type));
 765 #endif
 766     /* calling free routine gives us ownership - we need to make sure
 767      * the flag is set for lower layers.
 768      */
 769     frag->uf_base.des_flags |= MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
 770 
 771     opal_btl_usnic_frag_return_cond((struct opal_btl_usnic_module_t *)btl,
 772             frag);
 773 
 774     return OPAL_SUCCESS;
 775 }
 776 
 777 /* Packs data from the given large send frag into single new segment and
 778  * returns a pointer to it.  The packed data comes first from SG[0] (PML
 779  * header) and then second from either SG[1] (if seg_addr is non-NULL) or from
 780  * the convertor contained in the frag.
 781  *
 782  * The frag's bookkeeping data will be updated appropriately. */
static
opal_btl_usnic_chunk_segment_t *
pack_chunk_seg_from_frag(
    struct opal_btl_usnic_module_t* module,
    opal_btl_usnic_large_send_frag_t *lfrag)
{
    opal_btl_usnic_chunk_segment_t *seg;
    uint8_t *copyptr;      /* next write position in the segment payload */
    size_t copylen;
    size_t seg_space;      /* bytes of payload space remaining in seg */
    size_t max_data;

    assert(NULL != lfrag);
    /* never should be attempting to pack if we've already packed everything */
    assert(lfrag->lsf_pack_bytes_left > 0);

    seg = opal_btl_usnic_chunk_segment_alloc(module);
    if (OPAL_UNLIKELY(NULL == seg)) {
        /* TODO look at ways to deal with this case more gracefully, possibly as
         * part of capping the overall BTL memory consumption.  Watch out for
         * possible MPI-layer deadlock. */
        opal_btl_usnic_util_abort("chunk segment allocation error",
                                  __FILE__, __LINE__);
    }

    seg_space = module->max_chunk_payload;
    copyptr = seg->ss_base.us_payload.raw;

    /* Keep copying in as long as we have space, there is data to be copied, and
     * we aren't using a convertor (SG[1] will be NULL if we have a convertor).
     */
    while (seg_space > 0 &&
           lfrag->lsf_pack_bytes_left > 0 &&
           NULL != lfrag->lsf_cur_ptr) {
        /* copy the smaller of (space left in seg, bytes left in this SGE) */
        if (seg_space > lfrag->lsf_bytes_left_in_sge) {
            copylen = lfrag->lsf_bytes_left_in_sge;
        } else {
            copylen = seg_space;
        }

        memcpy(copyptr, lfrag->lsf_cur_ptr, copylen);
        seg_space -= copylen;
        copyptr += copylen;
        lfrag->lsf_bytes_left_in_sge -= copylen;
        lfrag->lsf_pack_bytes_left -= copylen;
        if (lfrag->lsf_bytes_left_in_sge > 0) {
            /* still in the middle of this SGE: just advance the cursor */
            lfrag->lsf_cur_ptr += copylen;
        } else {
            /* SGE exhausted: advance to the next source SGE */
            ++lfrag->lsf_cur_sge;
            lfrag->lsf_cur_ptr =
                lfrag->lsf_des_src[lfrag->lsf_cur_sge].seg_addr.pval;
            lfrag->lsf_bytes_left_in_sge =
                lfrag->lsf_des_src[lfrag->lsf_cur_sge].seg_len;
        }
    }

    if (seg_space > 0 && lfrag->lsf_pack_bytes_left > 0) {
        /* the remaining bytes come from a convertor; pack using it */
        assert(NULL == lfrag->lsf_cur_ptr);
        assert(1 == lfrag->lsf_cur_sge);

        copylen = lfrag->lsf_pack_bytes_left;
        if (copylen > seg_space) {
            copylen = seg_space;
        }
        /* max_data is set to the number of bytes actually packed */
        usnic_convertor_pack_simple(&lfrag->lsf_base.sf_convertor, copyptr,
                                    copylen, &max_data);
        seg_space -= max_data;
        lfrag->lsf_bytes_left_in_sge -= max_data;
        lfrag->lsf_pack_bytes_left -= max_data;
    }

    MSGDEBUG1_OUT("%s: packed seg=%p, frag=%p, payload=%zd\n",
                  __func__, (void *)seg, (void *)lfrag,
                  (module->max_chunk_payload - seg_space));

    assert(lfrag->lsf_cur_sge <= 2);
    assert(seg_space < module->max_chunk_payload); /* must make progress */

    seg->ss_parent_frag = &lfrag->lsf_base;
    seg->ss_len = module->max_chunk_payload - seg_space;

    return seg;
}
 867 
/* Tear down one module: stop event delivery, close channels, destruct
   all lists/freelists, and release libfabric objects.  NOTE(review):
   the teardown sequence below appears order-sensitive (event source
   removed first, libfabric objects closed last) -- preserve ordering
   when modifying. */
static int usnic_finalize(struct mca_btl_base_module_t* btl)
{
    opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*)btl;

    /* stop the async device-event callback before dismantling anything
       it might touch */
    if (module->device_async_event_active) {
        opal_event_del(&(module->device_async_event));
        module->device_async_event_active = false;
    }

    opal_btl_usnic_connectivity_unlisten(module);

    finalize_one_channel(module,
                         &module->mod_channels[USNIC_DATA_CHANNEL]);
    finalize_one_channel(module,
                         &module->mod_channels[USNIC_PRIORITY_CHANNEL]);

    /* Shutdown the stats on this module */
    opal_btl_usnic_stats_finalize(module);

    /* Note that usnic_del_procs will have been called for *all* procs
       by this point, so the module->all_endpoints list will be empty.
       Destruct it. */
    opal_mutex_lock(&module->all_endpoints_lock);
    OBJ_DESTRUCT(&(module->all_endpoints));
    module->all_endpoints_constructed = false;
    opal_mutex_unlock(&module->all_endpoints_lock);

    /* _flush_endpoint should have emptied this list */
    assert(opal_list_is_empty(&(module->pending_resend_segs)));
    OBJ_DESTRUCT(&module->pending_resend_segs);

    /* Similarly, empty the endpoints_that_need_acks list so that
       endpoints don't still have an endpoint_ack_li item still in
       use */
    while (!opal_list_is_empty(&(module->endpoints_that_need_acks))) {
        (void) opal_list_remove_first(&(module->endpoints_that_need_acks));
    }
    OBJ_DESTRUCT(&module->endpoints_that_need_acks);

    /* Note that usnic_del_procs will have been called for *all* procs
       by this point, so the module->all_procs list will be empty.
       Destruct it. */
    OBJ_DESTRUCT(&module->all_procs);

    /* destruct each per-pool receive buffer freelist */
    for (int i = module->first_pool; i <= module->last_pool; ++i) {
        OBJ_DESTRUCT(&module->module_recv_buffers[i]);
    }
    free(module->module_recv_buffers);

    OBJ_DESTRUCT(&module->ack_segs);
    OBJ_DESTRUCT(&module->endpoints_with_sends);
    OBJ_DESTRUCT(&module->small_send_frags);
    OBJ_DESTRUCT(&module->large_send_frags);
    OBJ_DESTRUCT(&module->put_dest_frags);
    OBJ_DESTRUCT(&module->chunk_segs);
    OBJ_DESTRUCT(&module->senders);

    mca_rcache_base_module_destroy(module->rcache);

    /* close libfabric objects; av/av_eq/dom_eq may legitimately be NULL */
    if (NULL != module->av) {
        fi_close(&module->av->fid);
    }
    if (NULL != module->av_eq) {
        fi_close(&module->av_eq->fid);
    }
    if (NULL != module->dom_eq) {
        fi_close(&module->dom_eq->fid);
    }
    fi_close(&module->domain->fid);
    fi_close(&module->fabric->fid);

    free(module->linux_device_name);

    return OPAL_SUCCESS;
}
 943 
 944 static inline unsigned
 945 get_send_credits(struct opal_btl_usnic_channel_t *chan)
 946 {
 947     return chan->credits;
 948 }
 949 
/* Retransmit as many pending segments as send credits allow.
   Called from the progress path while btl_usnic_lock is held by the
   caller (see opal_btl_usnic_module_progress_sends). */
static void
usnic_do_resends(
    opal_btl_usnic_module_t *module)
{
    opal_btl_usnic_send_segment_t *sseg;
    opal_btl_usnic_endpoint_t *endpoint;
    struct opal_btl_usnic_channel_t *data_channel;
    int ret;

    data_channel = &module->mod_channels[USNIC_DATA_CHANNEL];

    /* keep at least one credit in reserve (> 1 test) while draining
       the pending-resend list */
    while ((get_send_credits(data_channel) > 1) &&
           !opal_list_is_empty(&module->pending_resend_segs)) {

        /*
         * If a segment is on the re-send list, it will not
         * be in the retransmit hotel.  Post the segment, then check it in.
         */
        sseg = (opal_btl_usnic_send_segment_t *)
            opal_list_remove_first(&module->pending_resend_segs);
        endpoint = sseg->ss_parent_frag->sf_endpoint;

        /* clobber any stale piggy-backed ACK */
        sseg->ss_base.us_btl_header->ack_present = 0;

        /* Only post this segment if not already posted */
        if (sseg->ss_send_posted == 0) {

            /* resends are always standard segments */
            sseg->ss_channel = USNIC_DATA_CHANNEL;

            /* re-send the segment (we have a send credit available) */
            opal_btl_usnic_post_segment(module, endpoint, sseg);

            /* consume a send credit for this endpoint.  May send us
             * negative, oh well...  This is because the completion routine
             * always increments send credits, and we must balance.
             * Alternative is to mark this as a retrans segment and check in
             * completion, but this ugly way avoids extra checks in the
             * critical path.  And, really, respects the concept of send
             * credits more.
             */
            --endpoint->endpoint_send_credits;
            ++module->stats.num_resends;
        }

        /* restart the retrans timer */
        ret = opal_hotel_checkin(&endpoint->endpoint_hotel,
                sseg, &sseg->ss_hotel_room);
        if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
            opal_btl_usnic_util_abort("hotel checkin failed\n", __FILE__, __LINE__);
        }
    }
}
1004 
1005 /* Given a large send frag (which is at the head of the given endpoint's send
1006  * queue), generate a new segment, fill it with data, and
1007  * endpoint_send_segment() it.  Takes care of subsequent frag
1008  * cleanup/bookkeeping (dequeue, descriptor callback, etc.) if this frag was
1009  * completed by this segment.
1010  *
1011  * ASSUMES THAT THE CALLER HAS ALREADY CHECKED TO SEE IF WE HAVE
1012  * A SEND CREDIT!
1013  */
static void
usnic_handle_large_send(
    opal_btl_usnic_module_t *module,
    opal_btl_usnic_endpoint_t *endpoint,
    opal_btl_usnic_send_frag_t *frag)
{
    opal_btl_usnic_large_send_frag_t *lfrag;
    opal_btl_usnic_btl_chunk_header_t *chp;
    opal_btl_usnic_send_segment_t *sseg;
    size_t payload_len;

    assert(frag->sf_base.uf_type == OPAL_BTL_USNIC_FRAG_LARGE_SEND);
    lfrag = (opal_btl_usnic_large_send_frag_t *)frag;
    if (lfrag->lsf_cur_offset == 0) {
        /* first chunk of this frag: assign a (nonzero) fragment ID so the
           receiver can reassemble chunks belonging to the same frag */
        do {
            lfrag->lsf_frag_id = endpoint->endpoint_next_frag_id++;
        } while (lfrag->lsf_frag_id == 0);
    }

    if (lfrag->lsf_pack_on_the_fly) {
        assert(opal_list_is_empty(&lfrag->lsf_seg_chain));

        /* just pack a single chunk segment and put it on the list */
        sseg = pack_chunk_seg_from_frag(module, lfrag);
    } else {
        /* data was pre-packed in prepare_src */
        sseg = (opal_btl_usnic_send_segment_t *)
            opal_list_remove_first(&lfrag->lsf_seg_chain);
    }

    assert(NULL != sseg);
    payload_len = sseg->ss_len;

    assert(payload_len > 0); /* must have made progress */
    assert(payload_len <= module->max_chunk_payload);
    assert(lfrag->lsf_bytes_left >= payload_len);

    /* set actual packet length (chunk header + payload) */
    sseg->ss_len = sizeof(opal_btl_usnic_btl_chunk_header_t) + payload_len;
    lfrag->lsf_bytes_left -= payload_len;

    /* fill in the chunk's BTL header with frag info */
    chp = sseg->ss_base.us_btl_chunk_header;
    chp->ch_frag_id = lfrag->lsf_frag_id;
    chp->ch_frag_size = lfrag->lsf_base.sf_size;
    chp->ch_frag_offset = lfrag->lsf_cur_offset;
    chp->ch_hdr.tag = lfrag->lsf_tag;

    /* payload length into the header*/
    sseg->ss_base.us_btl_header->payload_len = payload_len;

    // We assume that the caller has checked to see that we have a
    // send credit, so do the send.
    opal_btl_usnic_endpoint_send_segment(module, sseg);

    /* do fragment bookkeeping */
    lfrag->lsf_cur_offset += payload_len;

#if MSGDEBUG1
    opal_output(0, "%s: payload_len=%zd, bytes_left=%zd on_the_fly=%s\n",
                __func__, payload_len, lfrag->lsf_bytes_left,
                lfrag->lsf_pack_on_the_fly?"true":"false");
#endif
    /* done with fragment? */
    if (lfrag->lsf_bytes_left == 0) {

        /* remove this frag from sending list now because upper layer may
         * decide to put it on some other list in the callback
         */
        opal_list_remove_item(&endpoint->endpoint_frag_send_queue,
                &frag->sf_base.uf_base.super.super);

        /* only callback now if this was not a PUT and we own the fragment,
         * otherwise we need to wait until last byte is ACKed
         */
        if (frag->sf_base.uf_remote_seg[0].seg_addr.pval == NULL &&
                (frag->sf_base.uf_base.des_flags &
                 MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {

            OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "large");
        }
    }
}
1098 
1099 /*
1100  * Progress the send engine.
1101  * Should only ever be called from usnic_component_progress() to
1102  * avoid re-entrancy issues.
1103  */
void
opal_btl_usnic_module_progress_sends(
    opal_btl_usnic_module_t *module)
{
    opal_btl_usnic_send_frag_t *frag;
    opal_btl_usnic_send_segment_t *sseg;
    opal_btl_usnic_endpoint_t *endpoint;
    struct opal_btl_usnic_channel_t *data_channel;
    struct opal_btl_usnic_channel_t *prio_channel;

    /*
     * Post all the sends that we can
     * resends 1st priority
     * ACKs 2nd priority
     * new sends 3rd
     */
    data_channel = &module->mod_channels[USNIC_DATA_CHANNEL];
    prio_channel = &module->mod_channels[USNIC_PRIORITY_CHANNEL];

    /*
     * Handle all the retransmits we can
     */
    OPAL_THREAD_LOCK(&btl_usnic_lock);
    if (OPAL_UNLIKELY(!opal_list_is_empty(&module->pending_resend_segs))) {
        usnic_do_resends(module);
    }

    /*
     * Keep sending as long as there are WQEs and something to do
     * (the > 1 test keeps one credit in reserve)
     */
    while ((get_send_credits(data_channel) > 1) &&
           !opal_list_is_empty(&module->endpoints_with_sends)) {
        opal_btl_usnic_small_send_frag_t *sfrag;
        size_t payload_len;

        /*
         * Grab the first endpoint with a pending send.  Presence on this
         * list means there is a fragment with data ready to go and
         * the endpoint's send window is open, and the endpoint has send
         * credits.
         */

        endpoint = (opal_btl_usnic_endpoint_t *)
            opal_list_get_first(&module->endpoints_with_sends);
        frag = (opal_btl_usnic_send_frag_t *)
            opal_list_get_first(&endpoint->endpoint_frag_send_queue);

        /*
         * small send?  (fragment fits in one segment)
         * Send ptr and length will be in uf_local_seg[0]
         */
        if (frag->sf_base.uf_type == OPAL_BTL_USNIC_FRAG_SMALL_SEND) {

            /* remove this frag from sending list now because upper layer may
             * decide to put it on some other list in the callback
             */
            opal_list_remove_item(&endpoint->endpoint_frag_send_queue,
                    &frag->sf_base.uf_base.super.super);

            sfrag = (opal_btl_usnic_small_send_frag_t *)frag;
            sseg = &sfrag->ssf_segment;

            /* get payload len from segment */
            payload_len = sfrag->ssf_base.sf_size;
            sseg->ss_base.us_btl_header->payload_len = payload_len;

#if MSGDEBUG1
            opal_output(0, "progress send small, frag=%p, ptr=%p, payload=%zd, len=%"PRIu32", ep=%p, tag=%d\n",
                    (void *)frag,
                    (void *)sseg->ss_ptr, payload_len,
                    sseg->ss_len,
                    (void *)frag->sf_endpoint,
                    sseg->ss_base.us_btl_header->tag);
#endif

            /* post the send (we have a send credit available) */
            opal_btl_usnic_endpoint_send_segment(module, sseg);

            /* don't do callback yet if this is a put */
            if (frag->sf_base.uf_remote_seg[0].seg_addr.pval == NULL) {
                /* we have copied the data, perform a callback if
                 * we own the fragment and callback is requested.
                 * If we don't own the fragment, we cannot callback yet
                 * because we are not done with the segment inside.
                 * (ACK not received yet)
                 */
                if ((frag->sf_base.uf_base.des_flags &
                        (MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
                         MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) ==
                        (MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
                         MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
                    OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "small");
                }
            }

        /* Large sends... */
        } else {
            usnic_handle_large_send(module, endpoint, frag);
        }

        /* If no more sends or endpoint send window is closed,
         * or no more send credits, remove from send list
         */
        if (opal_list_is_empty(&endpoint->endpoint_frag_send_queue) ||
            endpoint->endpoint_send_credits <= 0 ||
            !WINDOW_OPEN(endpoint)) {

            opal_list_remove_item(&module->endpoints_with_sends,
                    &endpoint->super);
            endpoint->endpoint_ready_to_send = false;
        }
    }

    /*
     * Handle any ACKs that need to be sent
     */
    endpoint = opal_btl_usnic_get_first_endpoint_needing_ack(module);
    while (get_send_credits(prio_channel) > 1 && endpoint != NULL) {
        opal_btl_usnic_endpoint_t *next_endpoint;

        /* get next in list (before possibly removing this endpoint) */
        next_endpoint = opal_btl_usnic_get_next_endpoint_needing_ack(endpoint);

        /* Is it time to send ACK? */
        if (endpoint->endpoint_acktime == 0 ||
            endpoint->endpoint_acktime <= get_nsec()) {
            if (OPAL_LIKELY(opal_btl_usnic_ack_send(module, endpoint) == OPAL_SUCCESS)) {
                opal_btl_usnic_remove_from_endpoints_needing_ack(endpoint);
            } else {
                // If we fail, it means we're out of send credits on
                // the ACK channel
                break;
            }
        }

        endpoint = next_endpoint;
    }
    OPAL_THREAD_UNLOCK(&btl_usnic_lock);
}
1243 
1244 /*
1245  *  Initiate a send.
1246  *
1247  *  Send completion callbacks can be done from a few different places.
1248  *
1249  *  If this is a send from a fragment we do not own, we always have
1250  *  to wait for the last ACK of the fragment, because we cannot allow
1251  *  the fragment to be re-used until we know we have no more retransmits to do.
1252  *
1253  *  If this is a send from a fragment we own, and we know we have copied the
1254  *  data from the user's buffer, we can perform the callback immediately
1255  *  (or possibly not at all, simply returning "1" to indicate completion).
1256  *
1257  *  If this is a send from a fragment we own and we have not yet copied out
1258  *  all the data (as is the case in a large send) then we defer the callback
1259  *  until the last of the data has been copied out by routines called
1260  *  from opal_btl_usnic_progress_sends()
1261  */
static int
usnic_send(
    struct mca_btl_base_module_t* base_module,
    struct mca_btl_base_endpoint_t* base_endpoint,
    struct mca_btl_base_descriptor_t* descriptor,
    mca_btl_base_tag_t tag)
{
    int rc;
    opal_btl_usnic_send_frag_t *frag;
    opal_btl_usnic_small_send_frag_t *sfrag;
    opal_btl_usnic_endpoint_t *endpoint;
    opal_btl_usnic_module_t *module;
    opal_btl_usnic_send_segment_t *sseg;

    OPAL_THREAD_LOCK(&btl_usnic_lock);
    endpoint = (opal_btl_usnic_endpoint_t *)base_endpoint;
    module = (opal_btl_usnic_module_t *)base_module;
    frag = (opal_btl_usnic_send_frag_t*) descriptor;

    assert(frag->sf_endpoint == endpoint);
    frag->sf_base.uf_remote_seg[0].seg_addr.pval = NULL;      /* not a PUT */

    /* compute total frag size; all of it must be ACKed before the frag
       can be reclaimed */
    opal_btl_usnic_compute_sf_size(frag);
    frag->sf_ack_bytes_left = frag->sf_size;

#if MSGDEBUG2
    opal_output(0, "usnic_send: frag=%p, endpoint=%p, tag=%d, sf_size=%d\n",
            (void *)frag, (void *)endpoint,
            tag, (int)frag->sf_size);
#if MSGDEBUG1
    { unsigned i;
        opal_output(0, "  descriptor->des_flags=0x%x\n", descriptor->des_flags);
        for (i=0; i<descriptor->USNIC_SEND_LOCAL_COUNT; ++i) {
            opal_output(0, "  %d: ptr:%p len:%d\n", i,
                    descriptor->USNIC_SEND_LOCAL[i].seg_addr.pval,
                    descriptor->USNIC_SEND_LOCAL[i].seg_len);
        }
    }
#endif
#endif

    /*
     * If this fragment is small enough to inline,
     * and we have enough send WQEs,
     * then inline and fastpath it
     */
    if (frag->sf_base.uf_type == OPAL_BTL_USNIC_FRAG_SMALL_SEND &&
            frag->sf_ack_bytes_left < module->max_tiny_payload &&
            WINDOW_OPEN(endpoint) &&
            (get_send_credits(&module->mod_channels[USNIC_DATA_CHANNEL]) >=
             module->mod_channels[USNIC_DATA_CHANNEL].fastsend_wqe_thresh)) {
        size_t payload_len;

        sfrag = (opal_btl_usnic_small_send_frag_t *)frag;
        sseg = &sfrag->ssf_segment;

        payload_len = frag->sf_ack_bytes_left;
        sseg->ss_base.us_btl_header->payload_len = payload_len;


        /* copy the 2nd SGE into the segment (coalesce into one
           contiguous local segment for the inline send) */
        if (frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT > 1) {
            memcpy(((char *)(intptr_t)frag->sf_base.uf_local_seg[0].seg_addr.lval +
                     frag->sf_base.uf_local_seg[0].seg_len),
                    frag->sf_base.uf_local_seg[1].seg_addr.pval,
                    frag->sf_base.uf_local_seg[1].seg_len);

            /* update 1st segment length */
            frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 1;
            frag->sf_base.uf_local_seg[0].seg_len +=
                frag->sf_base.uf_local_seg[1].seg_len;
        }

        /* assign length (BTL header + payload) */
        sseg->ss_len = sizeof(opal_btl_usnic_btl_header_t) + frag->sf_size;

        sseg->ss_channel = USNIC_DATA_CHANNEL;
        sseg->ss_base.us_btl_header->tag = tag;
#if MSGDEBUG1
        opal_output(0, "INLINE send, sseg=%p", (void *)sseg);
#endif

        /* post the segment now (we have a send credit available) */
        opal_btl_usnic_endpoint_send_segment(module, sseg);

        /* If we own the frag and callback was requested, callback now,
         * else just return 1 to show completion.
         * If we don't own the frag, need to wait for ACK before
         * performing callback on the frag
         */
        if (descriptor->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) {
            if (descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
                OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "immediate small");
                rc = 0;
            } else {
#if MSGDEBUG1
                opal_output(0, "skipping callback for frag %p, returning 1\n", (void *)frag);
#endif
                rc = 1;
                ++module->stats.pml_send_callbacks;   /* returning "1" is an implicit CB */
            }
        } else {
#if MSGDEBUG1
            opal_output(0, "don't own descriptor, defer callback for frag %p\n", (void *)frag);
#endif
            descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
            rc = 0;
        }
    } else {
        /*
         * We move this off to another function because having it inside
         * this function seems to add a little latency, likely due to inlines
         * making the function too big.  In fact, the routine had to go to
         * another file entirely, else the compiler tried to be helpful
         * and inline all by itself.
         */
        rc = opal_btl_usnic_finish_put_or_send(module, endpoint, frag, tag);
        /* FIXME can we clarify flag set/clear ordering? */
        frag->sf_base.uf_base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    }

    ++module->stats.pml_module_sends;

    OPAL_THREAD_UNLOCK(&btl_usnic_lock);
    return rc;
}
1388 
#if 0
/*
 * Initiate an immediate send
 *
 * NOTE: compiled out -- the usnic BTL does not currently implement
 * sendi(); this stub is kept for reference only.
 */
static int usnic_sendi(struct mca_btl_base_module_t* btl,
                         struct mca_btl_base_endpoint_t* endpoint,
                         struct opal_convertor_t* convertor,
                         void* header,
                         size_t header_size,
                         size_t payload_size,
                         uint8_t order,
                         uint32_t flags,
                         mca_btl_base_tag_t tag,
                         mca_btl_base_descriptor_t** descriptor)
{
    /* JMS write me */
    return OPAL_ERROR;
}
#endif
1408 
1409 
1410 /*
1411  * RDMA Memory Pool (de)register callbacks
1412  */
1413 static int usnic_reg_mr(void* reg_data, void* base, size_t size,
1414                         mca_rcache_base_registration_t* reg)
1415 {
1416     opal_btl_usnic_module_t* mod = (opal_btl_usnic_module_t*)reg_data;
1417     opal_btl_usnic_reg_t* ur = (opal_btl_usnic_reg_t*)reg;
1418     int rc;
1419 
1420     rc = fi_mr_reg(mod->domain, base, size, 0, 0, 0, 0, &ur->ur_mr, NULL);
1421     if (0 != rc) {
1422         return OPAL_ERR_OUT_OF_RESOURCE;
1423     }
1424 
1425     return OPAL_SUCCESS;
1426 }
1427 
1428 static int usnic_dereg_mr(void* reg_data,
1429                           mca_rcache_base_registration_t* reg)
1430 {
1431     opal_btl_usnic_reg_t* ur = (opal_btl_usnic_reg_t*)reg;
1432 
1433     if (ur->ur_mr != NULL) {
1434         if (0 != fi_close(&ur->ur_mr->fid)) {
1435             opal_output(0, "%s: error unpinning USD memory mr=%p: %s\n",
1436                         __func__, (void*) ur->ur_mr, strerror(errno));
1437             return OPAL_ERROR;
1438         }
1439     }
1440 
1441     ur->ur_mr = NULL;
1442     return OPAL_SUCCESS;
1443 }
1444 
1445 
1446 /*
1447  * Called back by libevent if an async event occurs on the device
1448  */
static void module_async_event_callback(int fd, short flags, void *arg)
{
    char *str = NULL;
    bool fatal = false;
    opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) arg;
    uint32_t event;
    struct fi_eq_entry entry;

    /* Get the async event (non-blocking read on the domain EQ) */
    int ret = fi_eq_read(module->dom_eq, &event, &entry, sizeof(entry), 0);
    if (-FI_EAGAIN == ret) {
        /* Nothing to see here... */
        return;
    }

    else if (ret != 0) {
        /* failed to read the event at all: treat as fatal */
        opal_show_help("help-mpi-btl-usnic.txt", "libfabric API failed",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_eq_read()", __FILE__, __LINE__,
                       ret,
                       "Failed to get domain event");
        fatal = true;
    }

    /* NOTE(review): "42" is a placeholder for FI_LINKSTATE (see RFXXX
       marker); confirm against the libfabric version in use */
    else if (event == 42 /* RFXXX FI_LINKSTATE */) {
        opal_memchecker_base_mem_defined(&event, sizeof(event));
        opal_memchecker_base_mem_defined(&entry, sizeof(entry));
        switch (entry.data) {
            case 0: // USD_EVENT_LINK_UP:
            /* This event should never happen, because OMPI will
               ignore ports that are down, and we should only get this
               event if a port *was* down and is now *up*.  But if we
               ever do get it, it should be a harmless event -- just
               ignore it. */
            opal_output_verbose(10, USNIC_OUT,
                                "btl:usnic: got LINK_UP on %s",
                                module->linux_device_name);
            break;

            case 1: // USD_EVENT_LINK_DOWN:
            str = "link down";
            /* Fall through */

        default:
            if (NULL == str) {
                str = "unknown event";
            }

            /* For the moment, these are the only other cases libfabric
               will report to us.  However, they're only listed here
               for completeness.  We currently abort if any async
               event other than LINK_UP occurs. */
            opal_show_help("help-mpi-btl-usnic.txt", "async event",
                           true,
                           opal_process_info.nodename,
                           module->linux_device_name,
                           str, entry.data);
            fatal = true;
        }
    }

    /* If this is fatal, invoke the upper layer error handler to abort
       the job */
    if (fatal) {
        opal_btl_usnic_exit(module);
        /* Does not return */
    }
}
1519 
/*
 * Create a single libfabric endpoint for one channel:
 *   1. fi_getinfo() with our desired queue sizes and (optionally) a
 *      specific UDP source port,
 *   2. fi_endpoint() to create the EP,
 *   3. bind the channel CQ (send+recv) and the module AV to the EP,
 *   4. fi_enable() and (if needed) fi_getname() to learn the actual
 *      UDP port assigned.
 *
 * Returns OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE on any failure
 * (a show_help message is emitted before returning).
 */
static int create_ep(opal_btl_usnic_module_t* module,
                            struct opal_btl_usnic_channel_t *channel)
{
    int rc;
    struct sockaddr_in *sin;
    size_t addrlen;
    struct fi_info *hint;

    /* Start from a copy of the module's fi_info so we can tweak it
       without disturbing the original */
    hint = fi_dupinfo(module->fabric_info);
    if (NULL == hint) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_dupinfo() failed", __FILE__, __LINE__,
                       -1, "Unknown");
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Ask for the queue depths this channel was configured with */
    hint->rx_attr->size = channel->chan_rd_num;
    hint->tx_attr->size = channel->chan_sd_num;

    /* specific ports requested?  If udp_port_base is set, each local
       rank claims (base + local rank); otherwise let the provider
       pick (port 0) */
    sin = hint->src_addr;
    if (0 == mca_btl_usnic_component.udp_port_base) {
        sin->sin_port = 0;
    } else {
        sin->sin_port = htons(mca_btl_usnic_component.udp_port_base +
            opal_process_info.my_local_rank);
    }

    rc = fi_getinfo(module->libfabric_api, NULL, 0, 0, hint, &channel->info);
    fi_freeinfo(hint);
    if (0 != rc) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_getinfo() failed", __FILE__, __LINE__,
                       rc, fi_strerror(-rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    if (channel->chan_index != USNIC_PRIORITY_CHANNEL) {
        /* NOTE(review): clears a provider-specific capability bit
           (bit 63) on non-priority channels -- presumably an optional
           feature only wanted on the priority channel; confirm
           against the usnic provider's capability definitions */
        channel->info->caps &= ~(1ULL << 63);
    }

    /* This #if prevents compiler warnings about sa being assigned and
       not used when NDEBUG is defined */
#if !defined(NDEBUG)
    /* all of the OMPI code assumes IPv4, but some versions of libfabric will
     * return FI_SOCKADDR instead of FI_SOCKADDR_IN, so we need to do a little
     * bit of sanity checking */
    assert(FI_SOCKADDR_IN == channel->info->addr_format ||
           FI_SOCKADDR == channel->info->addr_format);
    if (FI_SOCKADDR == channel->info->addr_format) {
        struct sockaddr *sa;
        sa = (struct sockaddr *)channel->info->src_addr;
        assert(AF_INET == sa->sa_family);
    }
#endif

    sin = (struct sockaddr_in *)channel->info->src_addr;
    assert(sizeof(struct sockaddr_in) == channel->info->src_addrlen);

    /* no matter the version of libfabric, this should hold */
    assert(0 == sin->sin_port);

    rc = fi_endpoint(module->domain, channel->info, &channel->ep, NULL);
    if (0 != rc || NULL == channel->ep) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_endpoint() failed", __FILE__, __LINE__,
                       rc, fi_strerror(-rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Check to ensure that the RX/TX queue lengths are at least as
       long as we asked for */
    if ((int) channel->info->rx_attr->size < channel->chan_rd_num) {
        rc = FI_ETOOSMALL;
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "endpoint RX queue length is too short", __FILE__, __LINE__,
                       rc, fi_strerror(rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    if ((int) channel->info->tx_attr->size < channel->chan_sd_num) {
        rc = FI_ETOOSMALL;
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "endpoint TX queue length is too short", __FILE__, __LINE__,
                       rc, fi_strerror(rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* attach CQ to EP -- the same CQ handles both send and receive
       completions for this channel */
    rc = fi_ep_bind(channel->ep, &channel->cq->fid, FI_SEND);
    if (0 != rc) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_ep_bind() SCQ to EP failed", __FILE__, __LINE__,
                       rc, fi_strerror(-rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    rc = fi_ep_bind(channel->ep, &channel->cq->fid, FI_RECV);
    if (0 != rc) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_ep_bind() RCQ to EP failed", __FILE__, __LINE__,
                       rc, fi_strerror(-rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    /* Bind the module-wide address vector to this EP */
    rc = fi_ep_bind(channel->ep, &module->av->fid, 0);
    if (0 != rc) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_ep_bind() AV to EP failed", __FILE__, __LINE__,
                       rc, fi_strerror(-rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Enable the endpoint */
    rc = fi_enable(channel->ep);
    if (0 != rc) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_enable() failed", __FILE__, __LINE__,
                       rc, fi_strerror(-rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Immediately after libfabric v1.0 was released, we implemented support
     * for fi_getname and changed the behavior of fi_endpoint w.r.t. setting
     * the src_addr field of the fi_info struct passed in.  Before the change
     * fi_endpoint would set the src_addr field, including the sin_port field
     * but calling fi_getname would return -FI_ENOSYS.  Afterwards the address
     * would not be touched relative to whatever was set by fi_getinfo.  So we
     * must call fi_getname in that case.
     */
    if (0 == sin->sin_port) {
        addrlen = sizeof(struct sockaddr_in);
        rc = fi_getname(&channel->ep->fid, channel->info->src_addr, &addrlen);
        if (0 != rc) {
            opal_show_help("help-mpi-btl-usnic.txt",
                           "internal error during init",
                           true,
                           opal_process_info.nodename,
                           module->linux_device_name,
                           "fi_getname() failed", __FILE__, __LINE__,
                           rc, fi_strerror(-rc));
            return OPAL_ERR_OUT_OF_RESOURCE;
        }
        /* By now, one way or another, a real port must be assigned */
        assert(0 != sin->sin_port);
    }

    /* Log the endpoint's channel type and address for debugging */
    char *str;
    if (USNIC_PRIORITY_CHANNEL == channel->chan_index) {
        str = "priority";
    } else if (USNIC_DATA_CHANNEL == channel->chan_index) {
        str = "data";
    } else {
        str = "UNKNOWN";
    }
    opal_output_verbose(15, USNIC_OUT,
                        "btl:usnic:create_ep:%s: new usnic local endpoint channel %s: %s:%d",
                        module->linux_device_name,
                        str,
                        inet_ntoa(sin->sin_addr),
                        ntohs(sin->sin_port));

    return OPAL_SUCCESS;
}
1718 
1719 
1720 /*
1721  * finalize channel - release all associated resources
1722  */
1723 static void finalize_one_channel(opal_btl_usnic_module_t *module,
1724                                  struct opal_btl_usnic_channel_t *channel)
1725 {
1726     if (NULL != channel->ep) {
1727         fi_close(&channel->ep->fid);
1728         channel->ep = NULL;
1729     }
1730 
1731     /* destroy CQ if created */
1732     if (NULL != channel->cq) {
1733         fi_close(&channel->cq->fid);
1734         channel->cq = NULL;
1735     }
1736 
1737     if (NULL != channel->info) {
1738         fi_freeinfo(channel->info);
1739         channel->info = NULL;
1740     }
1741 
1742     /* gets set right after constructor called, lets us know recv_segs
1743      * have been constructed.  Make sure to wait until queues
1744      * destroyed to destroy the recv_segs */
1745     if (channel->recv_segs.ctx == module) {
1746         assert(NULL == channel->ep && NULL == channel->cq);
1747         OBJ_DESTRUCT(&channel->recv_segs);
1748     }
1749 }
1750 
/*
 * Initialize a channel: create its CQ, create/enable its endpoint
 * (via create_ep()), build the free list of receive segments, and
 * pre-post rd_num receive buffers.
 *
 * @param index        channel index (USNIC_PRIORITY_CHANNEL or
 *                     USNIC_DATA_CHANNEL)
 * @param max_msg_size largest message this channel carries
 * @param rd_num       receive queue depth (also # of recv segments)
 * @param sd_num       send queue depth (also the channel's credits)
 * @param cq_num       completion queue depth
 *
 * Returns OPAL_SUCCESS, or OPAL_ERROR after tearing down whatever was
 * partially created (via finalize_one_channel()).
 */
static int init_one_channel(opal_btl_usnic_module_t *module,
                            int index,
                            int max_msg_size,
                            int rd_num,
                            int sd_num,
                            int cq_num)
{
    int i;
    int rc;
    uint32_t segsize;
    opal_btl_usnic_recv_segment_t *rseg;
    opal_free_list_item_t* item;
    struct opal_btl_usnic_channel_t *channel;
    struct fi_cq_attr cq_attr;

    channel = &module->mod_channels[index];
    channel->chan_max_msg_size = max_msg_size;
    channel->chan_rd_num = rd_num;
    channel->chan_sd_num = sd_num;
    channel->chan_index = index;
    channel->chan_deferred_recv = NULL;
    channel->chan_error = false;

    /* Threshold below which the "fastsend" path is not used; leaves a
       small reserve of send WQEs.  NOTE(review): the constant 10
       appears to be an empirical safety margin -- confirm */
    channel->fastsend_wqe_thresh = sd_num - 10;

    channel->credits = sd_num;
    channel->rx_post_cnt = 0;

    /* We did math up in component_init() to know that there should be
       enough CQs available.  So if create_cq() fails, then either the
       memlock limits are too low, or something other than this MPI
       job is consuming CQs. */
    memset(&cq_attr, 0, sizeof(cq_attr));
    cq_attr.format = FI_CQ_FORMAT_CONTEXT;
    cq_attr.wait_obj = FI_WAIT_NONE;
    cq_attr.size = cq_num;
    rc = fi_cq_open(module->domain, &cq_attr, &channel->cq, NULL);
    if (0 != rc) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "failed to create CQ", __FILE__, __LINE__,
                       rc, fi_strerror(-rc));
        goto error;
    }

    /* Ensure that we got a CQ that is at least as long as we asked
       for */
    if ((int) cq_attr.size < cq_num) {
        rc = FI_ETOOSMALL;
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "created CQ is too small", __FILE__, __LINE__,
                       rc, fi_strerror(rc));
        goto error;
    }

    /* Set up the endpoint for this channel */
    rc = create_ep(module, channel);
    if (OPAL_SUCCESS != rc) {
        goto error;
    }

    assert(channel->info->ep_attr->msg_prefix_size ==
           (uint32_t) mca_btl_usnic_component.transport_header_len);

    opal_output_verbose(15, USNIC_OUT,
                        "btl:usnic:init_one_channel:%s: channel %s, rx queue size=%" PRIsize_t ", tx queue size=%" PRIsize_t ", cq size=%" PRIsize_t ", send credits=%d",
                        module->linux_device_name,
                        (index == USNIC_PRIORITY_CHANNEL) ? "priority" : "data",
                        channel->info->rx_attr->size,
                        channel->info->tx_attr->size,
                        cq_attr.size,
                        channel->credits);

    /*
     * Initialize pool of receive segments.  Round MTU up to cache
     * line size so that each segment is guaranteed to start on a
     * cache line boundary.
     */
    segsize = (max_msg_size + channel->info->ep_attr->msg_prefix_size +
            opal_cache_line_size - 1) & ~(opal_cache_line_size - 1);
    OBJ_CONSTRUCT(&channel->recv_segs, opal_free_list_t);
    rc =
        usnic_compat_free_list_init(&channel->recv_segs,
                                    sizeof(opal_btl_usnic_recv_segment_t) /* frag size */,
                                    opal_cache_line_size /* frag alignment */,
                                    OBJ_CLASS(opal_btl_usnic_recv_segment_t),
                                    segsize /* payload buffer size */,
                                    opal_cache_line_size /* payload alignmt */,
                                    rd_num /* num elements to alloc */,
                                    rd_num /* max elements to alloc */,
                                    rd_num /* num elements per alloc */,
                                    module->super.btl_mpool /* mpool for (1.x, 2.0: reg, 2.1+: allocation) */,
                                    0 /* mpool reg flags */,
                                    module->rcache /* registration cache for 2.1+ */,
                                    NULL /* item_init */,
                                    NULL /* item_init_context */);
    channel->recv_segs.ctx = module; /* must come after
                                        free_list_init,
                                        otherwise ctx gets
                                        clobbered */
    if (OPAL_SUCCESS != rc) {
        goto error;
    }

    /* Post receive descriptors */
    for (i = 0; i < rd_num; i++) {
        USNIC_COMPAT_FREE_LIST_GET(&channel->recv_segs, item);
        assert(NULL != item);
        rseg = (opal_btl_usnic_recv_segment_t*)item;

        if (NULL == rseg) {
            opal_show_help("help-mpi-btl-usnic.txt",
                           "internal error during init",
                           true,
                           opal_process_info.nodename,
                           module->linux_device_name,
                           "Failed to get receive buffer from freelist",
                           __FILE__, __LINE__);
            goto error;
        }

        /* cannot find length from constructor, set it now */
        rseg->rs_len = segsize;

        rc = fi_recv(channel->ep, rseg->rs_protocol_header, segsize,
                     NULL, FI_ADDR_UNSPEC, rseg);
        if (0 != rc) {
            opal_show_help("help-mpi-btl-usnic.txt",
                           "internal error during init",
                           true,
                           opal_process_info.nodename,
                           module->linux_device_name,
                           "Failed to post receive buffer",
                           __FILE__, __LINE__);
            goto error;
        }
    }

    return OPAL_SUCCESS;

error:
    finalize_one_channel(module, channel);
    return OPAL_ERROR;
}
1905 
1906 /*
1907  * generate initial send sequence number
1908  */
1909 static opal_btl_usnic_seq_t
1910 get_initial_seq_no(void)
1911 {
1912     opal_btl_usnic_seq_t isn;
1913 
1914     isn = (opal_btl_usnic_seq_t)opal_rand(&opal_btl_usnic_rand_buff);
1915 
1916     return isn;
1917 }
1918 
1919 /*************************************************************************
1920  The following routines are all short utility / convenience functions
1921  for module_init().
1922 *************************************************************************/
1923 
/*
 * Setup some globals on the module
 */
static void init_module_globals(opal_btl_usnic_module_t *module)
{
    /* Construct the mutex guarding the module's all-endpoints list;
       it is destructed during module finalization */
    OBJ_CONSTRUCT(&module->all_endpoints_lock, opal_mutex_t);
}
1931 
1932 
1933 /*
1934  * Initialize our local modex entry from the device attributes
1935  */
1936 static void init_local_modex_part1(opal_btl_usnic_module_t *module)
1937 {
1938     /* Fill (most of) the local address information on the module.  We
1939        don't have the following yet: qp numbers, header length,
1940        connectivity checker UDP port. */
1941     opal_btl_usnic_modex_t *modex = &module->local_modex;
1942     struct fi_info *info = module->fabric_info;
1943     struct fi_usnic_info *uip = &module->usnic_info;
1944     struct sockaddr_in *sin;
1945 
1946     sin = info->src_addr;
1947     modex->ipv4_addr =       sin->sin_addr.s_addr;
1948     modex->netmask =         uip->ui.v1.ui_netmask_be;
1949     modex->max_msg_size =    info->ep_attr->max_msg_size;
1950     modex->link_speed_mbps = uip->ui.v1.ui_link_speed;
1951 
1952     opal_btl_usnic_snprintf_ipv4_addr(module->if_ipv4_addr_str,
1953                                       sizeof(module->if_ipv4_addr_str),
1954                                       modex->ipv4_addr,
1955                                       modex->netmask);
1956 
1957     opal_output_verbose(5, USNIC_OUT,
1958                         "btl:usnic: %s IP charactertics: %s, %u Mbps",
1959                         module->linux_device_name,
1960                         module->if_ipv4_addr_str,
1961                         modex->link_speed_mbps);
1962 }
1963 
1964 /*
1965  * Find the header length for our transport.
1966  *
1967  * Do this super-early in the startup process because we need it to
1968  * calculate some payload lengths (and indirectly, some queue
1969  * lengths).
1970  */
1971 static void init_find_transport_header_len(opal_btl_usnic_module_t *module)
1972 {
1973     mca_btl_usnic_component.transport_header_len =
1974         module->fabric_info->ep_attr->msg_prefix_size;
1975     mca_btl_usnic_component.transport_protocol =
1976         module->fabric_info->ep_attr->protocol;
1977 
1978     /* The usnic provider in libfabric v1.0.0 (i.e., API v1.0) treated
1979        FI_MSG_PREFIX inconsistently between senders and receivers.  It
1980        was corrected in libfabric v1.1.0 (i.e., API v1.1), meaning
1981        that FI_MSG_PREFIX is treated consistently between senders and
1982        receivers.
1983 
1984        So check what version of the libfabric API we have, and setup
1985        to use the "old" (inconsistent) MSG_PREFIX behavior, or the
1986        "new" MSG_PREFIX (consistent) behavior.
1987 
1988        NOTE: This is a little redundant; we're setting a
1989        component-level attribute during each module's setup.  We do
1990        this here (and not earlier, when we check fi_version() during
1991        the component setup) because we can't obtain the value of the
1992        endpoint msg_prefix_size until we setup the first module.
1993        Also, it's safe because each module will set the component
1994        attribute to the same value.  So it's ok. */
1995     uint32_t libfabric_api;
1996     libfabric_api = fi_version();
1997     if (1 == FI_MAJOR(libfabric_api) &&
1998         0 == FI_MINOR(libfabric_api)) {
1999         mca_btl_usnic_component.prefix_send_offset = 0;
2000     } else {
2001         mca_btl_usnic_component.prefix_send_offset =
2002             module->fabric_info->ep_attr->msg_prefix_size;
2003     }
2004 }
2005 
2006 /*
2007  * How many xQ entries do we want?
2008  */
2009 static void init_queue_lengths(opal_btl_usnic_module_t *module)
2010 {
2011     bool cq_is_sum = false;
2012     if (-1 == mca_btl_usnic_component.cq_num) {
2013         cq_is_sum = true;
2014     }
2015 
2016     if (-1 == mca_btl_usnic_component.sd_num) {
2017         module->sd_num = module->fabric_info->tx_attr->size;
2018     } else {
2019         module->sd_num = mca_btl_usnic_component.sd_num;
2020     }
2021     if (-1 == mca_btl_usnic_component.rd_num) {
2022         module->rd_num = module->fabric_info->rx_attr->size;
2023     } else {
2024         module->rd_num = mca_btl_usnic_component.rd_num;
2025     }
2026     if (cq_is_sum) {
2027         module->cq_num = module->rd_num + module->sd_num;
2028     } else {
2029         module->cq_num = mca_btl_usnic_component.cq_num;
2030     }
2031     module->av_eq_num = mca_btl_usnic_component.av_eq_num;
2032 
2033     /*
2034      * Queue sizes for priority channel scale with # of endpoint. A
2035      * little bit of chicken and egg here, we really want procs*ports,
2036      * but we can't know # of ports until we try to initialize, so
2037      * 32*num_procs is best guess.  User can always override.
2038      */
2039 
2040     if (-1 == mca_btl_usnic_component.prio_sd_num) {
2041         module->prio_sd_num = max(128, 32 * USNIC_MCW_SIZE) - 1;
2042     } else {
2043         module->prio_sd_num = mca_btl_usnic_component.prio_sd_num;
2044     }
2045     if (module->prio_sd_num > 0 &&
2046         (unsigned) module->prio_sd_num >
2047          module->fabric_info->tx_attr->size) {
2048         module->prio_sd_num = module->fabric_info->tx_attr->size;
2049     }
2050     if (-1 == mca_btl_usnic_component.prio_rd_num) {
2051         module->prio_rd_num =
2052             max(128, 32 * USNIC_MCW_SIZE) - 1;
2053     } else {
2054         module->prio_rd_num = mca_btl_usnic_component.prio_rd_num;
2055     }
2056     if (module->prio_rd_num > 0 &&
2057         (unsigned) module->prio_rd_num >
2058          module->fabric_info->rx_attr->size) {
2059         module->prio_rd_num = module->fabric_info->rx_attr->size;
2060     }
2061     if (cq_is_sum) {
2062         module->prio_cq_num = module->prio_rd_num + module->prio_sd_num;
2063     } else {
2064         module->prio_cq_num = module->cq_num;
2065     }
2066 }
2067 
2068 static void init_payload_lengths(opal_btl_usnic_module_t *module)
2069 {
2070     /* Find the max payload this port can handle */
2071     module->max_frag_payload =
2072         module->local_modex.max_msg_size - /* start with the MTU */
2073         sizeof(opal_btl_usnic_btl_header_t) - /* subtract size of
2074                                                  the BTL header */
2075         mca_btl_usnic_component.prefix_send_offset;
2076 
2077     /* same, but use chunk header */
2078     module->max_chunk_payload =
2079         module->local_modex.max_msg_size -
2080         sizeof(opal_btl_usnic_btl_chunk_header_t) -
2081         mca_btl_usnic_component.prefix_send_offset;
2082 
2083     /* Priorirty queue MTU and max size */
2084     if (0 == module->max_tiny_msg_size) {
2085         module->max_tiny_msg_size = 768;
2086     }
2087     module->max_tiny_payload = module->max_tiny_msg_size -
2088         sizeof(opal_btl_usnic_btl_header_t);
2089 }
2090 
2091 static void init_pml_values(opal_btl_usnic_module_t *module)
2092 {
2093     module->super.btl_bandwidth = module->local_modex.link_speed_mbps;
2094 
2095     /* If the eager rndv limit is 0, initialize it to default */
2096     if (0 == module->super.btl_rndv_eager_limit) {
2097         module->super.btl_rndv_eager_limit = USNIC_DFLT_RNDV_EAGER_LIMIT;
2098     }
2099 
2100     /* If the eager send limit is 0, initialize it to default */
2101     if (0 == module->super.btl_eager_limit) {
2102         /* 150k for 1 module, 25k for >1 module */
2103         if (1 == mca_btl_usnic_component.num_modules) {
2104             module->super.btl_eager_limit =
2105                 USNIC_DFLT_EAGER_LIMIT_1DEVICE;
2106         } else {
2107             module->super.btl_eager_limit =
2108                 USNIC_DFLT_EAGER_LIMIT_NDEVICES;
2109         }
2110     }
2111 
2112     /* Since we emulate PUT, max_send_size can be same as
2113        eager_limit */
2114     module->super.btl_max_send_size =
2115         module->super.btl_eager_limit;
2116 
2117 #if BTL_VERSION == 30
2118     module->super.btl_put_limit =
2119         module->super.btl_eager_limit;
2120 #endif
2121 }
2122 
/*
 * Construct the per-module hash table of senders.  NOTE(review):
 * presumably used to map incoming packets back to their sending
 * endpoints -- confirm against the receive path.
 */
static void init_senders(opal_btl_usnic_module_t *module)
{
    /* Make a hash table of senders */
    OBJ_CONSTRUCT(&module->senders, opal_hash_table_t);
    /* JMS This is a fixed size -- BAD!  But since hash table
       doesn't grow dynamically, I don't know what size to put
       here.  I think the long-term solution is to write a better
       hash table... :-( */
    opal_hash_table_init(&module->senders, 4096);
}
2133 
2134 static void init_connectivity_checker(opal_btl_usnic_module_t *module)
2135 {
2136     /* Setup a connectivity agent listener */
2137     int rc = opal_btl_usnic_connectivity_listen(module);
2138     if (OPAL_SUCCESS != rc) {
2139         OPAL_ERROR_LOG(rc);
2140         opal_btl_usnic_util_abort("Failed to notify connectivity agent to listen",
2141                                   __FILE__, __LINE__);
2142     }
2143 }
2144 
2145 static void init_hwloc(opal_btl_usnic_module_t *module)
2146 {
2147     /* If this process is bound to a single NUMA locality, calculate
2148        its NUMA distance from this usNIC device */
2149     if (mca_btl_usnic_component.want_numa_device_assignment) {
2150         opal_btl_usnic_hwloc_distance(module);
2151     } else {
2152         opal_output_verbose(5, USNIC_OUT,
2153                             "btl:usnic: not sorting devices by NUMA distance (MCA btl_usnic_want_numa_device_assignment)");
2154     }
2155 }
2156 
/*
 * Construct the pointer array tracking the procs used by this module.
 */
static void init_procs(opal_btl_usnic_module_t *module)
{
    /* Setup the pointer array for the procs that will be used by this
       module.  Initial size = # of procs in the MPI job; grows in
       increments of 32 up to INT_MAX entries. */
    OBJ_CONSTRUCT(&module->all_procs, opal_pointer_array_t);
    opal_pointer_array_init(&module->all_procs, USNIC_MCW_SIZE, INT_MAX, 32);
}
2164 
/*
 * Setup the mpool.
 *
 * Creates the module's registration cache (with our register /
 * deregister callbacks) and looks up the memory pool module.
 * Returns OPAL_SUCCESS or OPAL_ERROR (after a show_help message).
 */
static int init_mpool(opal_btl_usnic_module_t *module)
{
    struct mca_rcache_base_resources_t rcache_resources;

    /* Registration cache: registrations are performed by our own
       usnic_reg_mr/usnic_dereg_mr callbacks, with this module passed
       back as reg_data */
    rcache_resources.reg_data = (void*)module;
    rcache_resources.sizeof_reg = sizeof(opal_btl_usnic_reg_t);
    rcache_resources.register_mem = usnic_reg_mr;
    rcache_resources.deregister_mem = usnic_dereg_mr;
    rcache_resources.cache_name = mca_btl_usnic_component.usnic_rcache_name;
    module->rcache =
        mca_rcache_base_module_create (mca_btl_usnic_component.usnic_rcache_name,
                                       &module->super, &rcache_resources);
    if (NULL == module->rcache) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "create rcache", __FILE__, __LINE__);
        return OPAL_ERROR;
    }
    /* Memory pool selected by the component's mpool hints */
    module->super.btl_mpool =
        mca_mpool_base_module_lookup (mca_btl_usnic_component.usnic_mpool_hints);
    if (NULL == module->super.btl_mpool) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "create mpool", __FILE__, __LINE__);
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}
2203 
/*
 * Create the module-wide libfabric objects (address vector, AV event
 * queue, domain event queue) and then initialize the priority and
 * data channels.
 *
 * Returns OPAL_SUCCESS, or the failing call's return code after
 * tearing down both channels.
 *
 * NOTE(review): the fi_* calls return libfabric error codes
 * (0/negative), which are compared against OPAL_SUCCESS (0).  Failure
 * detection is correct since both schemes use 0 for success, but on
 * error the raw libfabric code is propagated as if it were an OPAL
 * code -- confirm callers only test for != OPAL_SUCCESS.
 */
static int init_channels(opal_btl_usnic_module_t *module)
{
    int rc;
    struct fi_av_attr av_attr;
    struct fi_eq_attr eq_attr;

    /* Zero both channel structs so finalize_one_channel() is safe to
       call on them even if we fail partway through */
    memset(&module->mod_channels[0], 0,
           sizeof(module->mod_channels[0]));
    memset(&module->mod_channels[1], 0,
           sizeof(module->mod_channels[1]));

    /* Address vector, with asynchronous (FI_EVENT) insert completion */
    memset(&av_attr, 0, sizeof(av_attr));
    av_attr.type = FI_AV_MAP;
    av_attr.flags = FI_EVENT;
    rc = fi_av_open(module->domain, &av_attr, &module->av, NULL);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }

    /* usnic-specific AV extension ops */
    rc = fi_open_ops(&module->av->fid, FI_USNIC_AV_OPS_1, 0,
            (void **)&module->usnic_av_ops, NULL);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }

    /* EQ that reports AV insert completions */
    memset(&eq_attr, 0, sizeof(eq_attr));
    eq_attr.size = module->av_eq_num;
    eq_attr.wait_obj = FI_WAIT_UNSPEC;
    rc = fi_eq_open(module->fabric, &eq_attr, &module->av_eq, NULL);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }
    // Save the size of the created EQ
    module->av_eq_size = eq_attr.size;

    /* Domain EQ; FI_WAIT_FD so we can hook its fd into the event loop
       (see init_async_event()) */
    eq_attr.wait_obj = FI_WAIT_FD;
    rc = fi_eq_open(module->fabric, &eq_attr, &module->dom_eq, NULL);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }

    rc = fi_av_bind(module->av, &module->av_eq->fid, 0);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }

    rc = fi_domain_bind(module->domain, &module->dom_eq->fid, 0);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }

    /* initialize data and priority channels */
    rc = init_one_channel(module,
            USNIC_PRIORITY_CHANNEL,
            module->max_tiny_msg_size,
            module->prio_rd_num, module->prio_sd_num, module->prio_cq_num);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }
    rc = init_one_channel(module,
            USNIC_DATA_CHANNEL,
            module->fabric_info->ep_attr->max_msg_size,
            module->rd_num, module->sd_num, module->cq_num);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }

    return OPAL_SUCCESS;

 destroy:
    finalize_one_channel(module,
                         &module->mod_channels[USNIC_DATA_CHANNEL]);
    finalize_one_channel(module,
                         &module->mod_channels[USNIC_PRIORITY_CHANNEL]);

    return rc;
}
2281 
2282 /* Fill in the UDP ports of the channel QPs, and fill in the wire
2283    header length */
2284 static void init_local_modex_part2(opal_btl_usnic_module_t *module)
2285 {
2286     module->local_modex.isn = get_initial_seq_no();
2287 
2288     /* Place EP number in our local modex information */
2289     for (int id = 0; id < USNIC_NUM_CHANNELS; ++id) {
2290         opal_btl_usnic_channel_t *channel = &module->mod_channels[id];
2291         struct sockaddr_in *sin;
2292         sin = channel->info->src_addr;
2293         module->local_modex.ports[id] = ntohs(sin->sin_port);
2294         module->local_modex.protocol = channel->info->ep_attr->protocol;
2295     }
2296 }
2297 
2298 static void init_async_event(opal_btl_usnic_module_t *module)
2299 {
2300     int fd;
2301     int ret;
2302 
2303     ret = fi_control(&module->dom_eq->fid, FI_GETWAIT, &fd);
2304     if (ret != 0) {
2305         opal_show_help("help-mpi-btl-usnic.txt",
2306                     "libfabric API failed",
2307                    true,
2308                    opal_process_info.nodename,
2309                    module->linux_device_name,
2310                    "fi_control(eq, FI_GETWAIT)", __FILE__, __LINE__,
2311                    ret,
2312                    fi_strerror(-ret));
2313         return;
2314     }
2315 
2316     /* Get the fd to receive events on this device.  Keep this in the
2317        sync event base (not the async event base) */
2318     opal_event_set(opal_sync_event_base, &(module->device_async_event), fd,
2319                    OPAL_EV_READ | OPAL_EV_PERSIST,
2320                    module_async_event_callback, module);
2321     opal_event_add(&(module->device_async_event), NULL);
2322     module->device_async_event_active = true;
2323 }
2324 
2325 static void init_random_objects(opal_btl_usnic_module_t *module)
2326 {
2327     /* list of all endpoints */
2328     opal_mutex_lock(&module->all_endpoints_lock);
2329     OBJ_CONSTRUCT(&(module->all_endpoints), opal_list_t);
2330     module->all_endpoints_constructed = true;
2331     opal_mutex_unlock(&module->all_endpoints_lock);
2332 
2333     /* Pending send segs list */
2334     OBJ_CONSTRUCT(&module->pending_resend_segs, opal_list_t);
2335     OBJ_CONSTRUCT(&module->endpoints_that_need_acks, opal_list_t);
2336 
2337     /* list of endpoints that are ready to send */
2338     OBJ_CONSTRUCT(&module->endpoints_with_sends, opal_list_t);
2339 }
2340 
/* Construct all of the module's free lists: send fragments (small,
   large, put-destination), chunk segments, ACK segments, and the
   per-size pools of large receive buffers.
   NOTE(review): rc is only checked via assert(), so under NDEBUG a
   free-list init failure is silently ignored. */
static void init_freelists(opal_btl_usnic_module_t *module)
{
    int rc __opal_attribute_unused__;
    uint32_t segsize;

    /* Payload buffer size for registered send segments: the max
       message size rounded up to a cache-line multiple */
    segsize = (module->local_modex.max_msg_size +
           opal_cache_line_size - 1) &
        ~(opal_cache_line_size - 1);

    /* Send frags freelists */
    /* Small send frags carry a registered payload buffer of segsize
       bytes, allocated from the module's mpool/rcache */
    OBJ_CONSTRUCT(&module->small_send_frags, opal_free_list_t);
    rc = usnic_compat_free_list_init(&module->small_send_frags,
                             sizeof(opal_btl_usnic_small_send_frag_t) +
                                 mca_btl_usnic_component.prefix_send_offset,
                             opal_cache_line_size,
                             OBJ_CLASS(opal_btl_usnic_small_send_frag_t),
                             segsize,
                             opal_cache_line_size,
                             module->sd_num * 4,
                             -1,
                             module->sd_num / 2,
                             module->super.btl_mpool,
                             0 /* mpool reg flags */,
                             module->rcache,
                             NULL /* item_init */,
                             NULL /* item_init_context */);
    assert(OPAL_SUCCESS == rc);

    /* Large send frags have no registered payload of their own
       (payload size/align 0, no mpool) */
    OBJ_CONSTRUCT(&module->large_send_frags, opal_free_list_t);
    rc = usnic_compat_free_list_init(&module->large_send_frags,
                             sizeof(opal_btl_usnic_large_send_frag_t) +
                                 mca_btl_usnic_component.prefix_send_offset,
                             opal_cache_line_size,
                             OBJ_CLASS(opal_btl_usnic_large_send_frag_t),
                             0,  /* payload size */
                             0,  /* payload align */
                             module->sd_num / 8,
                             -1,
                             module->sd_num / 8,
                             NULL,
                             0 /* mpool reg flags */,
                             NULL /* unused0 */,
                             NULL /* item_init */,
                             NULL /* item_init_context */);
    assert(OPAL_SUCCESS == rc);

    /* Put-destination frags likewise carry no payload buffer */
    OBJ_CONSTRUCT(&module->put_dest_frags, opal_free_list_t);
    rc = usnic_compat_free_list_init(&module->put_dest_frags,
                             sizeof(opal_btl_usnic_put_dest_frag_t) +
                                 mca_btl_usnic_component.prefix_send_offset,
                             opal_cache_line_size,
                             OBJ_CLASS(opal_btl_usnic_put_dest_frag_t),
                             0,  /* payload size */
                             0,  /* payload align */
                             module->sd_num / 8,
                             -1,
                             module->sd_num / 8,
                             NULL,
                             0 /* mpool reg flags */,
                             NULL /* unused0 */,
                             NULL /* item_init */,
                             NULL /* item_init_context */);
    assert(OPAL_SUCCESS == rc);

    /* list of segments to use for sending */
    /* Chunk segments carry registered payload buffers (segsize bytes,
       from the mpool/rcache), same sizing as small send frags */
    OBJ_CONSTRUCT(&module->chunk_segs, opal_free_list_t);
    rc = usnic_compat_free_list_init(&module->chunk_segs,
                             sizeof(opal_btl_usnic_chunk_segment_t) +
                                 mca_btl_usnic_component.prefix_send_offset,
                             opal_cache_line_size,
                             OBJ_CLASS(opal_btl_usnic_chunk_segment_t),
                             segsize,
                             opal_cache_line_size,
                             module->sd_num * 4,
                             -1,
                             module->sd_num / 2,
                             module->super.btl_mpool,
                             0 /* mpool reg flags */,
                             module->rcache,
                             NULL /* item_init */,
                             NULL /* item_init_context */);
    assert(OPAL_SUCCESS == rc);

    /* ACK segments freelist */
    uint32_t ack_segment_len;
    /* ACKs carry only a BTL header; round the buffer up to a
       cache-line multiple */
    ack_segment_len = (sizeof(opal_btl_usnic_btl_header_t) +
            opal_cache_line_size - 1) & ~(opal_cache_line_size - 1);
    OBJ_CONSTRUCT(&module->ack_segs, opal_free_list_t);
    rc = usnic_compat_free_list_init(&module->ack_segs,
                             sizeof(opal_btl_usnic_ack_segment_t) +
                                 mca_btl_usnic_component.prefix_send_offset,
                             opal_cache_line_size,
                             OBJ_CLASS(opal_btl_usnic_ack_segment_t),
                             ack_segment_len,
                             opal_cache_line_size,
                             module->sd_num * 4,
                             -1,
                             module->sd_num / 2,
                             module->super.btl_mpool,
                             0 /* mpool reg flags */,
                             module->rcache,
                             NULL /* item_init */,
                             NULL /* item_init_context */);
    assert(OPAL_SUCCESS == rc);

    /*
     * Initialize pools of large recv buffers
     *
     * NOTE: (last_pool < first_pool) is _not_ erroneous; recv buffer
     * pools simply won't be used in that case.
     */
    module->first_pool = 16; /* 64 kiB */
    module->last_pool = usnic_fls(module->super.btl_eager_limit-1);
    /* NOTE(review): calloc result checked only via assert (no-op
       under NDEBUG) */
    module->module_recv_buffers = calloc(module->last_pool+1,
            sizeof(opal_free_list_t));
    assert(module->module_recv_buffers != NULL);
    for (int i = module->first_pool; i <= module->last_pool; ++i) {
        /* Pool i holds buffers of 2^i payload bytes; the -1 accounts
           for the payload byte already in opal_btl_usnic_rx_buf_t --
           TODO confirm against the struct definition */
        size_t elt_size = sizeof(opal_btl_usnic_rx_buf_t) - 1 + (1 << i);
        OBJ_CONSTRUCT(&module->module_recv_buffers[i], opal_free_list_t);
        rc = usnic_compat_free_list_init(&module->module_recv_buffers[i],
                                 elt_size,
                                 opal_cache_line_size,
                                 OBJ_CLASS(opal_btl_usnic_rx_buf_t),
                                 0,   /* payload size */
                                 0,   /* payload align */
                                 128,   /* init elts to alloc */
                                 128, /* max elts to alloc */
                                 128,   /* num elts per alloc */
                                 NULL /* mpool */,
                                 0 /* mpool reg flags */,
                                 NULL /* unused0 */,
                                 NULL /* item_init */,
                                 NULL /* item_init_context */);
        assert(OPAL_SUCCESS == rc);
    }
}
2477 
2478 /*
2479  * Initialize the btl module by allocating
2480  *  a memory pool, priority and data channels, and free lists
2481  */
2482 int opal_btl_usnic_module_init(opal_btl_usnic_module_t *module)
2483 {
2484     init_module_globals(module);
2485     init_local_modex_part1(module);
2486     init_find_transport_header_len(module);
2487     init_queue_lengths(module);
2488     init_payload_lengths(module);
2489     init_pml_values(module);
2490     init_senders(module);
2491     init_connectivity_checker(module);
2492     init_hwloc(module);
2493     init_procs(module);
2494 
2495     int ret;
2496     if (OPAL_SUCCESS != (ret = init_mpool(module)) ||
2497         OPAL_SUCCESS != (ret = init_channels(module))) {
2498         mca_rcache_base_module_destroy (module->rcache);
2499         return ret;
2500     }
2501 
2502     init_local_modex_part2(module);
2503     init_async_event(module);
2504     init_random_objects(module);
2505     init_freelists(module);
2506     opal_btl_usnic_stats_init(module);
2507 
2508     /* Setup a connectivity listener.  This fills in the last part of
2509        the local modex info (the connectivity listener UDP port) */
2510     if (mca_btl_usnic_component.connectivity_enabled) {
2511         int rc = opal_btl_usnic_connectivity_listen(module);
2512         if (OPAL_SUCCESS != rc) {
2513             OPAL_ERROR_LOG(rc);
2514             opal_btl_usnic_util_abort("Failed to notify connectivity agent to listen",
2515                                       __FILE__, __LINE__);
2516         }
2517     } else {
2518         /* If we're not doing a connectivity check, just set the port
2519            to 0 */
2520         module->local_modex.connectivity_udp_port = 0;
2521     }
2522 
2523     return OPAL_SUCCESS;
2524 }
2525 
2526 
2527 static int usnic_ft_event(int state)
2528 {
2529     return OPAL_SUCCESS;
2530 }
2531 
2532 
/* Template for per-device usnic BTL modules.  Fields not listed here
   are zero/NULL by C aggregate-initialization rules. */
opal_btl_usnic_module_t opal_btl_usnic_module_template = {
    .super = {
        .btl_component = &mca_btl_usnic_component.super,

#if BTL_VERSION == 20
        /* BTL 2.0 interface: two-sided prepare_dst + fixed segment size */
        .btl_prepare_dst = opal_btl_usnic_prepare_dst,
        .btl_seg_size = sizeof(mca_btl_base_segment_t),
#elif BTL_VERSION == 30
        /* BTL 3.0 interface: no RDMA atomics and no registration
           handles are provided by this BTL */
        .btl_atomic_flags = 0,
        .btl_registration_handle_size = 0,

        .btl_get_limit = 0,
        .btl_get_alignment = 0,
        .btl_put_limit = 0,
        .btl_put_alignment = 0,

        .btl_atomic_op = NULL,
        .btl_atomic_fop = NULL,
        .btl_atomic_cswap = NULL,
#endif

        .btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT,
        .btl_flags =
            MCA_BTL_FLAGS_SEND |
            MCA_BTL_FLAGS_SEND_INPLACE |
            /* Need to set FLAGS_SINGLE_ADD_PROCS until
               btl_recv.h:lookup_sender() can handle an incoming
               message with an unknown sender. */
            MCA_BTL_FLAGS_SINGLE_ADD_PROCS,

        .btl_add_procs = usnic_add_procs,
        .btl_del_procs = usnic_del_procs,
        .btl_register = NULL,
        .btl_finalize = usnic_finalize,

        .btl_alloc = usnic_alloc,
        .btl_free = usnic_free,
        .btl_prepare_src = opal_btl_usnic_prepare_src,
        .btl_send = usnic_send,
        /* NOTE(review): usnic_sendi is defined in this file but not
           wired in here -- presumably filled in elsewhere or
           intentionally disabled; confirm */
        .btl_sendi = NULL,
        .btl_put = opal_btl_usnic_put,
        .btl_get = NULL,
        .btl_dump = mca_btl_base_dump,

        /* mpool is created per-module in init_mpool() */
        .btl_mpool = NULL,
        .btl_register_error = usnic_register_pml_err_cb,
        .btl_ft_event = usnic_ft_event
    }
};

/* [<][>][^][v][top][bottom][index][help] */