root/opal/mca/btl/usnic/btl_usnic_component.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. usnic_component_open
  2. usnic_component_close
  3. usnic_modex_send
  4. check_reg_mem_basics
  5. check_usnic_config
  6. usnic_clock_callback
  7. parse_ifex_str
  8. filter_module
  9. free_filter
  10. usnic_component_init
  11. usnic_component_progress
  12. usnic_handle_completion
  13. usnic_handle_cq_error
  14. usnic_component_progress_2
  15. dump_endpoint
  16. opal_btl_usnic_component_debug

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2011 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2006      Sandia National Laboratories. All rights
  14  *                         reserved.
  15  * Copyright (c) 2008-2019 Cisco Systems, Inc.  All rights reserved
  16  * Copyright (c) 2012-2014 Los Alamos National Security, LLC.  All rights
  17  *                         reserved.
  18  * Copyright (c) 2014      Intel, Inc. All rights reserved.
  19  * Copyright (c) 2015      Research Organization for Information Science
  20  *                         and Technology (RIST). All rights reserved.
  21  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
  22  * $COPYRIGHT$
  23  *
  24  * Additional copyrights may follow
  25  *
  26  * $HEADER$
  27  */
  28 
  29 /*
  30  * General notes:
  31  *
  32  * - OB1 handles out of order receives
  33  * - OB1 does NOT handle duplicate receives well (it probably does for
  34  *   MATCH tags, but for non-MATCH tags, it doesn't have enough info
  35  *   to know when duplicates are received), so we have to ensure not
  36  *   to pass duplicates up to the PML.
  37  */
  38 
  39 #include "opal_config.h"
  40 
  41 #include <string.h>
  42 #include <ctype.h>
  43 #include <errno.h>
  44 #include <unistd.h>
  45 #include <stdlib.h>
  46 #include <sys/time.h>
  47 #include <sys/resource.h>
  48 #include <sys/types.h>
  49 #include <sys/stat.h>
  50 #include <fcntl.h>
  51 
  52 #include <rdma/fabric.h>
  53 
  54 #include "opal_stdint.h"
  55 #include "opal/prefetch.h"
  56 #include "opal/mca/timer/base/base.h"
  57 #include "opal/util/argv.h"
  58 #include "opal/util/net.h"
  59 #include "opal/util/if.h"
  60 #include "opal/util/printf.h"
  61 #include "opal/mca/base/mca_base_var.h"
  62 #include "opal/mca/memchecker/base/base.h"
  63 #include "opal/util/show_help.h"
  64 #include "opal/constants.h"
  65 
  66 #include "opal/mca/btl/btl.h"
  67 #include "opal/mca/btl/base/base.h"
  68 #include "opal/util/proc.h"
  69 
  70 #include "btl_usnic.h"
  71 #include "btl_usnic_connectivity.h"
  72 #include "btl_usnic_frag.h"
  73 #include "btl_usnic_endpoint.h"
  74 #include "btl_usnic_module.h"
  75 #include "btl_usnic_stats.h"
  76 #include "btl_usnic_util.h"
  77 #include "btl_usnic_ack.h"
  78 #include "btl_usnic_send.h"
  79 #include "btl_usnic_recv.h"
  80 #include "btl_usnic_proc.h"
  81 #include "btl_usnic_test.h"
  82 
   83 #define OPAL_BTL_USNIC_NUM_COMPLETIONS 500
      /* NOTE(review): not referenced in the visible part of this file;
       * presumably a bound on completions handled per progress pass --
       * confirm against usnic_component_progress(). */
   84 
   85 /* MPI_THREAD_MULTIPLE support: component-wide recursive lock.
       * Statically initialized here, constructed again in
       * usnic_component_init() and destructed in usnic_component_close(). */
   86 opal_recursive_mutex_t btl_usnic_lock =  OPAL_RECURSIVE_MUTEX_STATIC_INIT;
   87 
   88 /* RNG buffer definition; seeded from getpid() in usnic_component_init() */
   89 opal_rng_buff_t opal_btl_usnic_rand_buff = {{0}};
   90 
   91 /* simulated clock; usnic_clock_callback() advances it by 1,000,000
       * (i.e., 1ms expressed in ns) every time the timer fires */
   92 uint64_t opal_btl_usnic_ticks = 0;
   93 
      /* Timer event that drives the simulated clock.  The _set flag records
       * whether the event is armed, so that usnic_component_close() knows
       * whether it has to opal_event_del() it. */
   94 static opal_event_t usnic_clock_timer_event;
   95 static bool usnic_clock_timer_event_set = false;
   96 static struct timeval usnic_clock_timeout;
   97 
   98 /* set to true in a debugger to enable even more verbose output when calling
   99  * opal_btl_usnic_component_debug */
  100 static volatile bool dump_bitvectors = false;
  101 
      /* Forward declarations of the component entry points that are wired
       * into mca_btl_usnic_component below */
  102 static int usnic_component_open(void);
  103 static int usnic_component_close(void);
  104 static mca_btl_base_module_t **
  105 usnic_component_init(int* num_btl_modules, bool want_progress_threads,
  106                        bool want_mpi_threads);
  107 static int usnic_component_progress(void);
 108 
  109 /* Types for filtering interfaces (if_include / if_exclude handling).
       *
       * A filter_elt_t is one parsed entry of the comma-separated list:
       * either a device name or an IPv4 network in CIDR notation. */
  110 typedef struct filter_elt_t {
  111     bool is_netmask;
  112 
  113     /* valid iff is_netmask==false; heap string created by
          * parse_ifex_str() and released by free_filter() */
  114     char *if_name;
  115 
  116     /* valid iff is_netmask==true; addr_be is stored already masked
          * with netmask_be (see parse_ifex_str()) */
  117     uint32_t addr_be; /* in network byte order */
  118     uint32_t netmask_be;
  119 } filter_elt_t;
  120 
      /* A parsed filter: n_elt valid entries in the heap array elts */
  121 typedef struct usnic_if_filter_t {
  122     int n_elt;
  123     filter_elt_t *elts;
  124 } usnic_if_filter_t;
  125 
      /* Helpers operating on filters; defined further down in this file */
  126 static bool filter_module(opal_btl_usnic_module_t *module,
  127                           usnic_if_filter_t *filter,
  128                           bool filter_incl);
  129 static usnic_if_filter_t *parse_ifex_str(const char *orig_str,
  130                                          const char *name);
  131 static void free_filter(usnic_if_filter_t *filter);
 132 
  133 
      /*
       * The usnic BTL component descriptor.  It registers the open, close
       * and parameter-registration callbacks with the MCA base, and the
       * init and progress entry points with the BTL framework.
       */
  134 opal_btl_usnic_component_t mca_btl_usnic_component = {
  135     .super = {
  136         /* First, the mca_base_component_t struct containing meta information
  137            about the component itself */
  138         .btl_version = {
  139             USNIC_BTL_DEFAULT_VERSION("usnic"),
  140             .mca_open_component = usnic_component_open,
  141             .mca_close_component = usnic_component_close,
  142             .mca_register_component_params = opal_btl_usnic_component_register,
  143         },
  144         .btl_data = {
  145             /* The component is not checkpoint ready */
  146             .param_field = MCA_BASE_METADATA_PARAM_NONE
  147         },
  148 
          /* BTL framework entry points */
  149         .btl_init = usnic_component_init,
  150         .btl_progress = usnic_component_progress,
  151     }
  152 };
 153 
 154 
 155 /*
 156  *  Called by MCA framework to open the component
 157  */
 158 static int usnic_component_open(void)
 159 {
 160     /* initialize state */
 161     mca_btl_usnic_component.num_modules = 0;
 162     mca_btl_usnic_component.usnic_all_modules = NULL;
 163     mca_btl_usnic_component.usnic_active_modules = NULL;
 164     mca_btl_usnic_component.transport_header_len = -1;
 165     mca_btl_usnic_component.prefix_send_offset = 0;
 166 
 167     /* initialize objects */
 168     OBJ_CONSTRUCT(&mca_btl_usnic_component.usnic_procs, opal_list_t);
 169 
 170     /* Sanity check: if_include and if_exclude need to be mutually
 171        exclusive */
 172     if (OPAL_SUCCESS !=
 173         mca_base_var_check_exclusive("opal",
 174             mca_btl_usnic_component.super.btl_version.mca_type_name,
 175             mca_btl_usnic_component.super.btl_version.mca_component_name,
 176             "if_include",
 177             mca_btl_usnic_component.super.btl_version.mca_type_name,
 178             mca_btl_usnic_component.super.btl_version.mca_component_name,
 179             "if_exclude")) {
 180         /* Return ERR_NOT_AVAILABLE so that a warning message about
 181            "open" failing is not printed */
 182         return OPAL_ERR_NOT_AVAILABLE;
 183     }
 184 
 185     return OPAL_SUCCESS;
 186 }
 187 
 188 
 189 /*
 190  * Component cleanup
 191  */
 192 static int usnic_component_close(void)
 193 {
 194     /* Note that this list should already be empty, because:
 195        - module.finalize() is invoked before component.close()
 196        - module.finalize() RELEASEs each proc that it was using
 197        - this should drive down the ref count on procs to 0
 198        - procs remove themselves from the component.usnic_procs list
 199          in their destructor */
 200     OBJ_DESTRUCT(&mca_btl_usnic_component.usnic_procs);
 201 
 202     if (usnic_clock_timer_event_set) {
 203         opal_event_del(&usnic_clock_timer_event);
 204         usnic_clock_timer_event_set = false;
 205     }
 206 
 207     /* Finalize the connectivity client and agent */
 208     if (mca_btl_usnic_component.connectivity_enabled) {
 209         opal_btl_usnic_connectivity_client_finalize();
 210         opal_btl_usnic_connectivity_agent_finalize();
 211     }
 212     if (mca_btl_usnic_component.opal_evbase) {
 213         opal_progress_thread_finalize(NULL);
 214     }
 215 
 216     free(mca_btl_usnic_component.usnic_all_modules);
 217     free(mca_btl_usnic_component.usnic_active_modules);
 218 
 219 #if OPAL_BTL_USNIC_UNIT_TESTS
 220     /* clean up the unit test infrastructure */
 221     opal_btl_usnic_cleanup_tests();
 222 #endif
 223 
 224     OBJ_DESTRUCT(&btl_usnic_lock);
 225 
 226     return OPAL_SUCCESS;
 227 }
 228 
 229 
 230 /*
 231  * Register address information.  The modex will make this available
 232  * to all peers.
 233  */
 234 static int usnic_modex_send(void)
 235 {
 236     int rc;
 237     int i;
 238     size_t size;
 239     opal_btl_usnic_modex_t* modexes = NULL;
 240 
 241     if (0 == mca_btl_usnic_component.num_modules) {
 242         return OPAL_SUCCESS;
 243     }
 244 
 245     size = mca_btl_usnic_component.num_modules *
 246         sizeof(opal_btl_usnic_modex_t);
 247     modexes = (opal_btl_usnic_modex_t*) malloc(size);
 248     if (NULL == modexes) {
 249         return OPAL_ERR_OUT_OF_RESOURCE;
 250     }
 251 
 252     for (i = 0; i < mca_btl_usnic_component.num_modules; i++) {
 253         opal_btl_usnic_module_t* module =
 254             mca_btl_usnic_component.usnic_active_modules[i];
 255         modexes[i] = module->local_modex;
 256         opal_output_verbose(5, USNIC_OUT,
 257                             "btl:usnic: "
 258                             "control port:%d, "
 259                             "modex_send data port:%d, "
 260                             "%s",
 261                             modexes[i].ports[USNIC_PRIORITY_CHANNEL],
 262                             modexes[i].ports[USNIC_DATA_CHANNEL],
 263                             module->if_ipv4_addr_str);
 264     }
 265 
 266     usnic_compat_modex_send(&rc, &mca_btl_usnic_component.super.btl_version,
 267                             modexes, size);
 268     free(modexes);
 269 
 270     return rc;
 271 }
 272 
 273 
 274 /*
 275  * See if our memlock limit is >64K.  64K is the RHEL default memlock
 276  * limit; this check is a first-line-of-defense hueristic to see if
 277  * the user has set the memlock limit to *something*.
 278  *
 279  * We have other checks elsewhere (e.g., to ensure that QPs are able
 280  * to be allocated -- which also require registered memory -- and to
 281  * ensure that receive buffers can be registered, etc.), but this is a
 282  * good first check to ensure that a default OS case is satisfied.
 283  */
 284 static int check_reg_mem_basics(void)
 285 {
 286 #if HAVE_DECL_RLIMIT_MEMLOCK
 287     int ret = OPAL_SUCCESS;
 288     struct rlimit limit;
 289     char *str_limit = NULL;
 290 
 291     ret = getrlimit(RLIMIT_MEMLOCK, &limit);
 292     if (0 == ret) {
 293         if ((long) limit.rlim_cur > (64 * 1024) ||
 294             limit.rlim_cur == RLIM_INFINITY) {
 295             return OPAL_SUCCESS;
 296         } else {
 297             opal_asprintf(&str_limit, "%ld", (long)limit.rlim_cur);
 298         }
 299     } else {
 300         opal_asprintf(&str_limit, "Unknown");
 301     }
 302 
 303     opal_show_help("help-mpi-btl-usnic.txt", "check_reg_mem_basics fail",
 304                    true,
 305                    opal_process_info.nodename,
 306                    str_limit);
 307 
 308     return OPAL_ERR_OUT_OF_RESOURCE;
 309 #else
 310     /* If we don't have RLIMIT_MEMLOCK, then just bypass this
 311        safety/hueristic check. */
 312     return OPAL_SUCCESS;
 313 #endif
 314 }
 315 
 316 
 317 /*
 318  * Basic sanity checking for usNIC VFs / resources.
 319  */
 320 static int check_usnic_config(opal_btl_usnic_module_t *module,
 321         int num_local_procs)
 322 {
 323     char str[128];
 324     unsigned unlp;
 325     struct fi_usnic_info *uip;
 326 
 327     uip = &module->usnic_info;
 328 
 329     /* Note: we add one to num_local_procs to account for *this*
 330        process */
 331     unlp = (unsigned) num_local_procs + 1;
 332 
 333     /* usNIC allocates QPs as a combination of PCI virtual functions
 334        (VFs) and resources inside those VFs.  Ensure that:
 335 
 336        1. num_vfs (i.e., "usNICs") >= num_local_procs (to ensure that
 337           each MPI process will be able to have its own protection
 338           domain), and
 339        2. num_qps_per_vf >= NUM_CHANNELS
 340           (to ensure that each MPI process will be able to get the
 341           number of QPs it needs -- we know that every VF will have
 342           the same number of QPs), and
 343        3. num_cqs_per_vf >= NUM_CHANNELS
 344           (to ensure that each MPI process will be able to get the
 345           number of CQs that it needs) */
 346     if (uip->ui.v1.ui_num_vf < unlp) {
 347         snprintf(str, sizeof(str), "Not enough usNICs (found %d, need %d)",
 348                  uip->ui.v1.ui_num_vf, unlp);
 349         goto error;
 350     }
 351 
 352     if (uip->ui.v1.ui_qp_per_vf < USNIC_NUM_CHANNELS) {
 353         snprintf(str, sizeof(str), "Not enough transmit/receive queues per usNIC (found %d, need %d)",
 354                  uip->ui.v1.ui_qp_per_vf,
 355                  USNIC_NUM_CHANNELS);
 356         goto error;
 357     }
 358     if (uip->ui.v1.ui_cq_per_vf < USNIC_NUM_CHANNELS) {
 359         snprintf(str, sizeof(str),
 360                  "Not enough completion queues per usNIC (found %d, need %d)",
 361                  uip->ui.v1.ui_cq_per_vf,
 362                  USNIC_NUM_CHANNELS);
 363         goto error;
 364     }
 365 
 366     /* All is good! */
 367     return OPAL_SUCCESS;
 368 
 369  error:
 370     /* Sad panda */
 371     opal_show_help("help-mpi-btl-usnic.txt",
 372                    "not enough usnic resources",
 373                    true,
 374                    opal_process_info.nodename,
 375                    module->linux_device_name,
 376                    str);
 377     return OPAL_ERROR;
 378 }
 379 
 380 
 381 static void usnic_clock_callback(int fd, short flags, void *timeout)
 382 {
 383     /* 1ms == 1,000,000 ns */
 384     opal_btl_usnic_ticks += 1000000;
 385 
 386     /* run progress to make sure time change gets noticed */
 387     usnic_component_progress();
 388 
 389     opal_event_add(&usnic_clock_timer_event, timeout);
 390 }
 391 
 392 
 393 /* Parse a string which is a comma-separated list containing a mix of
 394  * interface names and IPv4 CIDR-format netmasks.
 395  *
 396  * Gracefully tolerates NULL pointer arguments by returning NULL.
 397  *
 398  * Returns a usnic_if_filter_t, which contains n_elt and a
 399  * corresponding array of found filter elements.  Caller is
 400  * responsible for freeing the returned usnic_if_filter_t, the array
 401  * of filter elements, and any strings in it (can do this via
 402  * free_filter()).
 403  */
 404 static usnic_if_filter_t *parse_ifex_str(const char *orig_str,
 405                                          const char *name)
 406 {
 407     int i, ret;
 408     char **argv, *str, *tmp;
 409     struct sockaddr_storage argv_inaddr;
 410     uint32_t argv_prefix, addr;
 411     usnic_if_filter_t *filter;
 412     int n_argv;
 413 
 414     if (NULL == orig_str) {
 415         return NULL;
 416     }
 417 
 418     /* Get a wrapper for the filter */
 419     filter = calloc(sizeof(*filter), 1);
 420     if (NULL == filter) {
 421         OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
 422         return NULL;
 423     }
 424 
 425     argv = opal_argv_split(orig_str, ',');
 426     if (NULL == argv || 0 == (n_argv = opal_argv_count(argv))) {
 427         free(filter);
 428         opal_argv_free(argv);
 429         return NULL;
 430     }
 431 
 432     /* upper bound: each entry could be a mask */
 433     filter->elts = malloc(sizeof(*filter->elts) * n_argv);
 434     if (NULL == filter->elts) {
 435         OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
 436         free(filter);
 437         opal_argv_free(argv);
 438         return NULL;
 439     }
 440 
 441     /* Shuffle iface names to the beginning of the argv array.  Process each
 442      * netmask as we encounter it and append the resulting value to netmask_t
 443      * array which we will return. */
 444     filter->n_elt = 0;
 445     for (i = 0; NULL != argv[i]; ++i) {
 446         /* assume that all interface names begin with an alphanumeric
 447          * character, not a number */
 448         if (isalpha(argv[i][0])) {
 449             filter->elts[filter->n_elt].is_netmask = false;
 450             filter->elts[filter->n_elt].if_name = strdup(argv[i]);
 451             opal_output_verbose(20, USNIC_OUT,
 452                                 "btl:usnic:parse_ifex_str: parsed %s device name: %s",
 453                                 name, filter->elts[filter->n_elt].if_name);
 454 
 455             ++filter->n_elt;
 456             continue;
 457         }
 458 
 459         /* Found a subnet notation.  Convert it to an IP
 460            address/netmask.  Get the prefix first. */
 461         argv_prefix = 0;
 462         tmp = strdup(argv[i]);
 463         str = strchr(argv[i], '/');
 464         if (NULL == str) {
 465             opal_show_help("help-mpi-btl-usnic.txt", "invalid if_inexclude",
 466                            true, name, opal_process_info.nodename,
 467                            tmp, "Invalid specification (missing \"/\")");
 468             free(tmp);
 469             continue;
 470         }
 471         *str = '\0';
 472         argv_prefix = atoi(str + 1);
 473         if (argv_prefix < 1 || argv_prefix > 32) {
 474             opal_show_help("help-mpi-btl-usnic.txt", "invalid if_inexclude",
 475                            true, name, opal_process_info.nodename,
 476                            tmp, "Invalid specification (prefix < 1 or prefix >32)");
 477             free(tmp);
 478             continue;
 479         }
 480 
 481         /* Now convert the IPv4 address */
 482         ((struct sockaddr*) &argv_inaddr)->sa_family = AF_INET;
 483         ret = inet_pton(AF_INET, argv[i],
 484                         &((struct sockaddr_in*) &argv_inaddr)->sin_addr);
 485         if (1 != ret) {
 486             opal_show_help("help-mpi-btl-usnic.txt", "invalid if_inexclude",
 487                            true, name, opal_process_info.nodename, tmp,
 488                            "Invalid specification (inet_pton() failed)");
 489             free(tmp);
 490             continue;
 491         }
 492         opal_output_verbose(20, USNIC_OUT,
 493                             "btl:usnic:parse_ifex_str: parsed %s address+prefix: %s / %u",
 494                             name,
 495                             opal_net_get_hostname((struct sockaddr*) &argv_inaddr),
 496                             argv_prefix);
 497 
 498         memcpy(&addr,
 499                &((struct sockaddr_in*) &argv_inaddr)->sin_addr,
 500                sizeof(addr));
 501 
 502         /* be helpful: if the user passed A.B.C.D/24 instead of A.B.C.0/24,
 503          * also normalize the netmask */
 504         filter->elts[filter->n_elt].is_netmask = true;
 505         filter->elts[filter->n_elt].if_name = NULL;
 506         filter->elts[filter->n_elt].netmask_be =
 507             usnic_cidrlen_to_netmask(argv_prefix);
 508         filter->elts[filter->n_elt].addr_be = addr &
 509             filter->elts[filter->n_elt].netmask_be;
 510         ++filter->n_elt;
 511 
 512         free(tmp);
 513     }
 514     assert(i == n_argv); /* sanity */
 515 
 516     opal_argv_free(argv);
 517 
 518     /* don't return an empty filter */
 519     if (filter->n_elt == 0) {
 520         free_filter(filter);
 521         return NULL;
 522     }
 523 
 524     return filter;
 525 }
 526 
 527 /*
 528  * Check this module to see if should be kept or not.
 529  */
 530 static bool filter_module(opal_btl_usnic_module_t *module,
 531                           usnic_if_filter_t *filter,
 532                           bool filter_incl)
 533 {
 534     int i;
 535     uint32_t module_mask;
 536     struct sockaddr_in *src;
 537     struct fi_usnic_info *uip;
 538     struct fi_info *info;
 539     bool match;
 540     const char *linux_device_name;
 541 
 542     info = module->fabric_info;
 543     uip = &module->usnic_info;
 544     src = info->src_addr;
 545     linux_device_name = module->linux_device_name;
 546     module_mask = src->sin_addr.s_addr & uip->ui.v1.ui_netmask_be;
 547     match = false;
 548     for (i = 0; i < filter->n_elt; ++i) {
 549         if (filter->elts[i].is_netmask) {
 550             /* conservative: we also require the netmask to match */
 551             if (filter->elts[i].netmask_be == uip->ui.v1.ui_netmask_be &&
 552                 filter->elts[i].addr_be == module_mask) {
 553                 match = true;
 554                 break;
 555             }
 556         }
 557         else {
 558             if (strcmp(filter->elts[i].if_name, linux_device_name) == 0) {
 559                 match = true;
 560                 break;
 561             }
 562         }
 563     }
 564 
 565     /* Turn the match result into whether we should keep it or not */
 566     return match ^ !filter_incl;
 567 }
 568 
 569 /* utility routine to safely free a filter element array */
 570 static void free_filter(usnic_if_filter_t *filter)
 571 {
 572     int i;
 573 
 574     if (filter == NULL) {
 575         return;
 576     }
 577 
 578     if (NULL != filter->elts) {
 579         for (i = 0; i < filter->n_elt; ++i) {
 580             if (!filter->elts[i].is_netmask) {
 581                 free(filter->elts[i].if_name);
 582             }
 583         }
 584         free(filter->elts);
 585     }
 586     free(filter);
 587 }
 588 
 589 /*
 590  *  UD component initialization:
 591  *  (1) read interface list from kernel and compare against component
 592  *      parameters then create a BTL instance for selected interfaces
 593  *  (2) post OOB receive for incoming connection attempts
 594  *  (3) register BTL parameters with the MCA
 595  */
 596 static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
 597                                                     bool want_progress_threads,
 598                                                     bool want_mpi_threads)
 599 {
 600     mca_btl_base_module_t **btls = NULL;
 601     int i, j, num_final_modules;
 602     int num_devs;
 603     opal_btl_usnic_module_t *module;
 604     usnic_if_filter_t *filter = NULL;
 605     bool keep_module;
 606     bool filter_incl = false;
 607     int min_distance, num_local_procs;
 608     struct fi_info *info_list;
 609     struct fi_info *info;
 610     struct fid_fabric *fabric;
 611     struct fid_domain *domain;
 612     int ret;
 613 
 614     *num_btl_modules = 0;
 615 
 616     /* MPI_THREAD_MULTIPLE is only supported in 2.0+ */
 617     if (want_mpi_threads && !mca_btl_base_thread_multiple_override) {
 618         if (OPAL_MAJOR_VERSION >= 2) {
 619             opal_output_verbose(5, USNIC_OUT,
 620                                 "btl:usnic: MPI_THREAD_MULTIPLE support is in testing phase.");
 621         }
 622         else {
 623             opal_output_verbose(5, USNIC_OUT,
 624                                 "btl:usnic: MPI_THREAD_MULTIPLE is not supported in version < 2.");
 625             return NULL;
 626         }
 627     }
 628 
 629     OBJ_CONSTRUCT(&btl_usnic_lock, opal_recursive_mutex_t);
 630 
 631     /* There are multiple dimensions to consider when requesting an
 632        API version number from libfabric:
 633 
 634        1. This code understands libfabric API versions v1.3 through
 635           v1.4.
 636 
 637        2. Open MPI may be *compiled* against one version of libfabric,
 638           but may be *running* with another.
 639 
 640        3. There were usnic-specific bugs in Libfabric prior to
 641           libfabric v1.3.0 (where "v1.3.0" is the tarball/package
 642           version, not the API version; but happily, the API version
 643           was also 1.3 in Libfabric v1.3.0):
 644 
 645           - In libfabric v1.0.0 (i.e., API v1.0), the usnic provider
 646             did not check the value of the "version" parameter passed
 647             into fi_getinfo()
 648           - If you pass FI_VERSION(1,0) to libfabric v1.1.0 (i.e., API
 649             v1.1), the usnic provider will disable FI_MSG_PREFIX
 650             support (on the assumption that the application will not
 651             handle FI_MSG_PREFIX properly).  This can happen if you
 652             compile OMPI against libfabric v1.0.0 (i.e., API v1.0) and
 653             run OMPI against libfabric v1.1.0 (i.e., API v1.1).
 654           - Some critical AV bug fixes were included in libfabric
 655             v1.3.0; prior versions can fail in fi_av_* operations in
 656             unexpected ways (libnl: you win again!).
 657 
 658        So always request a minimum API version of v1.3.
 659 
 660        Note that the FI_MAJOR_VERSION and FI_MINOR_VERSION in
 661        <rdma/fabric.h> represent the API version, not the Libfabric
 662        package (i.e., tarball) version.  As of Libfabric v1.3, there
 663        is currently no way to know a) what package version of
 664        Libfabric you were compiled against, and b) what package
 665        version of Libfabric you are running with.
 666 
 667        Also note that the usnic provider changed the strings in the
 668        fabric and domain names in API v1.4.  With API <= v1.3:
 669 
 670        - fabric name is "usnic_X" (device name)
 671        - domain name is NULL
 672 
 673        With libfabric API >= v1.4, all Libfabric IP-based providers
 674        (including usnic) follow the same convention:
 675 
 676        - fabric name is "a.b.c.d/e" (CIDR notation of network)
 677        - domain name is "usnic_X" (device name)
 678 
 679        NOTE: The configure.m4 in this component will require libfabric
 680        >= v1.1.0 (i.e., it won't accept v1.0.0) because it needs
 681        access to the usNIC extension header structures that only
 682        became available in v1.1.0.*/
 683 
  684     /* First, check to see if the libfabric we are running with is
  685        older than libfabric API v1.3.  If so, don't bother going further. */
 686     uint32_t libfabric_api;
 687     libfabric_api = fi_version();
 688     if (libfabric_api < FI_VERSION(1, 3)) {
 689         opal_output_verbose(5, USNIC_OUT,
 690                             "btl:usnic: disqualifiying myself because Libfabric does not support v1.3 of the API (v1.3 is *required* for correct usNIC functionality).");
 691         return NULL;
 692     }
 693 
 694     /* Libfabric API 1.3 is fine.  Above that, we know that Open MPI
 695        works with libfabric API v1.4, so just use that. */
 696     if (libfabric_api > FI_VERSION(1, 3)) {
 697         libfabric_api = FI_VERSION(1, 4);
 698     }
 699 
 700     struct fi_info hints = {0};
 701     struct fi_ep_attr ep_attr = {0};
 702     struct fi_fabric_attr fabric_attr = {0};
 703     struct fi_rx_attr rx_attr = {0};
 704     struct fi_tx_attr tx_attr = {0};
 705 
 706     /* We only want providers named "usnic" that are of type EP_DGRAM */
 707     fabric_attr.prov_name = "usnic";
 708     ep_attr.type = FI_EP_DGRAM;
 709 
 710     hints.caps = FI_MSG;
 711     hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX;
 712     hints.addr_format = FI_SOCKADDR;
 713     hints.ep_attr = &ep_attr;
 714     hints.fabric_attr = &fabric_attr;
 715     hints.tx_attr = &tx_attr;
 716     hints.rx_attr = &rx_attr;
 717 
 718     tx_attr.iov_limit = 1;
 719     rx_attr.iov_limit = 1;
 720 
 721     ret = fi_getinfo(libfabric_api, NULL, 0, 0, &hints, &info_list);
 722     if (0 != ret) {
 723         opal_output_verbose(5, USNIC_OUT,
 724                             "btl:usnic: disqualifiying myself due to fi_getinfo(3) failure: %s (%d)", strerror(-ret), ret);
 725         return NULL;
 726     }
 727 
 728     num_devs = 0;
 729     for (info = info_list; NULL != info; info = info->next) {
 730         ++num_devs;
 731     }
 732     if (0 == num_devs) {
 733         opal_output_verbose(5, USNIC_OUT,
 734             "btl:usnic: disqualifiying myself due to lack of libfabric providers");
 735         return NULL;
 736     }
 737 
 738     /* Do quick sanity check to ensure that we can lock memory (which
 739        is required for registered memory). */
 740     if (OPAL_SUCCESS != check_reg_mem_basics()) {
 741         opal_output_verbose(5, USNIC_OUT,
 742                             "btl:usnic: disqualifiying myself due to lack of lockable memory");
 743         return NULL;
 744     }
 745 
 746     /************************************************************************
 747      * Below this line, we assume that usnic is loaded on all procs,
  748      * and therefore we will guarantee to do the modex send, even if
 749      * we fail.
 750      ************************************************************************/
 751 
 752     opal_output_verbose(5, USNIC_OUT,
 753                         "btl:usnic: usNIC fabrics found");
 754 
 755     opal_proc_t *me = opal_proc_local_get();
 756     opal_process_name_t *name = &(me->proc_name);
 757     mca_btl_usnic_component.my_hashed_rte_name =
 758         usnic_compat_rte_hash_name(name);
 759     MSGDEBUG1_OUT("%s: my_hashed_rte_name=0x%" PRIx64,
 760                    __func__, mca_btl_usnic_component.my_hashed_rte_name);
 761 
 762     opal_srand(&opal_btl_usnic_rand_buff, ((uint32_t) getpid()));
 763 
 764     /* Setup an array of pointers to point to each module (which we'll
 765        return upstream) */
 766     mca_btl_usnic_component.num_modules = num_devs;
 767     btls = (struct mca_btl_base_module_t**)
 768         malloc(mca_btl_usnic_component.num_modules *
 769                sizeof(opal_btl_usnic_module_t*));
 770     if (NULL == btls) {
 771         OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
 772         goto send_modex;
 773     }
 774 
 775     /* Allocate space for btl module instances */
 776     mca_btl_usnic_component.usnic_all_modules =
 777         calloc(mca_btl_usnic_component.num_modules,
 778                sizeof(*mca_btl_usnic_component.usnic_all_modules));
 779     mca_btl_usnic_component.usnic_active_modules =
 780         calloc(mca_btl_usnic_component.num_modules,
 781                sizeof(*mca_btl_usnic_component.usnic_active_modules));
 782     if (NULL == mca_btl_usnic_component.usnic_all_modules ||
 783         NULL == mca_btl_usnic_component.usnic_active_modules) {
 784         OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
 785         goto error;
 786     }
 787 
 788     /* If we have include or exclude list, parse and set up now
 789      * (higher level guarantees there will not be both include and exclude,
 790      * so don't bother checking that here)
 791      */
 792     if (NULL != mca_btl_usnic_component.if_include) {
 793         opal_output_verbose(20, USNIC_OUT,
 794                             "btl:usnic:filter_module: if_include=%s",
 795                             mca_btl_usnic_component.if_include);
 796 
 797         filter_incl = true;
 798         filter = parse_ifex_str(mca_btl_usnic_component.if_include, "include");
 799     } else if (NULL != mca_btl_usnic_component.if_exclude) {
 800         opal_output_verbose(20, USNIC_OUT,
 801                             "btl:usnic:filter_module: if_exclude=%s",
 802                             mca_btl_usnic_component.if_exclude);
 803 
 804         filter_incl = false;
 805         filter = parse_ifex_str(mca_btl_usnic_component.if_exclude, "exclude");
 806     }
 807 
 808     num_local_procs = opal_process_info.num_local_peers;
 809 
 810     /* Go through the list of devices and determine if we want it or
 811        not.  Create a module for each one that we want. */
 812     info = info_list;
 813     for (j = i = 0; i < num_devs &&
 814              (0 == mca_btl_usnic_component.max_modules ||
 815               i < mca_btl_usnic_component.max_modules);
 816              ++i, info = info->next) {
 817 
 818         // The fabric/domain names changed at libfabric API v1.4 (see above).
 819         char *linux_device_name;
 820         if (libfabric_api <= FI_VERSION(1, 3)) {
 821             linux_device_name = info->fabric_attr->name;
 822         } else {
 823             linux_device_name = info->domain_attr->name;
 824         }
 825 
 826         ret = fi_fabric(info->fabric_attr, &fabric, NULL);
 827         if (0 != ret) {
 828             opal_show_help("help-mpi-btl-usnic.txt",
 829                            "libfabric API failed",
 830                            true,
 831                            opal_process_info.nodename,
 832                            linux_device_name,
 833                            "fi_fabric()", __FILE__, __LINE__,
 834                            ret,
 835                            strerror(-ret));
 836             continue;
 837         }
 838         opal_memchecker_base_mem_defined(&fabric, sizeof(fabric));
 839 
 840         ret = fi_domain(fabric, info, &domain, NULL);
 841         if (0 != ret) {
 842             opal_show_help("help-mpi-btl-usnic.txt",
 843                            "libfabric API failed",
 844                            true,
 845                            opal_process_info.nodename,
 846                            linux_device_name,
 847                            "fi_domain()", __FILE__, __LINE__,
 848                            ret,
 849                            strerror(-ret));
 850             continue;
 851         }
 852         opal_memchecker_base_mem_defined(&domain, sizeof(domain));
 853 
 854         opal_output_verbose(5, USNIC_OUT,
 855                             "btl:usnic: found: usNIC device %s",
 856                             linux_device_name);
 857 
 858         /* Save a little info on the module that we have already
 859            gathered.  The rest of the module will be filled in
 860            later. */
 861         module = &(mca_btl_usnic_component.usnic_all_modules[j]);
 862         memcpy(module, &opal_btl_usnic_module_template,
 863                sizeof(opal_btl_usnic_module_t));
 864         module->fabric = fabric;
 865         module->domain = domain;
 866         module->fabric_info = info;
 867         module->libfabric_api = libfabric_api;
 868         module->linux_device_name = strdup(linux_device_name);
 869         if (NULL == module->linux_device_name) {
 870             OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
 871             goto error;
 872         }
 873 
 874         /* Obtain usnic-specific device info (e.g., netmask) that
 875            doesn't come in the normal fi_getinfo(). This allows us to
 876            do filtering, later. */
 877         ret = fi_open_ops(&fabric->fid, FI_USNIC_FABRIC_OPS_1, 0,
 878                 (void **)&module->usnic_fabric_ops, NULL);
 879         if (ret != 0) {
 880             opal_output_verbose(5, USNIC_OUT,
 881                         "btl:usnic: device %s fabric_open_ops failed %d (%s)",
 882                         module->linux_device_name, ret, fi_strerror(-ret));
 883             fi_close(&domain->fid);
 884             fi_close(&fabric->fid);
 885             continue;
 886         }
 887 
 888         ret =
 889             module->usnic_fabric_ops->getinfo(1,
 890                                             fabric,
 891                                             &module->usnic_info);
 892         if (ret != 0) {
 893             opal_output_verbose(5, USNIC_OUT,
 894                         "btl:usnic: device %s usnic_getinfo failed %d (%s)",
 895                         module->linux_device_name, ret, fi_strerror(-ret));
 896             fi_close(&domain->fid);
 897             fi_close(&fabric->fid);
 898             continue;
 899         }
 900         opal_output_verbose(5, USNIC_OUT,
 901                             "btl:usnic: device %s usnic_info: link speed=%d, netmask=0x%x, ifname=%s, num_vf=%d, qp/vf=%d, cq/vf=%d",
 902                             module->linux_device_name,
 903                             (unsigned int) module->usnic_info.ui.v1.ui_link_speed,
 904                             (unsigned int) module->usnic_info.ui.v1.ui_netmask_be,
 905                             module->usnic_info.ui.v1.ui_ifname,
 906                             module->usnic_info.ui.v1.ui_num_vf,
 907                             module->usnic_info.ui.v1.ui_qp_per_vf,
 908                             module->usnic_info.ui.v1.ui_cq_per_vf);
 909 
 910         /* respect if_include/if_exclude subnets/ifaces from the user */
 911         if (filter != NULL) {
 912             keep_module = filter_module(module, filter, filter_incl);
 913             opal_output_verbose(5, USNIC_OUT,
 914                                 "btl:usnic: %s %s due to %s",
 915                                 (keep_module ? "keeping" : "skipping"),
 916                                 module->linux_device_name,
 917                                 (filter_incl ? "if_include" : "if_exclude"));
 918             if (!keep_module) {
 919                 fi_close(&domain->fid);
 920                 fi_close(&fabric->fid);
 921                 continue;
 922             }
 923         }
 924 
 925         /* The first time through, check some usNIC configuration
 926            minimum settings with information we got back from the fi_*
 927            probes (these are VIC-wide settings -- they don't change
 928            for each module we create, so we only need to check
 929            once). */
 930         if (0 == j &&
 931             check_usnic_config(module, num_local_procs) != OPAL_SUCCESS) {
 932             opal_output_verbose(5, USNIC_OUT,
 933                                 "btl:usnic: device %s is not provisioned with enough resources -- skipping",
 934                                 module->linux_device_name);
 935             fi_close(&domain->fid);
 936             fi_close(&fabric->fid);
 937 
 938             mca_btl_usnic_component.num_modules = 0;
 939             goto error;
 940         }
 941 
 942         /*************************************************/
 943         /* Below this point, we know we want this device */
 944         /*************************************************/
 945 
 946         opal_output_verbose(5, USNIC_OUT,
 947                             "btl:usnic: device %s looks good!",
 948                             module->linux_device_name);
 949 
 950         /* Let this module advance to the next round! */
 951         btls[j++] = &(module->super);
 952     }
 953     mca_btl_usnic_component.num_modules = j;
 954 
 955     /* free filter if created */
 956     if (filter != NULL) {
 957         free_filter(filter);
 958         filter = NULL;
 959     }
 960 
 961     /* If we actually have some modules, setup the connectivity
 962        checking agent and client. */
 963     if (mca_btl_usnic_component.num_modules > 0 &&
 964         mca_btl_usnic_component.connectivity_enabled) {
 965         mca_btl_usnic_component.opal_evbase = opal_progress_thread_init(NULL);
 966         if (OPAL_SUCCESS != opal_btl_usnic_connectivity_agent_init() ||
 967             OPAL_SUCCESS != opal_btl_usnic_connectivity_client_init()) {
 968             opal_progress_thread_finalize(NULL);
 969             return NULL;
 970         }
 971     }
 972 
 973     /* Now that we know how many modules there are, let the modules
 974        initialize themselves (it's useful to know how many modules
 975        there are before doing this). */
 976     for (num_final_modules = i = 0;
 977          i < mca_btl_usnic_component.num_modules; ++i) {
 978         module = (opal_btl_usnic_module_t*) btls[i];
 979 
 980         /* Let the module initialize itself */
 981         if (OPAL_SUCCESS != opal_btl_usnic_module_init(module)) {
 982             opal_output_verbose(5, USNIC_OUT,
 983                                 "btl:usnic: failed to init module for %s",
 984                                 module->if_ipv4_addr_str);
 985             continue;
 986         }
 987 
 988         /*************************************************/
 989         /* Below this point, we know we want this module */
 990         /*************************************************/
 991 
 992         /* If module_init() failed for any prior module, this will be
 993            a down shift in the btls[] array.  Otherwise, it's an
 994            overwrite of the same value. */
 995         btls[num_final_modules++] = &(module->super);
 996 
 997         /* Output all of this module's values. */
 998         const char *devname = module->linux_device_name;
 999         opal_output_verbose(5, USNIC_OUT,
1000                             "btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d, num aveqe=%d",
1001                             devname,
1002                             module->sd_num,
1003                             module->rd_num,
1004                             module->cq_num,
1005                             module->av_eq_num);
1006         opal_output_verbose(5, USNIC_OUT,
1007                             "btl:usnic: %s priority MTU = %" PRIsize_t,
1008                             devname,
1009                             module->max_tiny_msg_size);
1010         opal_output_verbose(5, USNIC_OUT,
1011                             "btl:usnic: %s priority limit = %" PRIsize_t,
1012                             devname,
1013                             module->max_tiny_payload);
1014         opal_output_verbose(5, USNIC_OUT,
1015                             "btl:usnic: %s eager limit = %" PRIsize_t,
1016                             devname,
1017                             module->super.btl_eager_limit);
1018         opal_output_verbose(5, USNIC_OUT,
1019                             "btl:usnic: %s eager rndv limit = %" PRIsize_t,
1020                             devname,
1021                             module->super.btl_rndv_eager_limit);
1022         opal_output_verbose(5, USNIC_OUT,
1023                             "btl:usnic: %s max send size= %" PRIsize_t
1024                             " (not overrideable)",
1025                             devname,
1026                             module->super.btl_max_send_size);
1027         opal_output_verbose(5, USNIC_OUT,
1028                             "btl:usnic: %s exclusivity = %d",
1029                             devname,
1030                             module->super.btl_exclusivity);
1031     }
1032 
1033     /* We may have skipped some modules, so reset
1034        component.num_modules */
1035     mca_btl_usnic_component.num_modules = num_final_modules;
1036 
1037     /* We've packed all the modules and pointers to those modules in
1038        the lower ends of their respective arrays.  If not all the
1039        modules initialized successfully, we're wasting a little space.
1040        We could realloc and re-form the btls[] array, but it doesn't
1041        seem worth it.  Just waste a little space.
1042 
1043        That being said, if we ended up with zero acceptable devices,
1044        then free everything. */
1045     if (0 == num_final_modules) {
1046         opal_output_verbose(5, USNIC_OUT,
1047                             "btl:usnic: returning 0 modules");
1048         goto error;
1049     }
1050 
1051     /* we have a nonzero number of modules, so save a copy of the btls array
1052      * for later use */
1053     memcpy(mca_btl_usnic_component.usnic_active_modules, btls,
1054            num_final_modules * sizeof(*btls));
1055 
1056     /* Loop over the modules and find the minimum value for
1057        module->numa_distance.  For every module that has a
1058        numa_distance higher than the minimum value, increase its btl
1059        latency rating so that the PML will prefer to send short
1060        messages over "near" modules. */
1061     min_distance = 9999999;
1062     for (i = 0; i < mca_btl_usnic_component.num_modules; ++i) {
1063         module = (opal_btl_usnic_module_t*) btls[i];
1064         if (module->numa_distance < min_distance) {
1065             min_distance = module->numa_distance;
1066         }
1067     }
1068     for (i = 0; i < mca_btl_usnic_component.num_modules; ++i) {
1069         module = (opal_btl_usnic_module_t*) btls[i];
1070         if (module->numa_distance > min_distance) {
1071             ++module->super.btl_latency;
1072             opal_output_verbose(5, USNIC_OUT,
1073                                 "btl:usnic: %s is far from me; increasing latency rating",
1074                                 module->if_ipv4_addr_str);
1075         }
1076     }
1077 
1078     /* start timer to guarantee synthetic clock advances */
1079     opal_event_set(opal_sync_event_base, &usnic_clock_timer_event,
1080                    -1, 0, usnic_clock_callback,
1081                    &usnic_clock_timeout);
1082     usnic_clock_timer_event_set = true;
1083 
1084     /* 1ms timer */
1085     usnic_clock_timeout.tv_sec = 0;
1086     usnic_clock_timeout.tv_usec = 1000;
1087     opal_event_add(&usnic_clock_timer_event, &usnic_clock_timeout);
1088 
1089     /* Setup MPI_T performance variables */
1090     opal_btl_usnic_setup_mpit_pvars();
1091 
1092     /* All done */
1093     *num_btl_modules = mca_btl_usnic_component.num_modules;
1094     opal_output_verbose(5, USNIC_OUT,
1095                         "btl:usnic: returning %d modules", *num_btl_modules);
1096 
1097  send_modex:
1098     usnic_modex_send();
1099     return btls;
1100 
1101  error:
1102     /* clean up as much allocated memory as possible */
1103     free(btls);
1104     btls = NULL;
1105     free(mca_btl_usnic_component.usnic_all_modules);
1106     mca_btl_usnic_component.usnic_all_modules = NULL;
1107     free(mca_btl_usnic_component.usnic_active_modules);
1108     mca_btl_usnic_component.usnic_active_modules = NULL;
1109 
1110     /* free filter if created */
1111     if (filter != NULL) {
1112         free_filter(filter);
1113         filter = NULL;
1114     }
1115 
1116     goto send_modex;
1117 }
1118 
1119 /*
1120  * Component progress
1121  * The fast-path of an incoming packet available on the priority
1122  * receive queue is handled directly in this routine, everything else
1123  * is deferred to an external call, usnic_component_progress_2()
1124  * This helps keep usnic_component_progress() very small and very responsive
1125  * to a single incoming packet.  We make sure not to always return
1126  * immediately after one packet to avoid starvation, "fastpath_ok" is
1127  * used for this.
1128  */
1129 static int usnic_handle_completion(opal_btl_usnic_module_t* module,
1130     opal_btl_usnic_channel_t *channel, struct fi_cq_entry *completion);
1131 static int usnic_component_progress_2(void);
1132 static void usnic_handle_cq_error(opal_btl_usnic_module_t* module,
1133     opal_btl_usnic_channel_t *channel, int cq_ret);
1134 
1135 static int usnic_component_progress(void)
1136 {
1137     int i;
1138     int count;
1139     opal_btl_usnic_recv_segment_t* rseg;
1140     opal_btl_usnic_module_t* module;
1141     struct fi_cq_entry completion;
1142     opal_btl_usnic_channel_t *channel;
1143     static bool fastpath_ok = true;
1144 
1145     /* update our simulated clock */
1146     opal_btl_usnic_ticks += 5000;
1147 
1148     count = 0;
1149     if (fastpath_ok) {
1150         for (i = 0; i < mca_btl_usnic_component.num_modules; i++) {
1151             module = mca_btl_usnic_component.usnic_active_modules[i];
1152             channel = &module->mod_channels[USNIC_PRIORITY_CHANNEL];
1153 
1154             assert(channel->chan_deferred_recv == NULL);
1155 
1156             int ret = fi_cq_read(channel->cq, &completion, 1);
1157             assert(0 != ret);
1158             if (OPAL_LIKELY(1 == ret)) {
1159                 opal_memchecker_base_mem_defined(&completion,
1160                                                  sizeof(completion));
1161                 rseg = (opal_btl_usnic_recv_segment_t*) completion.op_context;
1162                 if (OPAL_LIKELY(OPAL_BTL_USNIC_SEG_RECV ==
1163                             rseg->rs_base.us_type)) {
1164                     opal_btl_usnic_recv_fast(module, rseg, channel);
1165                     ++module->stats.num_seg_total_completions;
1166                     ++module->stats.num_seg_recv_completions;
1167                     fastpath_ok = false;    /* prevent starvation */
1168                     return 1;
1169                 } else {
1170                     count += usnic_handle_completion(module, channel,
1171                                                      &completion);
1172                 }
1173             } else if (OPAL_LIKELY(-FI_EAGAIN == ret)) {
1174                 continue;
1175             } else {
1176                 usnic_handle_cq_error(module, channel, ret);
1177             }
1178         }
1179     }
1180 
1181     fastpath_ok = true;
1182     return count + usnic_component_progress_2();
1183 }
1184 
1185 static int usnic_handle_completion(
1186     opal_btl_usnic_module_t* module,
1187     opal_btl_usnic_channel_t *channel,
1188     struct fi_cq_entry *completion)
1189 {
1190     opal_btl_usnic_segment_t* seg;
1191     opal_btl_usnic_recv_segment_t* rseg;
1192 
1193     seg = (opal_btl_usnic_segment_t*)completion->op_context;
1194     rseg = (opal_btl_usnic_recv_segment_t*)seg;
1195 
1196     ++module->stats.num_seg_total_completions;
1197 
1198     /* Make the completion be Valgrind-defined */
1199     opal_memchecker_base_mem_defined(seg, sizeof(*seg));
1200 
1201     OPAL_THREAD_LOCK(&btl_usnic_lock);
1202 
1203     /* Handle work completions */
1204     switch(seg->us_type) {
1205 
1206     /**** Send ACK completions ****/
1207     case OPAL_BTL_USNIC_SEG_ACK:
1208         ++module->stats.num_seg_ack_completions;
1209         opal_btl_usnic_ack_complete(module,
1210                 (opal_btl_usnic_ack_segment_t *)seg);
1211         break;
1212 
1213     /**** Send of frag segment completion (i.e., the MPI message's
1214           one-and-only segment has completed sending) ****/
1215     case OPAL_BTL_USNIC_SEG_FRAG:
1216         ++module->stats.num_seg_frag_completions;
1217         opal_btl_usnic_frag_send_complete(module,
1218                 (opal_btl_usnic_frag_segment_t*)seg);
1219         break;
1220 
1221     /**** Send of chunk segment completion (i.e., part of a large MPI
1222           message is done sending) ****/
1223     case OPAL_BTL_USNIC_SEG_CHUNK:
1224         ++module->stats.num_seg_chunk_completions;
1225         opal_btl_usnic_chunk_send_complete(module,
1226                 (opal_btl_usnic_chunk_segment_t*)seg);
1227         break;
1228 
1229     /**** Receive completions ****/
1230     case OPAL_BTL_USNIC_SEG_RECV:
1231         ++module->stats.num_seg_recv_completions;
1232         opal_btl_usnic_recv(module, rseg, channel);
1233         break;
1234 
1235     default:
1236         BTL_ERROR(("Unhandled completion segment type %d", seg->us_type));
1237         break;
1238     }
1239 
1240     OPAL_THREAD_UNLOCK(&btl_usnic_lock);
1241     return 1;
1242 }
1243 
1244 static void
1245 usnic_handle_cq_error(opal_btl_usnic_module_t* module,
1246     opal_btl_usnic_channel_t *channel, int cq_ret)
1247 {
1248     int rc;
1249     struct fi_cq_err_entry err_entry;
1250     opal_btl_usnic_recv_segment_t* rseg;
1251 
1252     if (cq_ret != -FI_EAVAIL) {
1253         BTL_ERROR(("%s: cq_read ret = %d (%s)",
1254                module->linux_device_name, cq_ret,
1255                fi_strerror(-cq_ret)));
1256         channel->chan_error = true;
1257     }
1258 
1259     rc = fi_cq_readerr(channel->cq, &err_entry, 0);
1260     if (rc == -FI_EAGAIN) {
1261         return;
1262     } else if (rc != 1) {
1263         BTL_ERROR(("%s: cq_readerr ret = %d (expected 1)",
1264                    module->linux_device_name, rc));
1265         channel->chan_error = true;
1266     }
1267 
1268     /* Silently count CRC errors.  Truncation errors are usually a
1269        different symptom of a CRC error. */
1270     else if (FI_ECRC == err_entry.prov_errno ||
1271              FI_ETRUNC == err_entry.prov_errno) {
1272 #if MSGDEBUG1
1273         static int once = 0;
1274         if (once++ == 0) {
1275             BTL_ERROR(("%s: Channel %d, %s",
1276                        module->linux_device_name,
1277                        channel->chan_index,
1278                        FI_ECRC == err_entry.prov_errno ?
1279                        "CRC error" : "message truncation"));
1280         }
1281 #endif
1282 
1283         /* silently count CRC errors */
1284         ++module->stats.num_crc_errors;
1285 
1286         /* repost segment */
1287         ++module->stats.num_recv_reposts;
1288 
1289         /* Add recv to linked list for reposting */
1290         rseg = err_entry.op_context;
1291         if (OPAL_BTL_USNIC_SEG_RECV == rseg->rs_base.us_type) {
1292             rseg->rs_next = channel->repost_recv_head;
1293             channel->repost_recv_head = rseg;
1294         }
1295     } else {
1296         BTL_ERROR(("%s: CQ[%d] prov_err = %d",
1297                    module->linux_device_name, channel->chan_index,
1298                    err_entry.prov_errno));
1299         channel->chan_error = true;
1300     }
1301 }
1302 
1303 static int usnic_component_progress_2(void)
1304 {
1305     int i, j, count = 0, num_events, ret;
1306     opal_btl_usnic_module_t* module;
1307     static struct fi_cq_entry completions[OPAL_BTL_USNIC_NUM_COMPLETIONS];
1308     opal_btl_usnic_channel_t *channel;
1309     int rc;
1310     int c;
1311 
1312     /* update our simulated clock */
1313     opal_btl_usnic_ticks += 5000;
1314 
1315     /* Poll for completions */
1316     for (i = 0; i < mca_btl_usnic_component.num_modules; i++) {
1317         module = mca_btl_usnic_component.usnic_active_modules[i];
1318 
1319         /* poll each channel */
1320         for (c=0; c<USNIC_NUM_CHANNELS; ++c) {
1321             channel = &module->mod_channels[c];
1322 
1323             if (channel->chan_deferred_recv != NULL) {
1324                 (void) opal_btl_usnic_recv_frag_bookkeeping(module,
1325                         channel->chan_deferred_recv, channel);
1326                 channel->chan_deferred_recv = NULL;
1327             }
1328 
1329             num_events = ret =
1330                 fi_cq_read(channel->cq, completions,
1331                            OPAL_BTL_USNIC_NUM_COMPLETIONS);
1332             assert(0 != ret);
1333             opal_memchecker_base_mem_defined(&ret, sizeof(ret));
1334             if (OPAL_UNLIKELY(ret < 0 && -FI_EAGAIN != ret)) {
1335                 usnic_handle_cq_error(module, channel, num_events);
1336                 num_events = 0;
1337             } else if (-FI_EAGAIN == ret) {
1338                 num_events = 0;
1339             }
1340 
1341             opal_memchecker_base_mem_defined(completions,
1342                                              sizeof(completions[0]) *
1343                                              num_events);
1344             /* Handle each event */
1345             for (j = 0; j < num_events; j++) {
1346                 count += usnic_handle_completion(module, channel,
1347                                                  &completions[j]);
1348             }
1349 
1350             /* return error if detected - this may be slightly deferred
1351              * since fastpath avoids the "if" of checking this.
1352              */
1353             if (channel->chan_error) {
1354                 channel->chan_error = false;
1355                 return OPAL_ERROR;
1356             }
1357 
1358             /* progress sends */
1359             opal_btl_usnic_module_progress_sends(module);
1360 
1361             /* Re-post all the remaining receive buffers */
1362             if (OPAL_LIKELY(NULL != channel->repost_recv_head)) {
1363                 rc = opal_btl_usnic_post_recv_list(channel);
1364                 if (OPAL_UNLIKELY(rc != 0)) {
1365                     BTL_ERROR(("error posting recv: %s\n", strerror(errno)));
1366                     return OPAL_ERROR;
1367                 }
1368             }
1369         }
1370     }
1371 
1372     return count;
1373 }
1374 
1375 /* could take indent as a parameter instead of hard-coding it */
1376 static void dump_endpoint(opal_btl_usnic_endpoint_t *endpoint)
1377 {
1378     int i;
1379     opal_btl_usnic_frag_t *frag;
1380     opal_btl_usnic_send_segment_t *sseg;
1381     struct in_addr ia;
1382     char ep_addr_str[INET_ADDRSTRLEN];
1383     char tmp[128], str[2048];
1384 
1385     memset(ep_addr_str, 0x00, sizeof(ep_addr_str));
1386     ia.s_addr = endpoint->endpoint_remote_modex.ipv4_addr;
1387     inet_ntop(AF_INET, &ia, ep_addr_str, sizeof(ep_addr_str));
1388 
1389     opal_output(0, "    endpoint %p, %s job=%u, rank=%u rts=%s s_credits=%"PRIi32"\n",
1390                 (void *)endpoint, ep_addr_str,
1391                 endpoint->endpoint_proc->proc_opal->proc_name.jobid,
1392                 endpoint->endpoint_proc->proc_opal->proc_name.vpid,
1393                 (endpoint->endpoint_ready_to_send ? "true" : "false"),
1394                 endpoint->endpoint_send_credits);
1395     opal_output(0, "      endpoint->frag_send_queue:\n");
1396 
1397     OPAL_LIST_FOREACH(frag, &endpoint->endpoint_frag_send_queue,
1398                       opal_btl_usnic_frag_t) {
1399         opal_btl_usnic_small_send_frag_t *ssfrag;
1400         opal_btl_usnic_large_send_frag_t *lsfrag;
1401 
1402         snprintf(str, sizeof(str), "      --> frag %p, %s", (void *)frag,
1403                  usnic_frag_type(frag->uf_type));
1404         switch (frag->uf_type) {
1405             case OPAL_BTL_USNIC_FRAG_LARGE_SEND:
1406                 lsfrag = (opal_btl_usnic_large_send_frag_t *)frag;
1407                 snprintf(tmp, sizeof(tmp), " tag=%"PRIu8" id=%"PRIu32" offset=%llu/%llu post_cnt=%"PRIu32" ack_bytes_left=%llu\n",
1408                         lsfrag->lsf_tag,
1409                         lsfrag->lsf_frag_id,
1410                         (unsigned long long)lsfrag->lsf_cur_offset,
1411                         (unsigned long long)lsfrag->lsf_base.sf_size,
1412                         lsfrag->lsf_base.sf_seg_post_cnt,
1413                         (unsigned long long)lsfrag->lsf_base.sf_ack_bytes_left);
1414                 strncat(str, tmp, sizeof(str) - strlen(str) - 1);
1415                 opal_output(0, "%s", str);
1416 
1417                 OPAL_LIST_FOREACH(sseg, &lsfrag->lsf_seg_chain,
1418                                   opal_btl_usnic_send_segment_t) {
1419                     /* chunk segs are just typedefs to send segs */
1420                     opal_output(0, "        chunk seg %p, chan=%s hotel=%d times_posted=%"PRIu32" pending=%s\n",
1421                                 (void *)sseg,
1422                                 (USNIC_PRIORITY_CHANNEL == sseg->ss_channel ?
1423                                 "prio" : "data"),
1424                                 sseg->ss_hotel_room,
1425                                 sseg->ss_send_posted,
1426                                 (sseg->ss_ack_pending ? "true" : "false"));
1427                 }
1428             break;
1429 
1430             case OPAL_BTL_USNIC_FRAG_SMALL_SEND:
1431                 ssfrag = (opal_btl_usnic_small_send_frag_t *)frag;
1432                 snprintf(tmp, sizeof(tmp), " sf_size=%llu post_cnt=%"PRIu32" ack_bytes_left=%llu\n",
1433                         (unsigned long long)ssfrag->ssf_base.sf_size,
1434                         ssfrag->ssf_base.sf_seg_post_cnt,
1435                         (unsigned long long)ssfrag->ssf_base.sf_ack_bytes_left);
1436                 strncat(str, tmp, sizeof(str) - strlen(str) - 1);
1437                 opal_output(0, "%s", str);
1438 
1439                 sseg = &ssfrag->ssf_segment;
1440                 opal_output(0, "        small seg %p, chan=%s hotel=%d times_posted=%"PRIu32" pending=%s\n",
1441                     (void *)sseg,
1442                     (USNIC_PRIORITY_CHANNEL == sseg->ss_channel ?
1443                         "prio" : "data"),
1444                     sseg->ss_hotel_room,
1445                     sseg->ss_send_posted,
1446                     (sseg->ss_ack_pending ? "true" : "false"));
1447             break;
1448 
1449             case OPAL_BTL_USNIC_FRAG_PUT_DEST:
1450                 /* put_dest frags are just a typedef to generic frags */
1451                 snprintf(tmp, sizeof(tmp), " put_addr=%p\n", frag->uf_remote_seg[0].seg_addr.pval);
1452                 strncat(str, tmp, sizeof(str) - strlen(str) - 1);
1453                 opal_output(0, "%s", str);
1454             break;
1455         }
1456     }
1457 
1458     /* Now examine the hotel for this endpoint and dump any segments we find
1459      * there.  Yes, this peeks at members that are technically "private", so
1460      * eventually this should be done through some sort of debug or iteration
1461      * interface in the hotel code. */
1462     opal_output(0, "      endpoint->endpoint_sent_segs (%p):\n",
1463            (void *)endpoint->endpoint_sent_segs);
1464     for (i = 0; i < WINDOW_SIZE; ++i) {
1465         sseg = endpoint->endpoint_sent_segs[i];
1466         if (NULL != sseg) {
1467             opal_output(0, "        [%d] sseg=%p %s chan=%s hotel=%d times_posted=%"PRIu32" pending=%s\n",
1468                    i,
1469                    (void *)sseg,
1470                    usnic_seg_type_str(sseg->ss_base.us_type),
1471                    (USNIC_PRIORITY_CHANNEL == sseg->ss_channel ?
1472                     "prio" : "data"),
1473                    sseg->ss_hotel_room,
1474                    sseg->ss_send_posted,
1475                    (sseg->ss_ack_pending ? "true" : "false"));
1476         }
1477     }
1478 
1479     opal_output(0, "      ack_needed=%s n_t=%"UDSEQ" n_a=%"UDSEQ" n_r=%"UDSEQ" n_s=%"UDSEQ" rfstart=%"PRIu32"\n",
1480                 (endpoint->endpoint_ack_needed?"true":"false"),
1481                 endpoint->endpoint_next_seq_to_send,
1482                 endpoint->endpoint_ack_seq_rcvd,
1483                 endpoint->endpoint_next_contig_seq_to_recv,
1484                 endpoint->endpoint_highest_seq_rcvd,
1485                 endpoint->endpoint_rfstart);
1486 
1487     if (dump_bitvectors) {
1488         opal_btl_usnic_snprintf_bool_array(str, sizeof(str),
1489                                            endpoint->endpoint_rcvd_segs,
1490                                            WINDOW_SIZE);
1491         opal_output(0, "      rcvd_segs 0x%s", str);
1492     }
1493 }
1494 
1495 void opal_btl_usnic_component_debug(void)
1496 {
1497     int i;
1498     opal_btl_usnic_module_t *module;
1499     opal_btl_usnic_endpoint_t *endpoint;
1500     opal_btl_usnic_send_segment_t *sseg;
1501     opal_list_item_t *item;
1502     const opal_proc_t *proc = opal_proc_local_get();
1503 
1504     opal_output(0, "*** dumping usnic state for MPI_COMM_WORLD rank %u ***\n",
1505                 proc->proc_name.vpid);
1506     for (i = 0; i < (int)mca_btl_usnic_component.num_modules; ++i) {
1507         module = mca_btl_usnic_component.usnic_active_modules[i];
1508 
1509         opal_output(0, "active_modules[%d]=%p %s max{frag,chunk,tiny}=%llu,%llu,%llu\n",
1510                i, (void *)module, module->linux_device_name,
1511                (unsigned long long)module->max_frag_payload,
1512                (unsigned long long)module->max_chunk_payload,
1513                (unsigned long long)module->max_tiny_payload);
1514 
1515         opal_output(0, "  endpoints_with_sends:\n");
1516         OPAL_LIST_FOREACH(endpoint, &module->endpoints_with_sends,
1517                           opal_btl_usnic_endpoint_t) {
1518             dump_endpoint(endpoint);
1519         }
1520 
1521         opal_output(0, "  endpoints_that_need_acks:\n");
1522         OPAL_LIST_FOREACH(endpoint, &module->endpoints_that_need_acks,
1523                           opal_btl_usnic_endpoint_t) {
1524             dump_endpoint(endpoint);
1525         }
1526 
1527         /* the all_endpoints list uses a different list item member */
1528         opal_output(0, "  all_endpoints:\n");
1529         opal_mutex_lock(&module->all_endpoints_lock);
1530         item = opal_list_get_first(&module->all_endpoints);
1531         while (item != opal_list_get_end(&module->all_endpoints)) {
1532             endpoint = container_of(item, mca_btl_base_endpoint_t,
1533                                     endpoint_endpoint_li);
1534             item = opal_list_get_next(item);
1535             dump_endpoint(endpoint);
1536         }
1537         opal_mutex_unlock(&module->all_endpoints_lock);
1538 
1539         opal_output(0, "  pending_resend_segs:\n");
1540         OPAL_LIST_FOREACH(sseg, &module->pending_resend_segs,
1541                           opal_btl_usnic_send_segment_t) {
1542             opal_output(0, "    sseg %p\n", (void *)sseg);
1543         }
1544 
1545         opal_btl_usnic_print_stats(module, "  manual", /*reset=*/false);
1546     }
1547 }
1548 
1549 #include "test/btl_usnic_component_test.h"

/* [<][>][^][v][top][bottom][index][help] */