root/opal/mca/btl/ugni/btl_ugni_component.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mca_btl_ugni_get_stat
  2. mca_btl_ugni_notify_stat
  3. btl_ugni_component_register
  4. btl_ugni_component_open
  5. btl_ugni_component_close
  6. mca_btl_ugni_component_init
  7. mca_btl_ugni_progress_datagram
  8. mca_btl_ugni_handle_rdma_completions
  9. mca_btl_ugni_progress_rdma
  10. mca_btl_ugni_progress_wait_list
  11. mca_btl_ugni_component_progress
  12. mca_btl_ugni_flush
  13. btl_ugni_dump_post_desc

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
   4  *                         reserved.
   5  * Copyright (c) 2011      UT-Battelle, LLC. All rights reserved.
   6  * Copyright (c) 2017      Intel, Inc.  All rights reserved.
   7  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
   8  * $COPYRIGHT$
   9  *
  10  * Additional copyrights may follow
  11  *
  12  * $HEADER$
  13  */
  14 
  15 #include "btl_ugni.h"
  16 #include "btl_ugni_frag.h"
  17 #include "btl_ugni_rdma.h"
  18 #include "btl_ugni_smsg.h"
  19 
  20 #include "opal/util/sys_limits.h"
  21 #include "opal/util/printf.h"
  22 
  23 #include <stdlib.h>
  24 #include <fcntl.h>
  25 #include <ctype.h>
  26 
  27 #include "opal/memoryhooks/memory.h"
  28 #include "opal/runtime/opal_params.h"
  29 
  30 #include "opal/mca/base/mca_base_pvar.h"
  31 
  32 static int btl_ugni_component_register(void);
  33 static int btl_ugni_component_open(void);
  34 static int btl_ugni_component_close(void);
  35 static mca_btl_base_module_t **mca_btl_ugni_component_init(int *, bool, bool);
  36 static int mca_btl_ugni_component_progress(void);
  37 static unsigned long mca_btl_ugni_ugni_page_size = 0;
  38 
  39 mca_btl_ugni_component_t mca_btl_ugni_component = {
  40     .super = {
  41         /* First, the mca_base_component_t struct containing meta information
  42            about the component itself */
  43         .btl_version = {
  44             MCA_BTL_DEFAULT_VERSION("ugni"),
  45             .mca_open_component = btl_ugni_component_open,
  46             .mca_close_component = btl_ugni_component_close,
  47             .mca_register_component_params = btl_ugni_component_register,
  48         },
  49         .btl_data = {
  50             .param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
  51         },
  52         .btl_init = mca_btl_ugni_component_init,
  53         .btl_progress = mca_btl_ugni_component_progress,
  54     }
  55 };
  56 
  57 mca_base_var_enum_value_t rcache_values[] = {
  58     {MCA_BTL_UGNI_RCACHE_UDREG, "udreg"},
  59     {MCA_BTL_UGNI_RCACHE_GRDMA, "grdma"},
  60     {-1, NULL} /* sentinal */
  61 };
  62 
  63 mca_base_var_enum_value_flag_t cdm_flags[] = {
  64     {.flag = GNI_CDM_MODE_FORK_NOCOPY, .string = "fork-no-copy", .conflicting_flag = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_FORK_PARTCOPY},
  65     {.flag = GNI_CDM_MODE_FORK_FULLCOPY, .string = "fork-full-copy", .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_PARTCOPY},
  66     {.flag = GNI_CDM_MODE_FORK_PARTCOPY, .string = "fork-part-copy", .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_FULLCOPY},
  67     {.flag = GNI_CDM_MODE_ERR_NO_KILL, .string = "err-no-kill", .conflicting_flag = GNI_CDM_MODE_ERR_ALL_KILL},
  68     {.flag = GNI_CDM_MODE_ERR_ALL_KILL, .string = "err-all-kill", .conflicting_flag = GNI_CDM_MODE_ERR_NO_KILL},
  69     {.flag = GNI_CDM_MODE_FAST_DATAGRAM_POLL, .string = "fast-datagram-poll", .conflicting_flag = 0},
  70     {.flag = GNI_CDM_MODE_BTE_SINGLE_CHANNEL, .string = "bte-single-channel", .conflicting_flag = 0},
  71     {.flag = GNI_CDM_MODE_USE_PCI_IOMMU, .string = "use-pci-iommu", .conflicting_flag = 0},
  72     {.flag = GNI_CDM_MODE_MDD_DEDICATED, .string = "mdd-dedicated", .conflicting_flag = GNI_CDM_MODE_MDD_SHARED},
  73     {.flag = GNI_CDM_MODE_MDD_SHARED, .string = "mdd-shared", .conflicting_flag = GNI_CDM_MODE_MDD_DEDICATED},
  74     {.flag = GNI_CDM_MODE_FMA_DEDICATED, .string = "fma-dedicated", .conflicting_flag = GNI_CDM_MODE_FMA_SHARED},
  75     {.flag = GNI_CDM_MODE_FMA_SHARED, .string = "fma-shared", .conflicting_flag = GNI_CDM_MODE_FMA_DEDICATED},
  76     {.flag = GNI_CDM_MODE_CACHED_AMO_ENABLED, .string = "cached-amo-enabled", .conflicting_flag = 0},
  77     {.flag = GNI_CDM_MODE_CQ_NIC_LOCAL_PLACEMENT, .string = "cq-nic-placement", .conflicting_flag = 0},
  78     {.flag = GNI_CDM_MODE_FMA_SMALL_WINDOW, .string = "fma-small-window", .conflicting_flag = 0},
  79     {.string = NULL}
  80 };
  81 
  82 static inline int mca_btl_ugni_get_stat (const mca_base_pvar_t *pvar, void *value, void *obj)
  83 {
  84     gni_statistic_t statistic = (gni_statistic_t) (intptr_t) pvar->ctx;
  85     gni_return_t rc = GNI_RC_SUCCESS;
  86 
  87     for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
  88         rc = GNI_GetNicStat (mca_btl_ugni_component.modules[0].devices[i].dev_handle, statistic,
  89                              ((unsigned int *) value) + i);
  90     }
  91 
  92     return mca_btl_rc_ugni_to_opal (rc);
  93 }
  94 
  95 static inline int mca_btl_ugni_notify_stat (mca_base_pvar_t *pvar, mca_base_pvar_event_t event, void *obj, int *count)
  96 {
  97     if (MCA_BASE_PVAR_HANDLE_BIND == event) {
  98         /* one value for each virtual device handle */
  99         *count = mca_btl_ugni_component.virtual_device_count;
 100     }
 101 
 102     return OPAL_SUCCESS;
 103 }
 104 
 105 static int btl_ugni_component_register(void)
 106 {
 107     mca_base_var_enum_t *new_enum;
 108     gni_nic_device_t device_type;
 109     char *mpool_hints_tmp = NULL;
 110     int rc;
 111 
 112     (void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version,
 113                                                  "uGNI byte transport layer");
 114 
 115     mca_btl_ugni_component.ugni_free_list_num = 8;
 116     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 117                                            "free_list_num", NULL, MCA_BASE_VAR_TYPE_INT,
 118                                            NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
 119                                            OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
 120                                            &mca_btl_ugni_component.ugni_free_list_num);
 121     mca_btl_ugni_component.ugni_free_list_max = 4096;
 122     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 123                                            "free_list_max", NULL, MCA_BASE_VAR_TYPE_INT,
 124                                            NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
 125                                            OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
 126                                            &mca_btl_ugni_component.ugni_free_list_max);
 127     mca_btl_ugni_component.ugni_free_list_inc = 64;
 128     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 129                                            "free_list_inc", NULL, MCA_BASE_VAR_TYPE_INT,
 130                                            NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
 131                                            OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
 132                                            &mca_btl_ugni_component.ugni_free_list_inc);
 133 
 134     mca_btl_ugni_component.ugni_eager_num = 16;
 135     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 136                                            "eager_num", NULL, MCA_BASE_VAR_TYPE_INT,
 137                                            NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
 138                                            OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
 139                                            &mca_btl_ugni_component.ugni_eager_num);
 140     mca_btl_ugni_component.ugni_eager_max = 128;
 141     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 142                                            "eager_max", NULL, MCA_BASE_VAR_TYPE_INT,
 143                                            NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
 144                                            OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
 145                                            &mca_btl_ugni_component.ugni_eager_max);
 146     mca_btl_ugni_component.ugni_eager_inc = 16;
 147     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 148                                            "eager_inc", NULL, MCA_BASE_VAR_TYPE_INT,
 149                                            NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
 150                                            OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
 151                                            &mca_btl_ugni_component.ugni_eager_inc);
 152 
 153     mca_btl_ugni_component.remote_cq_size = 40000;
 154     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 155                                            "remote_cq_size", "Remote SMSG completion queue "
 156                                            "size (default 40000)", MCA_BASE_VAR_TYPE_INT,
 157                                            NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
 158                                            OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
 159                                            &mca_btl_ugni_component.remote_cq_size);
 160 
 161     mca_btl_ugni_component.local_cq_size = 8192;
 162     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 163                                            "local_cq_size", "Local SMSG completion queue size "
 164                                            "(default 8k)", MCA_BASE_VAR_TYPE_INT,
 165                                            NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
 166                                            OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
 167                                            &mca_btl_ugni_component.local_cq_size);
 168 
 169     mca_btl_ugni_component.local_rdma_cq_size = 1024;
 170     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 171                                            "local_rdma_cq_size", "Local FMA/RDMA completion queue size "
 172                                            "(default: 1024)",MCA_BASE_VAR_TYPE_INT, NULL, 0,
 173                                            MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
 174                                            MCA_BASE_VAR_SCOPE_LOCAL,
 175                                            &mca_btl_ugni_component.local_rdma_cq_size);
 176 
 177     mca_btl_ugni_component.ugni_smsg_limit = 0;
 178     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 179                                            "smsg_limit", "Maximum size message that "
 180                                            "will be sent using the SMSG/MSGQ protocol "
 181                                            "(0 - autoselect(default), 16k max)",
 182                                            MCA_BASE_VAR_TYPE_INT, NULL, 0,
 183                                            MCA_BASE_VAR_FLAG_SETTABLE,
 184                                            OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
 185                                            &mca_btl_ugni_component.ugni_smsg_limit);
 186 
 187     mca_btl_ugni_component.smsg_max_credits = 32;
 188     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 189                                            "smsg_max_credits", "Maximum number of "
 190                                            "outstanding SMSG/MSGQ message (default 32)",
 191                                            MCA_BASE_VAR_TYPE_INT, NULL, 0,
 192                                            MCA_BASE_VAR_FLAG_SETTABLE,
 193                                            OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
 194                                            &mca_btl_ugni_component.smsg_max_credits);
 195 
 196 #if OPAL_C_HAVE__THREAD_LOCAL
 197     mca_btl_ugni_component.bind_threads_to_devices = true;
 198 
 199     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 200                                            "bind_devices", "Bind threads to virtual "
 201                                            "devices. In general this should improve "
 202                                            "RDMA performance (default: true)",
 203                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
 204                                            MCA_BASE_VAR_FLAG_SETTABLE,
 205                                            OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
 206                                            &mca_btl_ugni_component.bind_threads_to_devices);
 207 #endif
 208 
 209     mca_btl_ugni_component.ugni_fma_limit = -1;
 210     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 211                                            "fma_limit", "Default maximum size message that "
 212                                            "will be sent using the FMA (Fast Memory "
 213                                            "Access) protocol (default: -1 (don't use), 64k max)",
 214                                            MCA_BASE_VAR_TYPE_LONG, NULL, 0,
 215                                            MCA_BASE_VAR_FLAG_SETTABLE | MCA_BASE_VAR_FLAG_DEPRECATED,
 216                                            OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
 217                                            &mca_btl_ugni_component.ugni_fma_limit);
 218 
 219     mca_btl_ugni_component.ugni_fma_get_limit = 2048;
 220     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 221                                            "fma_get_limit", "Maximum size message that "
 222                                            "will be sent using the FMA (Fast Memory "
 223                                            "Access) protocol for get (default 2k, "
 224                                            "64k max)",
 225                                            MCA_BASE_VAR_TYPE_LONG, NULL, 0,
 226                                            MCA_BASE_VAR_FLAG_SETTABLE,
 227                                            OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
 228                                            &mca_btl_ugni_component.ugni_fma_get_limit);
 229 
 230     mca_btl_ugni_component.ugni_fma_put_limit = 4096;
 231     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 232                                            "fma_put_limit", "Maximum size message that "
 233                                            "will be sent using the FMA (Fast Memory "
 234                                            "Access) protocol for put (default: 4k, "
 235                                            "64k max)",
 236                                            MCA_BASE_VAR_TYPE_LONG, NULL, 0,
 237                                            MCA_BASE_VAR_FLAG_SETTABLE,
 238                                            OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
 239                                            &mca_btl_ugni_component.ugni_fma_put_limit);
 240 
 241     mca_btl_ugni_component.rdma_max_retries = 16;
 242     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 243                                            "rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
 244                                            NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
 245                                            OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
 246                                            &mca_btl_ugni_component.rdma_max_retries);
 247 
 248     mca_btl_ugni_component.smsg_max_retries = 16;
 249     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 250                                            "smsg_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
 251                                            NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
 252                                            OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
 253                                            &mca_btl_ugni_component.smsg_max_retries);
 254 
 255     mca_btl_ugni_component.max_mem_reg = 0;
 256     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 257                                            "max_mem_reg", "Maximum number of "
 258                                            "memory registrations a process can "
 259                                            "hold (0 - autoselect, -1 - unlimited)"
 260                                            " (default 0)", MCA_BASE_VAR_TYPE_INT,
 261                                            NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
 262                                            OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
 263                                            &mca_btl_ugni_component.max_mem_reg);
 264 
 265     mca_btl_ugni_component.mbox_increment = 0;
 266     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 267                                            "mbox_inc", "Number of SMSG mailboxes to "
 268                                            "allocate in each block (0 - autoselect(default))",
 269                                            MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
 270                                            MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
 271                                            MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mbox_increment);
 272 
 273     /* communication domain flags */
 274     rc = mca_base_var_enum_create_flag ("btl_ugni_cdm_flags", cdm_flags, (mca_base_var_enum_flag_t **) &new_enum);
 275     if (OPAL_SUCCESS != rc) {
 276         return rc;
 277     }
 278 
 279     mca_btl_ugni_component.cdm_flags = GNI_CDM_MODE_FORK_PARTCOPY | GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL |
 280         GNI_CDM_MODE_MDD_SHARED | GNI_CDM_MODE_FMA_SHARED | GNI_CDM_MODE_FMA_SMALL_WINDOW;
 281     mca_btl_ugni_component.cdm_flags_id = mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 282                                            "cdm_flags", "Flags to set when creating a communication domain "
 283                                            " (default: fork-full-copy,cached-amo-enabled,err-no-kill,fast-datagram-poll,"
 284                                            "fma-shared,fma-small-window)",
 285                                            MCA_BASE_VAR_TYPE_UNSIGNED_INT, new_enum, 0,
 286                                            MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
 287                                            MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.cdm_flags);
 288     OBJ_RELEASE(new_enum);
 289 
 290     mca_btl_ugni_component.virtual_device_count = 0;
 291     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 292                                            "virtual_device_count", "Number of virtual devices to create. Higher numbers may "
 293                                            "result in better performance when using threads. (default: 0 (auto), max: 128)",
 294                                            MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
 295                                            MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
 296                                            MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.virtual_device_count);
 297 
 298     /* determine if there are get alignment restrictions */
 299     GNI_GetDeviceType (&device_type);
 300 
 301 
 302     mca_btl_ugni_component.smsg_page_size = 2 << 20;
 303     if (GNI_DEVICE_GEMINI == device_type) {
 304         if (access ("/sys/class/gemini/ghal0/mrt", R_OK)) {
 305             int fd = open ("/sys/class/gemini/ghal0/mrt", O_RDONLY);
 306             char buffer[10];
 307 
 308             if (0 <= fd) {
 309                 memset (buffer, 0, sizeof (buffer));
 310                 read (fd, buffer, sizeof (buffer) - 1);
 311                 close (fd);
 312                 mca_btl_ugni_ugni_page_size = strtol (buffer, NULL, 10) * 1024;
 313                 mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size;
 314             }
 315         }
 316     }
 317 
 318     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 319                                            "smsg_page_size", "Page size to use for SMSG mailbox allocation (default: detect)",
 320                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
 321                                            MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.smsg_page_size);
 322 
 323     mca_btl_ugni_component.progress_thread_requested = 0;
 324     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 325                                            "request_progress_thread",
 326                                            "Enable to request ugni btl progress thread - requires MPI_THREAD_MULTIPLE support",
 327                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
 328                                            MCA_BASE_VAR_FLAG_SETTABLE,
 329                                            OPAL_INFO_LVL_3,
 330                                            MCA_BASE_VAR_SCOPE_LOCAL,
 331                                            &mca_btl_ugni_component.progress_thread_requested);
 332 
 333     /* performance variables */
 334     mca_btl_ugni_progress_thread_wakeups = 0;
 335     (void) mca_base_component_pvar_register(&mca_btl_ugni_component.super.btl_version,
 336                                             "progress_thread_wakeups", "Number of times the progress thread "
 337                                             "has been woken", OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_COUNTER,
 338                                             MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
 339                                             MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, NULL,
 340                                             NULL, NULL, &mca_btl_ugni_progress_thread_wakeups);
 341 
 342     /* register network statistics as performance variables */
 343     for (int i = 0 ; i < GNI_NUM_STATS ; ++i) {
 344         char name[128], desc[128];
 345         size_t str_len = strlen (gni_statistic_str[i]);
 346 
 347         assert (str_len < sizeof (name));
 348 
 349         /* we can get an all-caps string for the variable from gni_statistic_str. need to make it lowercase
 350          * to match ompi standards */
 351         for (size_t j = 0 ; j < str_len ; ++j) {
 352             name[j] = tolower (gni_statistic_str[i][j]);
 353             desc[j] = ('_' == name[j]) ? ' ' : name[j];
 354         }
 355 
 356         name[str_len] = '\0';
 357         desc[str_len] = '\0';
 358 
 359         (void) mca_base_component_pvar_register (&mca_btl_ugni_component.super.btl_version, name, desc,
 360                                                  OPAL_INFO_LVL_4, MCA_BASE_PVAR_CLASS_COUNTER,
 361                                                  MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
 362                                                  MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
 363                                                  mca_btl_ugni_get_stat, NULL, mca_btl_ugni_notify_stat,
 364                                                  (void *) (intptr_t) i);
 365     }
 366 
 367     /* btl/ugni can only support only a fixed set of rcache components (these rcache components have compatible resource
 368      * structures) */
 369     rc = mca_base_var_enum_create ("btl_ugni_rcache", rcache_values, &new_enum);
 370     if (OPAL_SUCCESS != rc) {
 371         return rc;
 372     }
 373 
 374     /* NTH: there are known *serious* performance issues with udreg. if they are ever resolved it is the preferred rcache */
 375     mca_btl_ugni_component.rcache_type = MCA_BTL_UGNI_RCACHE_GRDMA;
 376     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 377                                            "rcache", "registration cache to use (default: grdma)", MCA_BASE_VAR_TYPE_INT, new_enum,
 378                                            0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
 379                                            MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.rcache_type);
 380     OBJ_RELEASE(new_enum);
 381 
 382     if (mca_btl_ugni_ugni_page_size) {
 383         rc = opal_asprintf (&mpool_hints_tmp, "page_size=%lu", mca_btl_ugni_ugni_page_size);
 384         if (rc < 0) {
 385             return OPAL_ERR_OUT_OF_RESOURCE;
 386         }
 387 
 388         mca_btl_ugni_component.mpool_hints = mpool_hints_tmp;
 389     } else {
 390         mca_btl_ugni_component.mpool_hints = "page_size=2M";
 391     }
 392 
 393     (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
 394                                            "mpool_hints", "hints to use when selecting a memory pool (default: "
 395                                            "\"page_size=2M\")", MCA_BASE_VAR_TYPE_STRING, NULL, 0,
 396                                            MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
 397                                            MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mpool_hints);
 398     free (mpool_hints_tmp);
 399 
 400     /* ensure we loose send exclusivity to sm and vader if they are enabled */
 401     mca_btl_ugni_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 2;
 402 
 403     /* smsg threshold */
 404     mca_btl_ugni_module.super.btl_eager_limit               = 8 * 1024;
 405     mca_btl_ugni_module.super.btl_rndv_eager_limit          = 8 * 1024;
 406     mca_btl_ugni_module.super.btl_rdma_pipeline_frag_size   = 4 * 1024 * 1024;
 407     mca_btl_ugni_module.super.btl_max_send_size             = 8 * 1024;
 408     mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024;
 409 
 410     mca_btl_ugni_module.super.btl_get_limit = 1 * 1024 * 1024;
 411 
 412     /*
 413      * see def. of ALIGNMENT_MASK to figure this one out
 414      */
 415     /* both gemini and aries have a 4-byte alignment requirement on remote addresses */
 416     mca_btl_ugni_module.super.btl_get_alignment = 4;
 417 
 418     /* threshold for put */
 419     mca_btl_ugni_module.super.btl_min_rdma_pipeline_size    = 8 * 1024;
 420 
 421     mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
 422         MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_ATOMIC_OPS |
 423         MCA_BTL_FLAGS_ATOMIC_FOPS;
 424     mca_btl_ugni_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
 425         MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR |
 426         MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
 427 
 428     if (GNI_DEVICE_ARIES == device_type) {
 429         /* aries supports additional atomic operations */
 430         mca_btl_ugni_module.super.btl_atomic_flags |= MCA_BTL_ATOMIC_SUPPORTS_MIN | MCA_BTL_ATOMIC_SUPPORTS_MAX |
 431             MCA_BTL_ATOMIC_SUPPORTS_LAND | MCA_BTL_ATOMIC_SUPPORTS_LOR | MCA_BTL_ATOMIC_SUPPORTS_LXOR |
 432             MCA_BTL_ATOMIC_SUPPORTS_32BIT | MCA_BTL_ATOMIC_SUPPORTS_FLOAT;
 433     }
 434 
 435     mca_btl_ugni_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
 436 
 437     mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */
 438     mca_btl_ugni_module.super.btl_latency   = 2;     /* Microsecs */
 439 
 440     mca_btl_ugni_module.super.btl_get_local_registration_threshold = 0;
 441     mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_put_limit;
 442 
 443     /* Call the BTL based to register its MCA params */
 444     mca_btl_base_param_register(&mca_btl_ugni_component.super.btl_version,
 445                                 &mca_btl_ugni_module.super);
 446 
 447     return OPAL_SUCCESS;
 448 }
 449 
 450 static int
 451 btl_ugni_component_open(void)
 452 {
 453     mca_btl_ugni_component.ugni_num_btls = 0;
 454     mca_btl_ugni_component.modules = NULL;
 455 
 456     return OPAL_SUCCESS;
 457 }
 458 
 459 /*
 460  * component cleanup - sanity checking of queue lengths
 461  */
 462 static int
 463 btl_ugni_component_close(void)
 464 {
 465     mca_btl_ugni_fini ();
 466 
 467     free (mca_btl_ugni_component.modules);
 468     mca_btl_ugni_component.modules = NULL;
 469 
 470     return OPAL_SUCCESS;
 471 }
 472 
 473 static mca_btl_base_module_t **
 474 mca_btl_ugni_component_init (int *num_btl_modules,
 475                              bool enable_progress_threads,
 476                              bool enable_mpi_threads)
 477 {
 478     struct mca_btl_base_module_t **base_modules;
 479     mca_btl_ugni_module_t *ugni_modules;
 480     int rc;
 481 
 482     if (16384 < mca_btl_ugni_component.ugni_smsg_limit) {
 483         mca_btl_ugni_component.ugni_smsg_limit = 16384;
 484     }
 485 
 486     if (65536 < mca_btl_ugni_component.ugni_fma_limit) {
 487         mca_btl_ugni_component.ugni_fma_limit = 65536;
 488     }
 489 
 490     if (-1 != mca_btl_ugni_component.ugni_fma_limit) {
 491         mca_btl_ugni_component.ugni_fma_get_limit = mca_btl_ugni_component.ugni_fma_limit;
 492     } else if (65536 < mca_btl_ugni_component.ugni_fma_get_limit) {
 493         mca_btl_ugni_component.ugni_fma_get_limit = 65536;
 494     }
 495 
 496     if (-1 != mca_btl_ugni_component.ugni_fma_limit) {
 497         mca_btl_ugni_component.ugni_fma_put_limit = mca_btl_ugni_component.ugni_fma_limit;
 498     } else if (65536 < mca_btl_ugni_component.ugni_fma_put_limit) {
 499         mca_btl_ugni_component.ugni_fma_put_limit = 65536;
 500     }
 501 
 502     mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_put_limit;
 503 
 504     /* limit the number of outstanding RDMA operations over all devices */
 505     mca_btl_ugni_component.active_rdma_threshold = mca_btl_ugni_component.local_rdma_cq_size;
 506 
 507     if (enable_mpi_threads && mca_btl_ugni_component.progress_thread_requested) {
 508         mca_btl_ugni_component.progress_thread_enabled = 1;
 509     }
 510 
 511     /* Initialize ugni library and create communication domain */
 512     rc = mca_btl_ugni_init();
 513     if (OPAL_SUCCESS != rc) {
 514         return NULL;
 515     }
 516 
 517     /* For now only create a single BTL module */
 518     mca_btl_ugni_component.ugni_num_btls = 1;
 519 
 520     BTL_VERBOSE(("btl/ugni initializing"));
 521 
 522     ugni_modules = mca_btl_ugni_component.modules = (mca_btl_ugni_module_t *)
 523         calloc (mca_btl_ugni_component.ugni_num_btls, sizeof (mca_btl_ugni_module_t));
 524 
 525     if (OPAL_UNLIKELY(NULL == mca_btl_ugni_component.modules)) {
 526         BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
 527         return NULL;
 528     }
 529 
 530     base_modules = (struct mca_btl_base_module_t **)
 531         calloc (mca_btl_ugni_component.ugni_num_btls,
 532                 sizeof (struct mca_btl_base_module_t *));
 533     if (OPAL_UNLIKELY(NULL == base_modules)) {
 534         BTL_ERROR(("Malloc failed : %s:%d", __FILE__, __LINE__));
 535         return NULL;
 536     }
 537 
 538     if (mca_btl_ugni_component.smsg_page_size != (unsigned long) opal_getpagesize ()) {
 539         if (mca_btl_ugni_ugni_page_size > mca_btl_ugni_component.smsg_page_size) {
 540             mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size;
 541         }
 542     }
 543 
 544     mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = mca_btl_ugni_module.super.btl_eager_limit;
 545 
 546     rc = mca_btl_ugni_module_init (ugni_modules);
 547     if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
 548         BTL_ERROR(("Failed to initialize uGNI module @ %s:%d", __FILE__,
 549                    __LINE__));
 550         return NULL;
 551     }
 552 
 553     *base_modules = (mca_btl_base_module_t *) ugni_modules;
 554 
 555     *num_btl_modules = mca_btl_ugni_component.ugni_num_btls;
 556 
 557     BTL_VERBOSE(("btl/ugni done initializing %d module(s)", *num_btl_modules));
 558 
 559     return base_modules;
 560 }
 561 
 562 int mca_btl_ugni_progress_datagram (mca_btl_ugni_device_t *device)
 563 {
 564     mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
 565     mca_btl_base_endpoint_t *ep = NULL;
 566     gni_ep_handle_t handle;
 567     int count = 0, rc;
 568 
 569     rc = mca_btl_ugni_get_datagram (ugni_module, device, &handle, &ep);
 570     if (1 != rc) {
 571         return rc;
 572     }
 573 
 574     BTL_VERBOSE(("remote datagram completion on handle %p", (void*)handle));
 575 
 576     /* if this is a wildcard endpoint lookup the remote peer by the proc id we received */
 577     if (handle == ugni_module->wildcard_ep) {
 578         struct opal_proc_t *remote_proc = opal_proc_for_name (ugni_module->wc_remote_attr.proc_name);
 579 
 580         BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc: %s",
 581                      OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name)));
 582 
 583         ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc);
 584         if (OPAL_UNLIKELY(NULL == ep)) {
 585             /* there is no way to recover from this error so just abort() */
 586             BTL_ERROR(("could not find/allocate a btl endpoint for peer %s",
 587                        OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name)));
 588             abort ();
 589             return OPAL_ERR_NOT_FOUND;
 590         }
 591     }
 592 
 593     /* should not have gotten a NULL endpoint */
 594     assert (NULL != ep);
 595 
 596     BTL_VERBOSE(("got a datagram completion: ep = %p. wc = %d", (void *) ep, handle == ugni_module->wildcard_ep));
 597 
 598     /* NTH: TODO -- error handling */
 599     opal_mutex_lock (&ep->lock);
 600     if (handle != ugni_module->wildcard_ep) {
 601         /* directed post complete */
 602         BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));
 603 
 604         ep->dg_posted = false;
 605         (void) opal_atomic_add_fetch_32 (&ugni_module->active_datagrams, -1);
 606     }
 607 
 608     (void) mca_btl_ugni_ep_connect_progress (ep);
 609     opal_mutex_unlock (&ep->lock);
 610 
 611     if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
 612         /*  process messages waiting in the endpoint's smsg mailbox */
 613         count = mca_btl_ugni_smsg_process (ep);
 614     }
 615 
 616     /* repost the wildcard datagram */
 617     if (handle == ugni_module->wildcard_ep) {
 618         mca_btl_ugni_wildcard_ep_post (ugni_module);
 619     }
 620 
 621     return count;
 622 }
 623 
 624 void mca_btl_ugni_handle_rdma_completions (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
 625                                            struct mca_btl_ugni_post_descriptor_t *post_desc, const int count)
 626 {
 627     int bte_complete = 0;
 628 
 629     for (int i = 0 ; i < count ; ++i) {
 630         BTL_VERBOSE(("post descriptor complete. status: %d", post_desc[i].rc));
 631 
 632         if (OPAL_UNLIKELY(OPAL_SUCCESS != post_desc[i].rc)) {
 633             /* dump the post descriptor if in a debug build */
 634             btl_ugni_dump_post_desc (post_desc + i);
 635         }
 636 
 637         bte_complete += post_desc[i].use_bte == true;
 638 
 639         mca_btl_ugni_post_desc_complete (ugni_module, post_desc + i, post_desc[i].rc);
 640     }
 641 
 642     if (bte_complete > 0)  {
 643         (void) OPAL_THREAD_FETCH_ADD32 (&ugni_module->active_rdma_count, -bte_complete);
 644     }
 645 }
 646 
 647 static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
 648                                               mca_btl_ugni_cq_t *cq)
 649 {
 650     mca_btl_ugni_post_descriptor_t post_desc[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
 651     int rc;
 652 
 653     rc = mca_btl_ugni_cq_get_completed_desc (device, cq, post_desc, MCA_BTL_UGNI_COMPLETIONS_PER_LOOP);
 654     if (0 >= rc) {
 655         return rc;
 656     }
 657 
 658     BTL_VERBOSE(("got %d completed rdma descriptors", rc));
 659 
 660     mca_btl_ugni_handle_rdma_completions (ugni_module, device, post_desc, rc);
 661 
 662     return rc;
 663 }
 664 
 665 static inline int
 666 mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
 667 {
 668     int rc = OPAL_SUCCESS;
 669     mca_btl_base_endpoint_t *endpoint = NULL;
 670     int count;
 671 
 672     if (0 == opal_list_get_size(&ugni_module->ep_wait_list)) {
 673         return 0;
 674     }
 675 
 676     /* check the count before taking the lock to avoid unnecessary locking */
 677     count = opal_list_get_size(&ugni_module->ep_wait_list);
 678     if (0 == count) {
 679         return 0;
 680     }
 681 
 682     OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
 683     count = opal_list_get_size(&ugni_module->ep_wait_list);
 684     do {
 685         endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);
 686         if (endpoint != NULL) {
 687             rc = mca_btl_ugni_progress_send_wait_list (endpoint);
 688 
 689             if (OPAL_SUCCESS != rc) {
 690                 opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
 691             } else {
 692                 endpoint->wait_listed = false;
 693             }
 694         }
 695     } while (endpoint != NULL && --count > 0) ;
 696     OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
 697 
 698     return rc;
 699 }
 700 
 701 static int mca_btl_ugni_component_progress (void)
 702 {
 703     mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
 704     int count = 0;
 705 
 706     count += mca_btl_ugni_progress_remote_smsg (ugni_module);
 707 
 708     if (ugni_module->active_datagrams) {
 709         count += mca_btl_ugni_progress_datagram (ugni_module->devices);
 710     }
 711 
 712     for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
 713         mca_btl_ugni_device_t *device = ugni_module->devices + i;
 714 
 715         if (device->smsg_connections) {
 716             count += mca_btl_ugni_progress_local_smsg (ugni_module, device);
 717             mca_btl_ugni_progress_wait_list (ugni_module);
 718         }
 719 
 720         if (device->dev_rdma_local_cq.active_operations) {
 721             count += mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_cq);
 722         }
 723 
 724         if (mca_btl_ugni_component.progress_thread_enabled && device->dev_rdma_local_irq_cq.active_operations) {
 725             count += mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_irq_cq);
 726         }
 727     }
 728 
 729     return count;
 730 }
 731 
 732 int mca_btl_ugni_flush (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint)
 733 {
 734     mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
 735 
 736     for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
 737         mca_btl_ugni_device_t *device = ugni_module->devices + i;
 738         /* spin on progress until all active operations are complete. it is tempting to
 739          * take an initial count then wait until that many operations have been completed
 740          * but it is impossible to tell if those are the operations the caller is waiting
 741          * on. */
 742         while (device->dev_rdma_local_cq.active_operations) {
 743             (void) mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_cq);
 744         }
 745 
 746         /* mark that the device was recently flushed */
 747         device->flushed = true;
 748     }
 749 
 750     return OPAL_SUCCESS;
 751 }
 752 
 753 void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc)
 754 {
 755 
 756     fprintf (stderr, "desc->gni_desc.post_id          = %" PRIx64 "\n", desc->gni_desc.post_id);
 757     fprintf (stderr, "desc->gni_desc.status           = %" PRIx64 "\n", desc->gni_desc.status);
 758     fprintf (stderr, "desc->gni_desc.cq_mode_complete = %hu\n", desc->gni_desc.cq_mode_complete);
 759     fprintf (stderr, "desc->gni_desc.type             = %d\n", desc->gni_desc.type);
 760     fprintf (stderr, "desc->gni_desc.cq_mode          = %hu\n", desc->gni_desc.cq_mode);
 761     fprintf (stderr, "desc->gni_desc.dlvr_mode        = %hu\n", desc->gni_desc.dlvr_mode);
 762     fprintf (stderr, "desc->gni_desc.local_addr       = %" PRIx64 "\n", desc->gni_desc.local_addr);
 763     fprintf (stderr, "desc->gni_desc.local_mem_hndl   = {%" PRIx64 ", %" PRIx64 "}\n", desc->gni_desc.local_mem_hndl.qword1,
 764              desc->gni_desc.local_mem_hndl.qword2);
 765     fprintf (stderr, "desc->gni_desc.remote_addr      = %" PRIx64 "\n", desc->gni_desc.remote_addr);
 766     fprintf (stderr, "desc->gni_desc.remote_mem_hndl  = {%" PRIx64 ", %" PRIx64 "}\n", desc->gni_desc.remote_mem_hndl.qword1,
 767              desc->gni_desc.remote_mem_hndl.qword2);
 768     fprintf (stderr, "desc->gni_desc.length           = %" PRIu64 "\n", desc->gni_desc.length);
 769     fprintf (stderr, "desc->gni_desc.rdma_mode        = %hu\n", desc->gni_desc.rdma_mode);
 770     fprintf (stderr, "desc->gni_desc.amo_cmd          = %d\n", desc->gni_desc.amo_cmd);
 771 }

/* [<][>][^][v][top][bottom][index][help] */