root/opal/mca/btl/ofi/btl_ofi_module.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mca_btl_ofi_add_procs
  2. mca_btl_ofi_del_procs
  3. mca_btl_ofi_rcache_init
  4. mca_btl_ofi_register_mem
  5. mca_btl_ofi_deregister_mem
  6. mca_btl_ofi_reg_mem
  7. mca_btl_ofi_dereg_mem
  8. mca_btl_ofi_finalize
  9. mca_btl_ofi_post_recvs
  10. mca_btl_ofi_module_alloc

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2013 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
  14  *                         reserved.
  15  * Copyright (c) 2018      Intel, Inc, All rights reserved
  16  *
  17  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
  18  * $COPYRIGHT$
  19  *
  20  * Additional copyrights may follow
  21  *
  22  * $HEADER$
  23  */
  24 
  25 #include "opal_config.h"
  26 #include <string.h>
  27 #include "opal/class/opal_bitmap.h"
  28 #include "opal/util/printf.h"
  29 #include "opal/mca/btl/btl.h"
  30 #include "opal/datatype/opal_convertor.h"
  31 #include "opal/mca/mpool/base/base.h"
  32 #include "opal/mca/mpool/mpool.h"
  33 
  34 #include "btl_ofi.h"
  35 #include "btl_ofi_endpoint.h"
  36 #include "btl_ofi_frag.h"
  37 
  38 static int mca_btl_ofi_add_procs (mca_btl_base_module_t *btl,
  39                                   size_t nprocs, opal_proc_t **opal_procs,
  40                                   mca_btl_base_endpoint_t **peers,
  41                                   opal_bitmap_t *reachable)
  42 {
  43     int rc;
  44     int count;
  45     char *ep_name = NULL;
  46     size_t namelen = mca_btl_ofi_component.namelen;
  47 
  48     opal_proc_t *proc;
  49     mca_btl_base_endpoint_t *ep;
  50 
  51     mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
  52 
  53     for (size_t i = 0 ; i < nprocs ; ++i) {
  54 
  55         proc = opal_procs[i];
  56 
  57         /* See if we already have an endpoint for this proc. */
  58         rc = opal_hash_table_get_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) proc, (void **) &ep);
  59 
  60         if (OPAL_SUCCESS == rc) {
  61             BTL_VERBOSE(("returning existing endpoint for proc %s", OPAL_NAME_PRINT(proc->proc_name)));
  62             peers[i] = ep;
  63 
  64         } else {
  65             /* We don't have this endpoint yet, create one */
  66             peers[i] = mca_btl_ofi_endpoint_create (proc, ofi_btl->ofi_endpoint);
  67             BTL_VERBOSE(("creating peer %p", (void*) peers[i]));
  68 
  69             if (OPAL_UNLIKELY(NULL == peers[i])) {
  70                 return OPAL_ERR_OUT_OF_RESOURCE;
  71             }
  72 
  73             /* Add this endpoint to the lookup table */
  74             (void) opal_hash_table_set_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) proc, (void**) &ep);
  75         }
  76 
  77         OPAL_MODEX_RECV(rc, &mca_btl_ofi_component.super.btl_version,
  78                         &peers[i]->ep_proc->proc_name, (void **)&ep_name, &namelen);
  79         if (OPAL_SUCCESS != rc) {
  80             BTL_ERROR(("error receiving modex"));
  81             MCA_BTL_OFI_ABORT();
  82         }
  83 
  84         /* get peer fi_addr */
  85         count = fi_av_insert(ofi_btl->av,      /* Address vector to insert */
  86                              ep_name,          /* peer name */
  87                              1,                /* amount to insert */
  88                              &peers[i]->peer_addr, /* return peer address here */
  89                              0,                /* flags */
  90                              NULL);            /* context */
  91 
  92         /* if succeed, add this proc and mark reachable */
  93         if (count == 1) { /* we inserted 1 address. */
  94             opal_list_append (&ofi_btl->endpoints, &peers[i]->super);
  95             opal_bitmap_set_bit(reachable, i);
  96         } else {
  97             BTL_VERBOSE(("fi_av_insert failed with rc = %d", count));
  98             MCA_BTL_OFI_ABORT();
  99         }
 100     }
 101 
 102     return OPAL_SUCCESS;
 103 }
 104 
 105 static int mca_btl_ofi_del_procs (mca_btl_base_module_t *btl, size_t nprocs,
 106                                   opal_proc_t **procs, mca_btl_base_endpoint_t **peers)
 107 {
 108     int rc;
 109     mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
 110     mca_btl_base_endpoint_t *ep;
 111 
 112     for (size_t i = 0 ; i < nprocs ; ++i) {
 113         if (peers[i]) {
 114             rc = opal_hash_table_get_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) procs[i], (void **) &ep);
 115 
 116             if (OPAL_SUCCESS == rc) {
 117                 /* remove the address from AV. */
 118                 rc = fi_av_remove(ofi_btl->av, &peers[i]->peer_addr, 1, 0);
 119                 if (rc < 0) {
 120                     /* remove failed. this should not happen. */
 121                     /* Lets not crash because we failed to remove an address. */
 122                     BTL_ERROR(("fi_av_remove failed with error %d:%s",
 123                                     rc, fi_strerror(-rc)));
 124                 }
 125 
 126                 /* remove and free MPI endpoint from the list. */
 127                 opal_list_remove_item (&ofi_btl->endpoints, &peers[i]->super);
 128                 (void) opal_hash_table_remove_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) procs[i]);
 129                 OBJ_RELEASE(peers[i]);
 130            }
 131         }
 132     }
 133 
 134     return OPAL_SUCCESS;
 135 }
 136 
 137 void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module)
 138 {
 139     if (!module->initialized) {
 140         mca_rcache_base_resources_t rcache_resources;
 141         char *tmp;
 142 
 143         (void) opal_asprintf (&tmp, "ofi.%s", module->linux_device_name);
 144 
 145         rcache_resources.cache_name     = tmp;
 146         rcache_resources.reg_data       = (void *) module;
 147         rcache_resources.sizeof_reg     = sizeof (mca_btl_ofi_reg_t);
 148         rcache_resources.register_mem   = mca_btl_ofi_reg_mem;
 149         rcache_resources.deregister_mem = mca_btl_ofi_dereg_mem;
 150 
 151         module->rcache = mca_rcache_base_module_create ("grdma", module, &rcache_resources);
 152         free (tmp);
 153 
 154         if (NULL == module->rcache) {
 155             /* something when horribly wrong */
 156             BTL_ERROR(("cannot create rcache"));
 157             MCA_BTL_OFI_ABORT();
 158         }
 159 
 160         module->initialized = true;
 161     }
 162 }
 163 
 164 
 165 /**
 166  * @brief Register a memory region for put/get/atomic operations.
 167  *
 168  * @param btl (IN)         BTL module
 169  * @param endpoint(IN)     BTL addressing information (or NULL for all endpoints)
 170  * @param base (IN)        Pointer to start of region
 171  * @param size (IN)        Size of region
 172  * @param flags (IN)       Flags indicating what operation will be performed. Valid
 173  *                         values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET,
 174  *                         and MCA_BTL_DES_FLAGS_ATOMIC
 175  *
 176  * @returns a memory registration handle valid for both local and remote operations
 177  * @returns NULL if the region could not be registered
 178  *
 179  * This function registers the specified region with the hardware for use with
 180  * the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
 181  * functions. Care should be taken to not hold an excessive number of registrations
 182  * as they may use limited system/NIC resources.
 183  */
 184 static struct mca_btl_base_registration_handle_t *
 185 mca_btl_ofi_register_mem (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
 186                           size_t size, uint32_t flags)
 187 {
 188     mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
 189     mca_btl_ofi_reg_t *reg;
 190     int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
 191     int rc;
 192 
 193     rc = ofi_module->rcache->rcache_register (ofi_module->rcache, base, size, 0, access_flags,
 194                                               (mca_rcache_base_registration_t **) &reg);
 195     if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
 196         return NULL;
 197     }
 198 
 199     return &reg->handle;
 200 }
 201 
 202 /**
 203  * @brief Deregister a memory region
 204  *
 205  * @param btl (IN)         BTL module region was registered with
 206  * @param handle (IN)      BTL registration handle to deregister
 207  *
 208  * This function deregisters the memory region associated with the specified handle. Care
 209  * should be taken to not perform any RDMA or atomic operation on this memory region
 210  * after it is deregistered. It is erroneous to specify a memory handle associated with
 211  * a remote node.
 212  */
 213 static int mca_btl_ofi_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
 214 {
 215     mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
 216     mca_btl_ofi_reg_t *reg =
 217         (mca_btl_ofi_reg_t *)((intptr_t) handle - offsetof (mca_btl_ofi_reg_t, handle));
 218 
 219     (void) ofi_module->rcache->rcache_deregister (ofi_module->rcache, &reg->base);
 220 
 221     return OPAL_SUCCESS;
 222 }
 223 
 224 int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg)
 225 {
 226     int rc;
 227     static uint64_t access_flags = FI_REMOTE_WRITE | FI_REMOTE_READ | FI_READ | FI_WRITE;
 228 
 229     mca_btl_ofi_module_t *btl = (mca_btl_ofi_module_t*) reg_data;
 230     mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*) reg;
 231 
 232     rc = fi_mr_reg(btl->domain, base, size, access_flags, 0,
 233                    (uint64_t) reg, 0, &ur->ur_mr, NULL);
 234     if (0 != rc) {
 235         return OPAL_ERR_OUT_OF_RESOURCE;
 236     }
 237 
 238     ur->handle.rkey = fi_mr_key(ur->ur_mr);
 239     ur->handle.desc = fi_mr_desc(ur->ur_mr);
 240 
 241     /* In case the provider doesn't support FI_MR_VIRT_ADDR,
 242      * we need to reference the remote address by the distance from base registered
 243      * address. We keep this information to use in rdma/atomic operations. */
 244     if (btl->use_virt_addr) {
 245         ur->handle.base_addr = 0;
 246     } else {
 247         ur->handle.base_addr = base;
 248     }
 249 
 250     return OPAL_SUCCESS;
 251 }
 252 
 253 int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg)
 254 {
 255     mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*)reg;
 256 
 257     if (ur->ur_mr != NULL) {
 258         if (0 != fi_close(&ur->ur_mr->fid)) {
 259             BTL_ERROR(("%s: error unpinning memory mr=%p: %s",
 260                        __func__, (void*) ur->ur_mr, strerror(errno)));
 261             return OPAL_ERROR;
 262         }
 263     }
 264 
 265     return OPAL_SUCCESS;
 266 }
 267 
 268 /*
 269  * Cleanup/release module resources.
 270  */
 271 
 272 int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
 273 {
 274     int i;
 275     mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
 276     mca_btl_ofi_endpoint_t *endpoint, *next;
 277 
 278     assert(btl);
 279 
 280     /* loop over all the contexts */
 281     for (i=0; i < ofi_btl->num_contexts; i++) {
 282         mca_btl_ofi_context_finalize(&ofi_btl->contexts[i], ofi_btl->is_scalable_ep);
 283     }
 284     free(ofi_btl->contexts);
 285 
 286     if (NULL != ofi_btl->av) {
 287         fi_close(&ofi_btl->av->fid);
 288     }
 289 
 290     if (NULL != ofi_btl->ofi_endpoint) {
 291         fi_close(&ofi_btl->ofi_endpoint->fid);
 292     }
 293 
 294     if (NULL != ofi_btl->domain) {
 295         fi_close(&ofi_btl->domain->fid);
 296     }
 297 
 298     if (NULL != ofi_btl->fabric) {
 299         fi_close(&ofi_btl->fabric->fid);
 300     }
 301 
 302     if (NULL != ofi_btl->fabric_info) {
 303         fi_freeinfo(ofi_btl->fabric_info);
 304     }
 305 
 306     /* clean up any leftover endpoints */
 307     OPAL_LIST_FOREACH_SAFE(endpoint, next, &ofi_btl->endpoints, mca_btl_ofi_endpoint_t) {
 308         opal_list_remove_item (&ofi_btl->endpoints, &endpoint->super);
 309         OBJ_RELEASE(endpoint);
 310     }
 311 
 312     OBJ_DESTRUCT(&ofi_btl->endpoints);
 313     OBJ_DESTRUCT(&ofi_btl->id_to_endpoint);
 314     OBJ_DESTRUCT(&ofi_btl->module_lock);
 315 
 316     if (ofi_btl->rcache) {
 317         mca_rcache_base_module_destroy (ofi_btl->rcache);
 318         ofi_btl->rcache = NULL;
 319     }
 320 
 321     free (btl);
 322 
 323     return OPAL_SUCCESS;
 324 }
 325 
 326 /* Post wildcard recvs on the rx context. */
 327 int mca_btl_ofi_post_recvs (mca_btl_base_module_t *module,
 328                             mca_btl_ofi_context_t *context,
 329                             int count)
 330 {
 331     int i;
 332     int rc;
 333     mca_btl_ofi_base_frag_t *frag;
 334     mca_btl_ofi_frag_completion_t *comp;
 335 
 336     for (i=0; i < count; i++) {
 337         frag = (mca_btl_ofi_base_frag_t*) mca_btl_ofi_alloc(module,
 338                                                      NULL,
 339                                                      0,
 340                                                      MCA_BTL_OFI_FRAG_SIZE,
 341                                                      MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
 342         if (NULL == frag) {
 343             BTL_ERROR(("cannot allocate recv frag."));
 344             return OPAL_ERROR;
 345         }
 346 
 347         comp = mca_btl_ofi_frag_completion_alloc (module,
 348                                                   context,
 349                                                   frag,
 350                                                   MCA_BTL_OFI_TYPE_RECV);
 351 
 352         rc = fi_recv (context->rx_ctx, &frag->hdr, MCA_BTL_OFI_RECV_SIZE,
 353                       NULL, FI_ADDR_UNSPEC, &comp->comp_ctx);
 354 
 355         if (FI_SUCCESS != rc) {
 356             BTL_ERROR(("cannot post recvs"));
 357             return OPAL_ERROR;
 358         }
 359     }
 360     return OPAL_SUCCESS;
 361 }
 362 
 363 /* Allocate and fill out the module capabilities according to operation mode. */
 364 mca_btl_ofi_module_t * mca_btl_ofi_module_alloc (int mode)
 365 {
 366     mca_btl_ofi_module_t *module;
 367 
 368     /* allocate module */
 369     module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t));
 370     if (NULL == module) {
 371         return NULL;
 372     }
 373 
 374     /* fill in the defaults */
 375     *module = mca_btl_ofi_module_template;
 376 
 377     if (mode == MCA_BTL_OFI_MODE_ONE_SIDED || mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
 378 
 379         module->super.btl_put            = mca_btl_ofi_put;
 380         module->super.btl_get            = mca_btl_ofi_get;
 381         module->super.btl_atomic_op      = mca_btl_ofi_aop;
 382         module->super.btl_atomic_fop     = mca_btl_ofi_afop;
 383         module->super.btl_atomic_cswap   = mca_btl_ofi_acswap;
 384         module->super.btl_flush          = mca_btl_ofi_flush;
 385 
 386         module->super.btl_register_mem   = mca_btl_ofi_register_mem;
 387         module->super.btl_deregister_mem = mca_btl_ofi_deregister_mem;
 388 
 389         module->super.btl_flags         |= MCA_BTL_FLAGS_ATOMIC_FOPS |
 390                                            MCA_BTL_FLAGS_ATOMIC_OPS |
 391                                            MCA_BTL_FLAGS_RDMA;
 392 
 393         module->super.btl_atomic_flags   = MCA_BTL_ATOMIC_SUPPORTS_ADD |
 394                                            MCA_BTL_ATOMIC_SUPPORTS_SWAP |
 395                                            MCA_BTL_ATOMIC_SUPPORTS_CSWAP |
 396                                            MCA_BTL_ATOMIC_SUPPORTS_32BIT ;
 397 
 398         module->super.btl_put_limit = 1 << 23;
 399         module->super.btl_put_alignment = 0;
 400 
 401         module->super.btl_get_limit = 1 << 23;
 402         module->super.btl_get_alignment = 0;
 403 
 404         module->super.btl_registration_handle_size =
 405                                 sizeof(mca_btl_base_registration_handle_t);
 406     }
 407 
 408     if (mode == MCA_BTL_OFI_MODE_TWO_SIDED || mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
 409 
 410         module->super.btl_alloc          = mca_btl_ofi_alloc;
 411         module->super.btl_free           = mca_btl_ofi_free;
 412         module->super.btl_prepare_src    = mca_btl_ofi_prepare_src;
 413 
 414         module->super.btl_send           = mca_btl_ofi_send;
 415 
 416         module->super.btl_flags         |= MCA_BTL_FLAGS_SEND;
 417         module->super.btl_eager_limit    = MCA_BTL_OFI_FRAG_SIZE;
 418         module->super.btl_max_send_size  = MCA_BTL_OFI_FRAG_SIZE;
 419         module->super.btl_rndv_eager_limit = MCA_BTL_OFI_FRAG_SIZE;
 420 
 421         /* If two sided is enabled, we expected that the user knows exactly what
 422          * they want. We bump the priority to maximum, making this BTL the default. */
 423         module->super.btl_exclusivity    = MCA_BTL_EXCLUSIVITY_HIGH;
 424     }
 425 
 426     if (mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
 427         module->super.btl_rdma_pipeline_frag_size = 4 * 1024 * 1024;
 428         module->super.btl_rdma_pipeline_send_length = 8 * 1024;
 429     }
 430 
 431     return module;
 432 }
 433 
 434 mca_btl_ofi_module_t mca_btl_ofi_module_template = {
 435     .super = {
 436         .btl_component      = &mca_btl_ofi_component.super,
 437         .btl_add_procs      = mca_btl_ofi_add_procs,
 438         .btl_del_procs      = mca_btl_ofi_del_procs,
 439         .btl_finalize       = mca_btl_ofi_finalize,
 440    }
 441 };

/* [<][>][^][v][top][bottom][index][help] */