root/opal/mca/btl/ugni/btl_ugni_module.c

DEFINITIONS

This source file includes the following definitions:
  1. mca_btl_ugni_datagram_event
  2. mca_btl_ugni_module_init
  3. mca_btl_ugni_module_finalize
  4. mca_btl_ugni_alloc
  5. mca_btl_ugni_free
  6. mca_btl_ugni_prepare_src
  7. mca_btl_ugni_register_mem
  8. mca_btl_ugni_deregister_mem
  9. mca_btl_ugni_event_fatal_error
  10. mca_btl_ugni_device_handle_event_error

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2011      UT-Battelle, LLC. All rights reserved.
 * Copyright (c) 2014-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2017      Intel, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "opal_config.h"

#include "btl_ugni.h"
#include "btl_ugni_frag.h"
#include "btl_ugni_endpoint.h"
#include "btl_ugni_prepare.h"
#include "btl_ugni_smsg.h"

static int
mca_btl_ugni_free (struct mca_btl_base_module_t *btl,
                   mca_btl_base_descriptor_t *des);

static int
mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl);

static struct mca_btl_base_descriptor_t *
mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl,
                          struct mca_btl_base_endpoint_t *endpoint,
                          struct opal_convertor_t *convertor,
                          uint8_t order, size_t reserve, size_t *size,
                          uint32_t flags);

static mca_btl_base_registration_handle_t *
mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *base,
                           size_t size, uint32_t flags);

static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle);

mca_btl_ugni_module_t mca_btl_ugni_module = {
    .super = {
        .btl_component      = &mca_btl_ugni_component.super,
        .btl_add_procs      = mca_btl_ugni_add_procs,
        .btl_del_procs      = mca_btl_ugni_del_procs,
        .btl_finalize       = mca_btl_ugni_module_finalize,
        .btl_alloc          = mca_btl_ugni_alloc,
        .btl_free           = mca_btl_ugni_free,
        .btl_prepare_src    = mca_btl_ugni_prepare_src,
        .btl_send           = mca_btl_ugni_send,
        .btl_sendi          = mca_btl_ugni_sendi,
        .btl_put            = mca_btl_ugni_put,
        .btl_get            = mca_btl_ugni_get,
        .btl_register_mem   = mca_btl_ugni_register_mem,
        .btl_deregister_mem = mca_btl_ugni_deregister_mem,
        .btl_atomic_op      = mca_btl_ugni_aop,
        .btl_atomic_fop     = mca_btl_ugni_afop,
        .btl_atomic_cswap   = mca_btl_ugni_acswap,
        .btl_flush          = mca_btl_ugni_flush,
    }
};

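/* libevent-style timer callback (the fd and event-type arguments are unused
 * for timers): progress any pending connection datagrams on the first device,
 * then re-arm the timer so connection setup keeps making progress even when
 * the application is not calling into the BTL */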
static void mca_btl_ugni_datagram_event (int foo, short bar, void *arg)
{
    mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) arg;
    mca_btl_ugni_device_t *device = ugni_module->devices;
    struct timeval tv = {.tv_sec = 0, .tv_usec = MCA_BTL_UGNI_CONNECT_USEC};

    mca_btl_ugni_progress_datagram (device);

    opal_event_evtimer_add (&ugni_module->connection_event, &tv);
}

int
mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module)
{
    int rc;

    BTL_VERBOSE(("binding module %p to device 0", (void *) ugni_module));

    /* copy module defaults (and function pointers) */
    memmove (ugni_module, &mca_btl_ugni_module, sizeof (mca_btl_ugni_module));

    ugni_module->initialized = false;
    ugni_module->nlocal_procs = 0;
    ugni_module->active_datagrams = 0;
    ugni_module->active_rdma_count = 0;

    opal_event_evtimer_set (opal_sync_event_base, &ugni_module->connection_event,
                            mca_btl_ugni_datagram_event, ugni_module);

    OBJ_CONSTRUCT(&ugni_module->failed_frags, opal_list_t);
    OBJ_CONSTRUCT(&ugni_module->failed_frags_lock, opal_mutex_t);

    OBJ_CONSTRUCT(&ugni_module->eager_get_pending, opal_list_t);
    OBJ_CONSTRUCT(&ugni_module->eager_get_pending_lock, opal_mutex_t);

    for (int i = 0 ; i < MCA_BTL_UGNI_LIST_MAX ; ++i) {
        OBJ_CONSTRUCT(ugni_module->frags_lists + i, opal_free_list_t);
    }

    OBJ_CONSTRUCT(&ugni_module->pending_smsg_frags_bb, opal_pointer_array_t);
    OBJ_CONSTRUCT(&ugni_module->ep_wait_list_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ugni_module->ep_wait_list, opal_list_t);
    OBJ_CONSTRUCT(&ugni_module->endpoint_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ugni_module->endpoints, opal_pointer_array_t);
    OBJ_CONSTRUCT(&ugni_module->id_to_endpoint, opal_hash_table_t);
    OBJ_CONSTRUCT(&ugni_module->smsg_mboxes, opal_free_list_t);
    /* set up virtual device handles */
    for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
        rc = mca_btl_ugni_device_init (ugni_module->devices + i, i);
        if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
            BTL_VERBOSE(("error initializing uGNI device handle"));
            return rc;
        }
    }

    /* create wildcard endpoint on first device to listen for connections.
     * there is no need to bind this endpoint. we are single-threaded
     * here so there is no need for a device lock. */
    rc = GNI_EpCreate (ugni_module->devices[0].dev_handle, NULL,
                       &ugni_module->wildcard_ep);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
        BTL_ERROR(("error creating wildcard ugni endpoint"));
        return mca_btl_rc_ugni_to_opal (rc);
    }

    /* post wildcard datagram */
    rc = mca_btl_ugni_wildcard_ep_post (ugni_module);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        BTL_ERROR(("error posting wildcard datagram"));
        return rc;
    }

    return OPAL_SUCCESS;
}
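
/* Usage sketch (illustrative only, not taken from the component source): the
 * component is expected to run this once per module before exposing it. The
 * `modules` field name below is an assumption.
 *
 *   mca_btl_ugni_module_t *module = mca_btl_ugni_component.modules;
 *   if (OPAL_SUCCESS != mca_btl_ugni_module_init (module)) {
 *       // initialization failed; the module must not be used
 *   }
 */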

static int
mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl)
{
    mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *)btl;
    mca_btl_base_endpoint_t *ep;
    uint64_t key;
    int rc;

    if (ugni_module->initialized) {
        /* close all open connections and release endpoints */
        OPAL_HASH_TABLE_FOREACH(key, uint64, ep, &ugni_module->id_to_endpoint) {
            if (NULL != ep) {
                mca_btl_ugni_release_ep (ep);
            }
        }

        if (mca_btl_ugni_component.progress_thread_enabled) {
            mca_btl_ugni_kill_progress_thread();
        }

        /* destroy all cqs */
        rc = GNI_CqDestroy (ugni_module->smsg_remote_cq);
        if (GNI_RC_SUCCESS != rc) {
            BTL_ERROR(("error tearing down RX SMSG CQ - %s", gni_err_str[rc]));
        }

        if (mca_btl_ugni_component.progress_thread_enabled) {
            rc = GNI_CqDestroy (ugni_module->smsg_remote_irq_cq);
            if (GNI_RC_SUCCESS != rc) {
                BTL_ERROR(("error tearing down remote SMSG CQ - %s", gni_err_str[rc]));
            }
        }

        /* cancel wildcard post */
        rc = GNI_EpPostDataCancelById (ugni_module->wildcard_ep,
                                       MCA_BTL_UGNI_CONNECT_WILDCARD_ID |
                                       OPAL_PROC_MY_NAME.vpid);
        if (GNI_RC_SUCCESS != rc) {
            BTL_VERBOSE(("btl/ugni error cancelling wildcard post"));
        }

        /* tear down wildcard endpoint */
        rc = GNI_EpDestroy (ugni_module->wildcard_ep);
        if (GNI_RC_SUCCESS != rc) {
            BTL_VERBOSE(("btl/ugni error destroying endpoint - %s", gni_err_str[rc]));
        }

        opal_event_del (&ugni_module->connection_event);
    }

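    /* the objects below were constructed unconditionally in
     * mca_btl_ugni_module_init, so they are destructed even when the module
     * never finished initializing */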
    for (int i = 0 ; i < MCA_BTL_UGNI_LIST_MAX ; ++i) {
        OBJ_DESTRUCT(ugni_module->frags_lists + i);
    }

    OBJ_DESTRUCT(&ugni_module->ep_wait_list);
    OBJ_DESTRUCT(&ugni_module->smsg_mboxes);
    OBJ_DESTRUCT(&ugni_module->pending_smsg_frags_bb);
    OBJ_DESTRUCT(&ugni_module->id_to_endpoint);
    OBJ_DESTRUCT(&ugni_module->endpoint_lock);
    OBJ_DESTRUCT(&ugni_module->endpoints);

    OBJ_DESTRUCT(&ugni_module->eager_get_pending);
    OBJ_DESTRUCT(&ugni_module->eager_get_pending_lock);

    if (ugni_module->rcache) {
        mca_rcache_base_module_destroy (ugni_module->rcache);
    }

    for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
        mca_btl_ugni_device_fini (ugni_module->devices + i);
    }

    ugni_module->initialized = false;

    return OPAL_SUCCESS;
}


mca_btl_base_descriptor_t *
mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,
                   struct mca_btl_base_endpoint_t *endpoint,
                   uint8_t order, size_t size, uint32_t flags)
{
    mca_btl_ugni_base_frag_t *frag = NULL;

    /* do not allocate a fragment unless the wait list is relatively small. this
     * reduces the potential for resource exhaustion. note the wait list only exists
     * because we have no way to notify the sender that credits are available. */
    if (OPAL_UNLIKELY(opal_list_get_size (&endpoint->frag_wait_list) > 32)) {
        return NULL;
    }

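    /* pick a fragment class by size: payloads that fit in an SMSG message use
     * an SMSG fragment; anything larger (up to the eager limit) is copied into
     * a registered eager buffer */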
    if (size <= mca_btl_ugni_component.smsg_max_data) {
        frag = mca_btl_ugni_frag_alloc_smsg (endpoint);
    } else if (size <= btl->btl_eager_limit) {
        frag = mca_btl_ugni_frag_alloc_eager_send (endpoint);
    }

    if (OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    BTL_VERBOSE(("btl/ugni_module allocated frag of size: %u, flags: %x. frag = %p",
                 (unsigned int)size, flags, (void *) frag));

    frag->base.des_flags = flags;
    frag->base.order = order;
    frag->base.des_segments = &frag->segments[1];
    frag->base.des_segment_count = 1;

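    /* segment 0 is reserved for header data and left empty for allocated
     * fragments; segment 1 describes the buffered payload and is the only
     * segment exposed to the caller via des_segments above */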
    frag->segments[0].seg_addr.pval = NULL;
    frag->segments[0].seg_len       = 0;
    frag->segments[1].seg_addr.pval = frag->base.super.ptr;
    frag->segments[1].seg_len       = size;

    frag->flags = MCA_BTL_UGNI_FRAG_BUFFERED;
    if (size > mca_btl_ugni_component.smsg_max_data) {
        mca_btl_ugni_reg_t *registration;

        frag->hdr_size = sizeof (frag->hdr.eager);
        frag->flags    |= MCA_BTL_UGNI_FRAG_EAGER | MCA_BTL_UGNI_FRAG_IGNORE;

        registration = (mca_btl_ugni_reg_t *) frag->base.super.registration;

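        /* the eager header carries the memory handle of the fragment's
         * registered buffer; the receive side is expected to pull the payload
         * with an RDMA get (see the eager_get_pending handling elsewhere in
         * this BTL) */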
        frag->hdr.eager.memory_handle = registration->handle;
    } else {
        frag->hdr_size = sizeof (frag->hdr.send);
    }

    return &frag->base;
}

static int
mca_btl_ugni_free (struct mca_btl_base_module_t *btl,
                   mca_btl_base_descriptor_t *des)
{
    return mca_btl_ugni_frag_return ((mca_btl_ugni_base_frag_t *) des);
}

static struct mca_btl_base_descriptor_t *
mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl,
                          mca_btl_base_endpoint_t *endpoint,
                          struct opal_convertor_t *convertor,
                          uint8_t order, size_t reserve, size_t *size,
                          uint32_t flags)
{
    /* do not allocate a fragment unless the wait list is relatively small. this
     * reduces the potential for resource exhaustion. note the wait list only exists
     * because we have no way to notify the sender that credits are available. */
    if (OPAL_UNLIKELY(opal_list_get_size (&endpoint->frag_wait_list) > 32)) {
        return NULL;
    }

    return mca_btl_ugni_prepare_src_send (btl, endpoint, convertor,
                                          order, reserve, size, flags);
}

static mca_btl_base_registration_handle_t *
mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *base,
                           size_t size, uint32_t flags)
{
    mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
    mca_btl_ugni_reg_t *reg;
    int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
    int rc;

    rc = ugni_module->rcache->rcache_register (ugni_module->rcache, base, size, 0, access_flags,
                                               (mca_rcache_base_registration_t **) &reg);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        return NULL;
    }

    return &reg->handle;
}
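
/* Usage sketch (illustrative only; the real callers live in the upper layers):
 * an RDMA target registers its buffer, ships the handle to the peer, and
 * deregisters once all transfers targeting the region have completed, e.g.
 *
 *   mca_btl_base_registration_handle_t *handle =
 *       btl->btl_register_mem (btl, endpoint, buf, len, MCA_BTL_REG_FLAG_REMOTE_READ);
 *   // ... exchange handle; peer issues btl_get/btl_put against it ...
 *   btl->btl_deregister_mem (btl, handle);
 */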

static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
{
    mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
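    /* recover the enclosing registration object from the embedded handle
     * (container-of pattern): handle is a member of mca_btl_ugni_reg_t */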
    mca_btl_ugni_reg_t *reg =
        (mca_btl_ugni_reg_t *)((intptr_t) handle - offsetof (mca_btl_ugni_reg_t, handle));

    (void) ugni_module->rcache->rcache_deregister (ugni_module->rcache, &reg->base);

    return OPAL_SUCCESS;
}

int mca_btl_ugni_event_fatal_error (gni_return_t grc, gni_cq_entry_t event_data)
{
    /* combined error check for get event and get completed. we might miss exactly
     * what happened but it is unrecoverable anyway. fwiw, this error path has
     * never been seen in production. */
    if (GNI_CQ_OVERRUN(event_data)) {
        /* TODO -- need to handle overrun -- how do we do this without an event?
           will the event eventually come back? Ask Cray */
        BTL_ERROR(("CQ overrun detected in RDMA event data. cannot recover"));
    } else {
        BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc]));
    }

    return mca_btl_rc_ugni_to_opal (grc);
}

int mca_btl_ugni_device_handle_event_error (mca_btl_ugni_rdma_desc_t *rdma_desc, gni_cq_entry_t event_data)
{
    mca_btl_ugni_device_t *device = rdma_desc->device;
    uint32_t recoverable = 1;

    (void) GNI_CqErrorRecoverable (event_data, &recoverable);

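    /* ask uGNI whether the error is transient; retry the descriptor until the
     * configured limit is reached or the error is marked unrecoverable */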
    if (OPAL_UNLIKELY(++rdma_desc->tries >= mca_btl_ugni_component.rdma_max_retries || !recoverable)) {
        char char_buffer[1024];
        GNI_CqErrorStr (event_data, char_buffer, sizeof (char_buffer));

        BTL_ERROR(("giving up on descriptor %p, recoverable %d: %s", (void *) rdma_desc, recoverable, char_buffer));

        return OPAL_ERROR;
    }

    return _mca_btl_ugni_repost_rdma_desc_device (device, rdma_desc);
}
