root/opal/mca/btl/ofi/btl_ofi_context.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. init_context_freelists
  2. mca_btl_ofi_context_alloc_normal
  3. mca_btl_ofi_context_alloc_scalable
  4. mca_btl_ofi_context_finalize
  5. get_ofi_context
  6. get_ofi_context_rr
  7. mca_btl_ofi_context_progress

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * $COPYRIGHT$
   4  * Copyright (c) 2018      Intel Inc. All rights reserved
   5  * $COPYRIGHT$
   6  *
   7  * Additional copyrights may follow
   8  *
   9  * $HEADER$
  10  */
  11 
  12 #include "btl_ofi.h"
  13 #include "btl_ofi_frag.h"
  14 #include "btl_ofi_rdma.h"
  15 
/* Per-thread cached communication context. get_ofi_context() assigns it the
 * first time a thread asks for a context and returns the cached pointer on
 * later calls. Only defined when thread-local storage is available. */
#if OPAL_HAVE_THREAD_LOCAL
opal_thread_local mca_btl_ofi_context_t *my_context = NULL;
#endif /* OPAL_HAVE_THREAD_LOCAL */
  19 
  20 int init_context_freelists(mca_btl_ofi_context_t *context)
  21 {
  22     int rc;
  23     OBJ_CONSTRUCT(&context->rdma_comp_list, opal_free_list_t);
  24     rc = opal_free_list_init(&context->rdma_comp_list,
  25                              sizeof(mca_btl_ofi_rdma_completion_t),
  26                              opal_cache_line_size,
  27                              OBJ_CLASS(mca_btl_ofi_rdma_completion_t),
  28                              0,
  29                              0,
  30                              512,
  31                              -1,
  32                              512,
  33                              NULL,
  34                              0,
  35                              NULL,
  36                              NULL,
  37                              NULL);
  38     if (rc != OPAL_SUCCESS) {
  39         BTL_VERBOSE(("cannot allocate completion freelist"));
  40         return rc;
  41     }
  42 
  43     if (TWO_SIDED_ENABLED) {
  44         OBJ_CONSTRUCT(&context->frag_comp_list, opal_free_list_t);
  45         rc = opal_free_list_init(&context->frag_comp_list,
  46                                  sizeof(mca_btl_ofi_frag_completion_t),
  47                                  opal_cache_line_size,
  48                                  OBJ_CLASS(mca_btl_ofi_frag_completion_t),
  49                                  0,
  50                                  0,
  51                                  512,
  52                                  -1,
  53                                  512,
  54                                  NULL,
  55                                  0,
  56                                  NULL,
  57                                  NULL,
  58                                  NULL);
  59         if (rc != OPAL_SUCCESS) {
  60             BTL_VERBOSE(("cannot allocate completion freelist"));
  61             return rc;
  62         }
  63 
  64         /* Initialize frag pool */
  65         OBJ_CONSTRUCT(&context->frag_list, opal_free_list_t);
  66         rc = opal_free_list_init(&context->frag_list,
  67                                  sizeof(mca_btl_ofi_base_frag_t) +
  68                                     MCA_BTL_OFI_FRAG_SIZE,
  69                                  opal_cache_line_size,
  70                                  OBJ_CLASS(mca_btl_ofi_base_frag_t),
  71                                  0,
  72                                  0,
  73                                  1024,
  74                                  -1,
  75                                  1024,
  76                                  NULL,
  77                                  0,
  78                                  NULL,
  79                                  NULL,
  80                                  NULL);
  81         if (OPAL_SUCCESS != rc) {
  82             BTL_VERBOSE(("failed to init frag pool (free_list)"));
  83         }
  84     }
  85 
  86     return rc;
  87 }
  88 
  89 /* mca_btl_ofi_context_alloc_normal()
  90  *
  91  * This function will allocate an ofi_context, map the endpoint to tx/rx context,
  92  * bind CQ,AV to the endpoint and initialize all the structure.
  93  * USE WITH NORMAL ENDPOINT ONLY */
  94 mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info,
  95                                                         struct fid_domain *domain,
  96                                                         struct fid_ep *ep,
  97                                                         struct fid_av *av)
  98 {
  99     int rc;
 100     uint32_t cq_flags = FI_TRANSMIT | FI_SEND | FI_RECV;
 101     char *linux_device_name = info->domain_attr->name;
 102 
 103     struct fi_cq_attr cq_attr = {0};
 104 
 105     mca_btl_ofi_context_t *context;
 106 
 107     context = (mca_btl_ofi_context_t*) calloc(1, sizeof(*context));
 108     if (NULL == context) {
 109         BTL_VERBOSE(("cannot allocate context"));
 110         return NULL;
 111     }
 112 
 113     /* Don't really need to check, just avoiding compiler warning because
 114      * BTL_VERBOSE is a no op in performance build and the compiler will
 115      * complain about unused variable. */
 116     if (NULL == linux_device_name) {
 117         BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
 118         goto single_fail;
 119     }
 120 
 121     cq_attr.format = FI_CQ_FORMAT_CONTEXT;
 122     cq_attr.wait_obj = FI_WAIT_NONE;
 123     rc = fi_cq_open(domain, &cq_attr, &context->cq, NULL);
 124     if (0 != rc) {
 125         BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
 126                         linux_device_name,
 127                         fi_strerror(-rc)
 128                         ));
 129         goto single_fail;
 130     }
 131 
 132     rc = fi_ep_bind(ep, (fid_t)av, 0);
 133     if (0 != rc) {
 134         BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
 135                         linux_device_name,
 136                         fi_strerror(-rc)
 137                         ));
 138         goto single_fail;
 139     }
 140 
 141     rc = fi_ep_bind(ep, (fid_t)context->cq, cq_flags);
 142     if (0 != rc) {
 143         BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s",
 144                         linux_device_name,
 145                         fi_strerror(-rc)
 146                         ));
 147         goto single_fail;
 148     }
 149 
 150     rc = init_context_freelists(context);
 151     if (rc != OPAL_SUCCESS) {
 152         goto single_fail;
 153     }
 154 
 155     context->tx_ctx = ep;
 156     context->rx_ctx = ep;
 157     context->context_id = 0;
 158 
 159     return context;
 160 
 161 single_fail:
 162     mca_btl_ofi_context_finalize(context, false);
 163     return NULL;
 164 }
 165 
 166 /* mca_btl_ofi_context_alloc_scalable()
 167  *
 168  * This function allocate communication contexts and return the pointer
 169  * to the first btl context. It also take care of all the bindings needed.
 170  * USE WITH SCALABLE ENDPOINT ONLY */
 171 mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
 172                                                           struct fid_domain *domain,
 173                                                           struct fid_ep *sep,
 174                                                           struct fid_av *av,
 175                                                           size_t num_contexts)
 176 {
 177     BTL_VERBOSE(("creating %zu contexts", num_contexts));
 178 
 179     int rc;
 180     size_t i;
 181     char *linux_device_name = info->domain_attr->name;
 182 
 183     struct fi_cq_attr cq_attr = {0};
 184     struct fi_tx_attr tx_attr = {0};
 185     struct fi_rx_attr rx_attr = {0};
 186 
 187     mca_btl_ofi_context_t *contexts;
 188     tx_attr.op_flags = FI_DELIVERY_COMPLETE;
 189 
 190     contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts));
 191     if (NULL == contexts) {
 192         BTL_VERBOSE(("cannot allocate communication contexts."));
 193         return NULL;
 194     }
 195 
 196     /* Don't really need to check, just avoiding compiler warning because
 197      * BTL_VERBOSE is a no op in performance build and the compiler will
 198      * complain about unused variable. */
 199     if (NULL == linux_device_name) {
 200         BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
 201         goto scalable_fail;
 202     }
 203 
 204      /* bind AV to endpoint */
 205     rc = fi_scalable_ep_bind(sep, (fid_t)av, 0);
 206     if (0 != rc) {
 207         BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s",
 208                         linux_device_name,
 209                         fi_strerror(-rc)
 210                         ));
 211         goto scalable_fail;
 212     }
 213 
 214     for (i=0; i < num_contexts; i++) {
 215         rc = fi_tx_context(sep, i, &tx_attr, &contexts[i].tx_ctx, NULL);
 216         if (0 != rc) {
 217             BTL_VERBOSE(("%s failed fi_tx_context with err=%s",
 218                             linux_device_name,
 219                             fi_strerror(-rc)
 220                             ));
 221             goto scalable_fail;
 222         }
 223 
 224         /* We don't actually need a receiving context as we only do one-sided.
 225          * However, sockets provider will hang if we dont have one. It is
 226          * also nice to have equal number of tx/rx context. */
 227         rc = fi_rx_context(sep, i, &rx_attr, &contexts[i].rx_ctx, NULL);
 228         if (0 != rc) {
 229             BTL_VERBOSE(("%s failed fi_rx_context with err=%s",
 230                             linux_device_name,
 231                             fi_strerror(-rc)
 232                             ));
 233             goto scalable_fail;
 234         }
 235 
 236         /* create CQ */
 237         cq_attr.format = FI_CQ_FORMAT_CONTEXT;
 238         cq_attr.wait_obj = FI_WAIT_NONE;
 239         rc = fi_cq_open(domain, &cq_attr, &contexts[i].cq, NULL);
 240         if (0 != rc) {
 241             BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
 242                             linux_device_name,
 243                             fi_strerror(-rc)
 244                             ));
 245             goto scalable_fail;
 246         }
 247 
 248         /* bind cq to transmit context */
 249         rc = fi_ep_bind(contexts[i].tx_ctx, (fid_t)contexts[i].cq, FI_TRANSMIT);
 250         if (0 != rc) {
 251             BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
 252                             linux_device_name,
 253                             fi_strerror(-rc)
 254                             ));
 255             goto scalable_fail;
 256         }
 257 
 258         /* bind cq to receiving  context */
 259         if (TWO_SIDED_ENABLED) {
 260             rc = fi_ep_bind(contexts[i].rx_ctx, (fid_t)contexts[i].cq, FI_RECV);
 261             if (0 != rc) {
 262                 BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
 263                                 linux_device_name,
 264                                 fi_strerror(-rc)
 265                                 ));
 266                 goto scalable_fail;
 267             }
 268         }
 269 
 270         /* enable the context. */
 271         rc = fi_enable(contexts[i].tx_ctx);
 272         if (0 != rc) {
 273             BTL_VERBOSE(("%s failed fi_enable with err=%s",
 274                             linux_device_name,
 275                             fi_strerror(-rc)
 276                             ));
 277             goto scalable_fail;
 278         }
 279 
 280         rc = fi_enable(contexts[i].rx_ctx);
 281         if (0 != rc) {
 282             BTL_VERBOSE(("%s failed fi_enable with err=%s",
 283                             linux_device_name,
 284                             fi_strerror(-rc)
 285                             ));
 286             goto scalable_fail;
 287         }
 288 
 289         /* initialize freelists. */
 290         rc = init_context_freelists(&contexts[i]);
 291         if (rc != OPAL_SUCCESS) {
 292             goto scalable_fail;
 293         }
 294 
 295         /* assign the id */
 296         contexts[i].context_id = i;
 297     }
 298 
 299     return contexts;
 300 
 301 scalable_fail:
 302     /* close and free */
 303     for(i=0; i < num_contexts; i++) {
 304         mca_btl_ofi_context_finalize(&contexts[i], true);
 305     }
 306     free(contexts);
 307 
 308     return NULL;
 309 }
 310 
 311 void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep) {
 312 
 313     /* if it is a scalable ep, we have to close all contexts. */
 314     if (scalable_ep) {
 315         if (NULL != context->tx_ctx) {
 316             fi_close(&context->tx_ctx->fid);
 317         }
 318 
 319         if (NULL != context->rx_ctx) {
 320             fi_close(&context->rx_ctx->fid);
 321         }
 322     }
 323 
 324     if( NULL != context->cq) {
 325         fi_close(&context->cq->fid);
 326     }
 327 
 328     /* Can we destruct the object that hasn't been constructed? */
 329     OBJ_DESTRUCT(&context->rdma_comp_list);
 330 
 331     if (TWO_SIDED_ENABLED) {
 332         OBJ_DESTRUCT(&context->frag_comp_list);
 333         OBJ_DESTRUCT(&context->frag_list);
 334     }
 335 }
 336 
 337 /* Get a context to use for communication.
 338  * If TLS is supported, it will use the cached endpoint.
 339  * If not, it will invoke the normal round-robin assignment. */
 340 mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl)
 341 {
 342 #if OPAL_HAVE_THREAD_LOCAL
 343     /* With TLS, we cache the context we use. */
 344     static volatile int64_t cur_num = 0;
 345 
 346     if (OPAL_UNLIKELY(my_context == NULL)) {
 347         OPAL_THREAD_LOCK(&btl->module_lock);
 348 
 349         my_context = &btl->contexts[cur_num];
 350         cur_num = (cur_num + 1) %btl->num_contexts;
 351 
 352         OPAL_THREAD_UNLOCK(&btl->module_lock);
 353     }
 354 
 355     assert (my_context);
 356     return my_context;
 357 #else
 358     return get_ofi_context_rr(btl);
 359 #endif
 360 }
 361 
 362 /* return the context in a round-robin. */
 363 /* There is no need for atomics here as it might hurt the performance. */
 364 mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl)
 365 {
 366     static volatile uint64_t rr_num = 0;
 367     return &btl->contexts[rr_num++%btl->num_contexts];
 368 }
 369 
 370 int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) {
 371 
 372     int ret = 0;
 373     int events_read;
 374     int events = 0;
 375     struct fi_cq_entry cq_entry[MCA_BTL_OFI_DEFAULT_MAX_CQE];
 376     struct fi_cq_err_entry cqerr = {0};
 377 
 378     mca_btl_ofi_completion_context_t *c_ctx;
 379     mca_btl_ofi_base_completion_t *comp;
 380     mca_btl_ofi_rdma_completion_t *rdma_comp;
 381     mca_btl_ofi_frag_completion_t *frag_comp;
 382 
 383     ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read);
 384 
 385     if (0 < ret) {
 386         events_read = ret;
 387         for (int i = 0; i < events_read; i++) {
 388             if (NULL != cq_entry[i].op_context) {
 389                 ++events;
 390 
 391                 c_ctx = (mca_btl_ofi_completion_context_t*) cq_entry[i].op_context;
 392 
 393                 /* We are casting to every type  here just for simplicity. */
 394                 comp = (mca_btl_ofi_base_completion_t*) c_ctx->comp;
 395                 frag_comp = (mca_btl_ofi_frag_completion_t*) c_ctx->comp;
 396                 rdma_comp = (mca_btl_ofi_rdma_completion_t*) c_ctx->comp;
 397 
 398                 switch (comp->type) {
 399                 case MCA_BTL_OFI_TYPE_GET:
 400                 case MCA_BTL_OFI_TYPE_PUT:
 401                 case MCA_BTL_OFI_TYPE_AOP:
 402                 case MCA_BTL_OFI_TYPE_AFOP:
 403                 case MCA_BTL_OFI_TYPE_CSWAP:
 404                     /* call the callback */
 405                     if (rdma_comp->cbfunc) {
 406                         rdma_comp->cbfunc (comp->btl, comp->endpoint,
 407                                            rdma_comp->local_address, rdma_comp->local_handle,
 408                                            rdma_comp->cbcontext, rdma_comp->cbdata, OPAL_SUCCESS);
 409                     }
 410 
 411                     MCA_BTL_OFI_NUM_RDMA_DEC((mca_btl_ofi_module_t*) comp->btl);
 412                     break;
 413 
 414                 case MCA_BTL_OFI_TYPE_RECV:
 415                     mca_btl_ofi_recv_frag((mca_btl_ofi_module_t*)  comp->btl,
 416                                           (mca_btl_ofi_endpoint_t*) comp->endpoint,
 417                                           context, frag_comp->frag);
 418                     break;
 419 
 420                 case MCA_BTL_OFI_TYPE_SEND:
 421                     MCA_BTL_OFI_NUM_SEND_DEC((mca_btl_ofi_module_t*) comp->btl);
 422                     mca_btl_ofi_frag_complete(frag_comp->frag, OPAL_SUCCESS);
 423                     break;
 424 
 425                 default:
 426                     /* catasthrophic */
 427                     BTL_ERROR(("unknown completion type"));
 428                     MCA_BTL_OFI_ABORT();
 429                 }
 430 
 431                 /* return the completion handler */
 432                 opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
 433             }
 434         }
 435     } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
 436         ret = fi_cq_readerr(context->cq, &cqerr, 0);
 437 
 438         /* cq readerr failed!? */
 439         if (0 > ret) {
 440             BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)",
 441                        __FILE__, __LINE__, fi_strerror(-ret), ret));
 442         } else {
 443             BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n",
 444                        cqerr.prov_errno));
 445         }
 446         MCA_BTL_OFI_ABORT();
 447     }
 448 #ifdef FI_EINTR
 449     /* sometimes, sockets provider complain about interupt. We do nothing. */
 450     else if (OPAL_UNLIKELY(ret == -FI_EINTR)) {
 451 
 452     }
 453 #endif
 454     /* If the error is not FI_EAGAIN, report the error and abort. */
 455     else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) {
 456         BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret)));
 457         MCA_BTL_OFI_ABORT();
 458     }
 459 
 460     return events;
 461 }
 462 
 463 

/* [<][>][^][v][top][bottom][index][help] */