root/opal/mca/btl/usnic/btl_usnic_compat.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. usnic_compat_modex_send
  2. usnic_compat_modex_recv
  3. usnic_compat_rte_hash_name
  4. usnic_compat_proc_name_print
  5. prepare_src_small
  6. pack_chunk_seg_chain_with_reserve
  7. prepare_src_large
  8. opal_btl_usnic_prepare_src
  9. opal_btl_usnic_put

   1 /*
   2  * Copyright (c) 2014-2019 Cisco Systems, Inc.  All rights reserved
   3  * Copyright (c) 2015      Intel, Inc. All rights reserved.
   4  * $COPYRIGHT$
   5  *
   6  * Additional copyrights may follow
   7  *
   8  * $HEADER$
   9  */
  10 
  11 #include "opal_config.h"
  12 #include "opal/mca/btl/btl.h"
  13 
  14 #include "opal/mca/mca.h"
  15 #include "opal_stdint.h"
  16 
  17 #include "btl_usnic_compat.h"
  18 #include "btl_usnic_frag.h"
  19 #include "btl_usnic_endpoint.h"
  20 #include "btl_usnic_connectivity.h"
  21 #include "btl_usnic_send.h"
  22 
  23 #include "opal/util/proc.h"
  24 
  25 void usnic_compat_modex_send(int *rc,
  26                              mca_base_component_t *component,
  27                              opal_btl_usnic_modex_t *modexes,
  28                              size_t size)
  29 {
  30     OPAL_MODEX_SEND(*rc, OPAL_PMIX_REMOTE, component,
  31                     modexes, size);
  32 }
  33 
  34 void usnic_compat_modex_recv(int *rc,
  35                              mca_base_component_t *component,
  36                              opal_proc_t *proc,
  37                              opal_btl_usnic_modex_t **modexes,
  38                              size_t *size)
  39 {
  40     OPAL_MODEX_RECV(*rc, component, &proc->proc_name,
  41                     (uint8_t**) modexes, size);
  42 }
  43 
  44 uint64_t usnic_compat_rte_hash_name(opal_process_name_t *pname)
  45 {
  46     uint64_t name = pname->jobid;
  47     name <<= 32;
  48     name += pname->vpid;
  49     return name;
  50 }
  51 
  52 const char *usnic_compat_proc_name_print(opal_process_name_t *pname)
  53 {
  54     return OPAL_NAME_PRINT(*pname);
  55 }
  56 
  57 /************************************************************************/
  58 
  59 /* Responsible for sending "small" frags (reserve + *size <= max_frag_payload)
  60  * in the same manner as btl_prepare_src.  Must return a smaller amount than
  61  * requested if the given convertor cannot process the entire (*size).
  62  */
  63 static inline opal_btl_usnic_send_frag_t *
  64 prepare_src_small(
  65     struct opal_btl_usnic_module_t* module,
  66     struct mca_btl_base_endpoint_t* endpoint,
  67     struct opal_convertor_t* convertor,
  68     uint8_t order,
  69     size_t reserve,
  70     size_t* size,
  71     uint32_t flags)
  72 {
  73     opal_btl_usnic_send_frag_t *frag;
  74     opal_btl_usnic_small_send_frag_t *sfrag;
  75     size_t payload_len;
  76 
  77     payload_len = *size + reserve;
  78     assert(payload_len <= module->max_frag_payload); /* precondition */
  79 
  80     sfrag = opal_btl_usnic_small_send_frag_alloc(module);
  81     if (OPAL_UNLIKELY(NULL == sfrag)) {
  82         return NULL;
  83     }
  84     frag = &sfrag->ssf_base;
  85 
  86     /* In the case of a convertor, we will copy the data in now, since that is
  87      * the cheapest way to discover how much we can actually send (since we know
  88      * we will pack it anyway later).  The alternative is to do all of the
  89      * following:
  90      * 1) clone_with_position(convertor) and see where the new position ends up
  91      *    actually being (see opal_btl_usnic_convertor_pack_peek).  Otherwise we
  92      *    aren't fulfilling our contract w.r.t. (*size).
  93      * 2) Add a bunch of branches checking for different cases, both here and in
  94      *    progress_sends
  95      * 3) If we choose to defer the packing, we must clone the convertor because
  96      *    the PML owns it and might reuse it for another prepare_src call.
  97      *
  98      * Two convertor clones is likely to be at least as slow as just copying the
  99      * data and might consume a similar amount of memory.  Plus we still have to
 100      * pack it later to send it.
 101      *
 102      * The reason we do not copy non-convertor buffer at this point is because
 103      * we might still use INLINE for the send, and in that case we do not want
 104      * to copy the data at all.
 105      */
 106     if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
 107         /* put user data just after end of 1st seg (upper layer header) */
 108         assert(payload_len <= module->max_frag_payload);
 109         usnic_convertor_pack_simple(
 110                 convertor,
 111                 (IOVBASE_TYPE*)(intptr_t)(frag->sf_base.uf_local_seg[0].seg_addr.lval + reserve),
 112                 *size,
 113                 size);
 114         payload_len = reserve + *size;
 115         frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 1;
 116         /* PML will copy header into beginning of segment */
 117         frag->sf_base.uf_local_seg[0].seg_len = payload_len;
 118     } else {
 119         opal_convertor_get_current_pointer(convertor,
 120                                            &sfrag->ssf_base.sf_base.uf_local_seg[1].seg_addr.pval);
 121         frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 2;
 122         frag->sf_base.uf_local_seg[0].seg_len = reserve;
 123         frag->sf_base.uf_local_seg[1].seg_len = *size;
 124     }
 125 
 126     frag->sf_base.uf_base.des_flags = flags;
 127     frag->sf_endpoint = endpoint;
 128 
 129     return frag;
 130 }
 131 
 132 static void *
 133 pack_chunk_seg_chain_with_reserve(
 134     struct opal_btl_usnic_module_t* module,
 135     opal_btl_usnic_large_send_frag_t *lfrag,
 136     size_t reserve_len,
 137     opal_convertor_t *convertor,
 138     size_t max_convertor_bytes,
 139     size_t *convertor_bytes_packed)
 140 {
 141     opal_btl_usnic_chunk_segment_t *seg;
 142     void *ret_ptr = NULL;
 143     int n_segs;
 144     uint8_t *copyptr;
 145     size_t copylen;
 146     size_t seg_space;
 147     size_t max_data;
 148     bool first_pass;
 149 
 150     assert(NULL != lfrag);
 151     assert(NULL != convertor_bytes_packed);
 152 
 153     n_segs = 0;
 154     *convertor_bytes_packed = 0;
 155 
 156     first_pass = true;
 157     while (*convertor_bytes_packed < max_convertor_bytes ||
 158            first_pass) {
 159         seg = opal_btl_usnic_chunk_segment_alloc(module);
 160         if (OPAL_UNLIKELY(NULL == seg)) {
 161             opal_btl_usnic_util_abort("chunk segment allocation error",
 162                                       __FILE__, __LINE__);
 163         }
 164         ++n_segs;
 165 
 166         seg_space = module->max_chunk_payload;
 167         copyptr = seg->ss_base.us_payload.raw;
 168 
 169         if (first_pass) {
 170             /* logic could accommodate >max, but currently doesn't */
 171             assert(reserve_len <= module->max_chunk_payload);
 172             ret_ptr = copyptr;
 173             seg_space -= reserve_len;
 174             copyptr += reserve_len;
 175         }
 176 
 177         /* now pack any convertor data */
 178         if (*convertor_bytes_packed < max_convertor_bytes && seg_space > 0) {
 179             copylen = max_convertor_bytes - *convertor_bytes_packed;
 180             if (copylen > seg_space) {
 181                 copylen = seg_space;
 182             }
 183             usnic_convertor_pack_simple(convertor, copyptr, copylen, &max_data);
 184             seg_space -= max_data;
 185             *convertor_bytes_packed += max_data;
 186 
 187             /* If unable to pack any of the remaining bytes, release the
 188             * most recently allocated segment and finish processing.
 189             */
 190             if (seg_space == module->max_chunk_payload) {
 191                 assert(max_data == 0); /* only way this can happen */
 192                 opal_btl_usnic_chunk_segment_return(module, seg);
 193                 break;
 194             }
 195         }
 196 
 197         /* bozo checks */
 198         assert(seg_space >= 0);
 199         assert(seg_space < module->max_chunk_payload);
 200 
 201         /* append segment of data to chain to send */
 202         seg->ss_parent_frag = &lfrag->lsf_base;
 203         seg->ss_len = module->max_chunk_payload - seg_space;
 204         opal_list_append(&lfrag->lsf_seg_chain, &seg->ss_base.us_list.super);
 205 
 206 #if MSGDEBUG1
 207         opal_output(0, "%s: appending seg=%p, frag=%p, payload=%zd\n",
 208                     __func__, (void *)seg, (void *)lfrag,
 209                     (module->max_chunk_payload - seg_space));
 210 #endif
 211 
 212         first_pass = false;
 213     }
 214 
 215     return ret_ptr;
 216 }
 217 
 218 /* Responsible for handling "large" frags (reserve + *size > max_frag_payload)
 219  * in the same manner as btl_prepare_src.  Must return a smaller amount than
 220  * requested if the given convertor cannot process the entire (*size).
 221  */
 222 static opal_btl_usnic_send_frag_t *
 223 prepare_src_large(
 224     struct opal_btl_usnic_module_t* module,
 225     struct mca_btl_base_endpoint_t* endpoint,
 226     struct opal_convertor_t* convertor,
 227     uint8_t order,
 228     size_t reserve,
 229     size_t* size,
 230     uint32_t flags)
 231 {
 232     opal_btl_usnic_send_frag_t *frag;
 233     opal_btl_usnic_large_send_frag_t *lfrag;
 234     int rc;
 235 
 236     /* Get holder for the msg */
 237     lfrag = opal_btl_usnic_large_send_frag_alloc(module);
 238     if (OPAL_UNLIKELY(NULL == lfrag)) {
 239         return NULL;
 240     }
 241     frag = &lfrag->lsf_base;
 242 
 243     /* The header location goes in SG[0], payload in SG[1].  If we are using a
 244      * convertor then SG[1].seg_len is accurate but seg_addr is NULL. */
 245     frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 2;
 246 
 247     /* stash header location, PML will write here */
 248     frag->sf_base.uf_local_seg[0].seg_addr.pval = &lfrag->lsf_ompi_header;
 249     frag->sf_base.uf_local_seg[0].seg_len = reserve;
 250     /* make sure upper header small enough */
 251     assert(reserve <= sizeof(lfrag->lsf_ompi_header));
 252 
 253     if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
 254         /* threshold == -1 means always pack eagerly */
 255         if (mca_btl_usnic_component.pack_lazy_threshold >= 0 &&
 256             *size >= (size_t)mca_btl_usnic_component.pack_lazy_threshold) {
 257             MSGDEBUG1_OUT("packing frag %p on the fly", (void *)frag);
 258             lfrag->lsf_pack_on_the_fly = true;
 259 
 260             /* tell the PML we will absorb as much as possible while still
 261              * respecting indivisible element boundaries in the convertor */
 262             *size = opal_btl_usnic_convertor_pack_peek(convertor, *size);
 263 
 264             /* Clone the convertor b/c we (the BTL) don't own it and the PML
 265              * might mutate it after we return from this function. */
 266             rc = opal_convertor_clone(convertor, &frag->sf_convertor,
 267                                       /*copy_stack=*/true);
 268             if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
 269                 opal_btl_usnic_util_abort("unexpected convertor clone error",
 270                                           __FILE__, __LINE__);
 271             }
 272         }
 273         else {
 274             /* pack everything in the convertor into a chain of segments now,
 275              * leaving space for the PML header in the first segment */
 276             lfrag->lsf_base.sf_base.uf_local_seg[0].seg_addr.pval =
 277                 pack_chunk_seg_chain_with_reserve(module, lfrag, reserve,
 278                                                   convertor, *size, size);
 279         }
 280 
 281         /* We set SG[1] to {NULL,bytes_packed} so that various calculations
 282          * by both PML and this BTL will be correct.  For example, the PML adds
 283          * up the bytes in the descriptor segments to determine if an MPI-level
 284          * request is complete or not. */
 285         frag->sf_base.uf_local_seg[1].seg_addr.pval = NULL;
 286         frag->sf_base.uf_local_seg[1].seg_len = *size;
 287     } else {
 288         /* convertor not needed, just save the payload pointer in SG[1] */
 289         lfrag->lsf_pack_on_the_fly = true;
 290         opal_convertor_get_current_pointer(convertor,
 291                                            &frag->sf_base.uf_local_seg[1].seg_addr.pval);
 292         frag->sf_base.uf_local_seg[1].seg_len = *size;
 293     }
 294 
 295     frag->sf_base.uf_base.des_flags = flags;
 296     frag->sf_endpoint = endpoint;
 297 
 298     return frag;
 299 }
 300 
 301 /*----------------------------------------------------------------------*/
 302 
 303 /*
 304  * BTL 3.0 prepare_src function.
 305  *
 306  * This function is only used for sending PML fragments (not putting
 307  * or getting fragments).
 308  *
 309  * Note the "user" data the PML wishes to communicate and return a
 310  * descriptor.  We create a frag (which is also a descriptor by virtue
 311  * of its base class) and populate it with enough source information
 312  * to complete a future send.
 313  *
 314  * Recall that the usnic BTL's max_send_size is almost certainly
 315  * larger than the MTU (by default, max_send_size is either 25K or
 316  * 150K).  Therefore, the PML may give us a fragment up to
 317  * max_send_size in this function.  Hence, we make the decision here
 318  * as to whether it's a "small" fragment (i.e., size <= MTU, meaning
 319  * that it fits in a single datagram) or a "large" fragment (i.e.,
 320  * size > MTU, meaning that it must be chunked into multiple
 321  * datagrams).
 322  *
 323  * The convertor will be saved for deferred packing if the user buffer
 324  * is noncontiguous.  Otherwise, it will be saved in one of the
 325  * descriptor's SGEs.
 326  *
 327  * NOTE that the *only* reason this routine is allowed to return a size smaller
 328  * than was requested is if the convertor cannot process the entire amount.
 329  */
 330 struct mca_btl_base_descriptor_t *
 331 opal_btl_usnic_prepare_src(struct mca_btl_base_module_t *base_module,
 332                            struct mca_btl_base_endpoint_t *endpoint,
 333                            struct opal_convertor_t *convertor,
 334                            uint8_t order,
 335                            size_t reserve,
 336                            size_t *size,
 337                            uint32_t flags)
 338 {
 339     opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module;
 340     opal_btl_usnic_send_frag_t *frag;
 341     uint32_t payload_len;
 342 #if MSGDEBUG2
 343     size_t osize = *size;
 344 #endif
 345 
 346     /* Do we need to check the connectivity?  If enabled, we'll check
 347        the connectivity at either first send to peer X or first ACK to
 348        peer X. */
 349     opal_btl_usnic_check_connectivity(module, endpoint);
 350 
 351     /*
 352      * if total payload len fits in one MTU use small send, else large
 353      */
 354     payload_len = *size + reserve;
 355     if (payload_len <= module->max_frag_payload) {
 356         frag = prepare_src_small(module, endpoint, convertor,
 357                                  order, reserve, size, flags);
 358     } else {
 359         frag = prepare_src_large(module, endpoint, convertor,
 360                                  order, reserve, size, flags);
 361     }
 362 
 363 #if MSGDEBUG2
 364     opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n",
 365                 module->linux_device_name,
 366                 (reserve + *size) <= module->max_frag_payload?"small":"large",
 367                 (void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize,
 368                 (void *)convertor);
 369 #if MSGDEBUG1
 370     {
 371         unsigned i;
 372         mca_btl_base_descriptor_t *desc = &frag->sf_base.uf_base;
 373         for (i=0; i<desc->USNIC_SEND_LOCAL_COUNT; ++i) {
 374             opal_output(0, "  %d: ptr:%p len:%d\n", i,
 375                         (void *)desc->USNIC_SEND_LOCAL[i].seg_addr.pval,
 376                         desc->USNIC_SEND_LOCAL[i].seg_len);
 377         }
 378     }
 379 #endif
 380 #endif
 381 
 382     return &frag->sf_base.uf_base;
 383 }
 384 
 385 /*
 386  * BTL 3.0 version of module.btl_put.
 387  *
 388  * Emulate an RDMA put.  We'll send the remote address across to the
 389  * other side so it will know where to put the data.
 390  *
 391  * Note that this function is only ever called with contiguous
 392  * buffers, so a convertor is not necessary.
 393  */
 394 int
 395 opal_btl_usnic_put(struct mca_btl_base_module_t *base_module,
 396                    struct mca_btl_base_endpoint_t *endpoint,
 397                    void *local_address, uint64_t remote_address,
 398                    struct mca_btl_base_registration_handle_t *local_handle,
 399                    struct mca_btl_base_registration_handle_t *remote_handle,
 400                    size_t size, int flags, int order,
 401                    mca_btl_base_rdma_completion_fn_t cbfunc,
 402                    void *cbcontext, void *cbdata)
 403 {
 404     opal_btl_usnic_send_frag_t *sfrag;
 405     opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module;
 406 
 407     /* At least for the moment, continue to make a descriptor, like we
 408        used to in BTL 2.0 */
 409     if (size <= module->max_frag_payload) {
 410         /* Small send fragment -- the whole thing fits in one MTU
 411            (i.e., a single chunk) */
 412         opal_btl_usnic_small_send_frag_t *ssfrag;
 413         ssfrag = opal_btl_usnic_small_send_frag_alloc(module);
 414         if (OPAL_UNLIKELY(NULL == ssfrag)) {
 415             return OPAL_ERR_OUT_OF_RESOURCE;
 416         }
 417 
 418         sfrag = &ssfrag->ssf_base;
 419     } else {
 420         /* Large send fragment -- need more than one MTU (i.e.,
 421            multiple chunks) */
 422         opal_btl_usnic_large_send_frag_t *lsfrag;
 423         lsfrag = opal_btl_usnic_large_send_frag_alloc(module);
 424         if (OPAL_UNLIKELY(NULL == lsfrag)) {
 425             return OPAL_ERR_OUT_OF_RESOURCE;
 426         }
 427 
 428         lsfrag->lsf_pack_on_the_fly = true;
 429 
 430         sfrag = &lsfrag->lsf_base;
 431     }
 432 
 433     sfrag->sf_endpoint = endpoint;
 434     sfrag->sf_size = size;
 435     sfrag->sf_ack_bytes_left = size;
 436 
 437     opal_btl_usnic_frag_t *frag;
 438     frag = &sfrag->sf_base;
 439     frag->uf_local_seg[0].seg_len = size;
 440     frag->uf_local_seg[0].seg_addr.pval = local_address;
 441     frag->uf_remote_seg[0].seg_len = size;
 442     frag->uf_remote_seg[0].seg_addr.pval =
 443         (void *)(uintptr_t) remote_address;
 444 
 445     mca_btl_base_descriptor_t *desc;
 446     desc = &frag->uf_base;
 447     desc->des_segment_count = 1;
 448     desc->des_segments = &frag->uf_local_seg[0];
 449     /* This is really the wrong cbfunc type, but we'll cast it to
 450        the Right type before we use it.  So it'll be ok. */
 451     desc->des_cbfunc = (mca_btl_base_completion_fn_t) cbfunc;
 452     desc->des_cbdata = cbdata;
 453     desc->des_context = cbcontext;
 454     desc->des_flags = flags;
 455     desc->order = order;
 456 
 457     int rc;
 458     rc = opal_btl_usnic_finish_put_or_send(module,
 459                                            (opal_btl_usnic_endpoint_t *)endpoint,
 460                                            sfrag,
 461                                            /*tag=*/MCA_BTL_NO_ORDER);
 462     return rc;
 463 }

/* [<][>][^][v][top][bottom][index][help] */