root/ompi/mca/pml/ob1/pml_ob1_sendreq.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mca_pml_ob1_send_request_process_pending
  2. mca_pml_ob1_send_request_free
  3. mca_pml_ob1_send_request_cancel
  4. mca_pml_ob1_send_request_construct
  5. mca_pml_ob1_send_request_destruct
  6. mca_pml_ob1_match_completion_free_request
  7. mca_pml_ob1_match_completion_free
  8. mca_pml_ob1_rndv_completion_request
  9. mca_pml_ob1_rndv_completion
  10. mca_pml_ob1_rget_completion
  11. mca_pml_ob1_send_ctl_completion
  12. mca_pml_ob1_frag_completion
  13. mca_pml_ob1_copy_frag_completion
  14. mca_pml_ob1_send_request_start_buffered
  15. mca_pml_ob1_send_request_start_copy
  16. mca_pml_ob1_send_request_start_prepare
  17. mca_pml_ob1_send_request_start_rdma
  18. mca_pml_ob1_send_request_start_rndv
  19. mca_pml_ob1_send_request_copy_in_out
  20. get_send_range_nolock
  21. get_send_range
  22. get_next_send_range
  23. mca_pml_ob1_send_request_schedule_once
  24. mca_pml_ob1_send_request_put_frag_failed
  25. mca_pml_ob1_put_completion
  26. mca_pml_ob1_send_request_put_frag
  27. mca_pml_ob1_send_request_put

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2019 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
  14  * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
  15  * Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
  16  * Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights
  17  *                         reserved.
  18  * Copyright (c) 2015      Cisco Systems, Inc.  All rights reserved.
  19  * Copyright (c) 2016      Research Organization for Information Science
  20  *                         and Technology (RIST). All rights reserved.
  21  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
  22  * $COPYRIGHT$
  23  *
  24  * Additional copyrights may follow
  25  *
  26  * $HEADER$
  27  */
  28 
  29 
  30 #include "ompi_config.h"
  31 #include "opal/prefetch.h"
  32 #include "opal/mca/mpool/mpool.h"
  33 #include "ompi/runtime/ompi_spc.h"
  34 #include "ompi/constants.h"
  35 #include "ompi/mca/pml/pml.h"
  36 #include "pml_ob1.h"
  37 #include "pml_ob1_hdr.h"
  38 #include "pml_ob1_sendreq.h"
  39 #include "pml_ob1_rdmafrag.h"
  40 #include "pml_ob1_recvreq.h"
  41 #include "ompi/mca/bml/base/base.h"
  42 #include "ompi/memchecker.h"
  43 
/* Send ranges are plain free-list items; they need no extra
 * construction or destruction beyond the base class. */
OBJ_CLASS_INSTANCE(mca_pml_ob1_send_range_t, opal_free_list_item_t,
        NULL, NULL);
  46 
/**
 * Retry send requests that were queued on the global pending list because
 * resources were previously exhausted.  At most the number of entries
 * present on entry is examined, so requests re-queued during the walk are
 * not retried again in the same pass.
 *
 * @param bml_btl  BTL that just released resources; START-pending requests
 *                 are only restarted if this BTL is found in their
 *                 endpoint's eager BTL array.
 */
void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
{
    int rc, i, s = opal_list_get_size(&mca_pml_ob1.send_pending);

    /* advance pending requests */
    for(i = 0; i < s; i++) {
        mca_pml_ob1_send_pending_t pending_type = MCA_PML_OB1_SEND_PENDING_NONE;
        mca_pml_ob1_send_request_t* sendreq;
        mca_bml_base_btl_t *send_dst;

        sendreq = get_request_from_send_pending(&pending_type);
        if(OPAL_UNLIKELY(NULL == sendreq))
            break;

        switch(pending_type) {
        case MCA_PML_OB1_SEND_PENDING_SCHEDULE:
            rc = mca_pml_ob1_send_request_schedule_exclusive(sendreq);
            if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
                /* still no resources available; give up for now */
                return;
            }
            break;
        case MCA_PML_OB1_SEND_PENDING_START:
            send_dst = mca_bml_base_btl_array_find(
                    &sendreq->req_endpoint->btl_eager, bml_btl->btl);
            if (NULL == send_dst) {
                /* Put request back onto pending list and try next one. */
                add_request_to_send_pending(sendreq,
                        MCA_PML_OB1_SEND_PENDING_START, true);
            } else {
                MCA_PML_OB1_SEND_REQUEST_RESET(sendreq);
                rc = mca_pml_ob1_send_request_start_btl(sendreq, send_dst);
                if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
                    /* No more resources on this btl so prepend to the pending
                     * list to minimize reordering and give up for now. */
                    add_request_to_send_pending(sendreq,
                            MCA_PML_OB1_SEND_PENDING_START, false);
                    return;
                }
            }
            break;
        default:
            opal_output(0, "[%s:%d] wrong send request type\n",
                    __FILE__, __LINE__);
            break;
        }
    }
}
  94 
/*
 * The free call marks the final stage in a request life-cycle. Starting from
 * this point the request is completed at both PML and user level, and can be
 * used for other p2p communications. Therefore, in the case of the OB1 PML it
 * should be added to the free request list.
 */
static int mca_pml_ob1_send_request_free(struct ompi_request_t** request)
{
    mca_pml_ob1_send_request_t* sendreq = *(mca_pml_ob1_send_request_t**)request;
    /* guard against double free: only the first call does the work */
    if(false == sendreq->req_send.req_base.req_free_called) {

        sendreq->req_send.req_base.req_free_called = true;
        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_NOTIFY,
                             &(sendreq->req_send.req_base), PERUSE_SEND );

        /* Only return the request to the free list if the PML side has
         * also completed; otherwise the return is deferred (presumably
         * performed when PML completion fires -- the completion path is
         * outside this file view). */
        if( true == sendreq->req_send.req_base.req_pml_complete ) {
            /* make buffer defined when the request is completed,
               and before releasing the objects. */
            MEMCHECKER(
                memchecker_call(&opal_memchecker_base_mem_defined,
                                sendreq->req_send.req_base.req_addr,
                                sendreq->req_send.req_base.req_count,
                                sendreq->req_send.req_base.req_datatype);
            );

            MCA_PML_OB1_SEND_REQUEST_RETURN( sendreq );
        }
        *request = MPI_REQUEST_NULL;
    }
    return OMPI_SUCCESS;
}
 126 
static int mca_pml_ob1_send_request_cancel(struct ompi_request_t* request, int complete)
{
    /* Cancelling send requests is currently not supported: the request
     * proceeds normally and this call simply reports success. */
    return OMPI_SUCCESS;
}
 132 
/* One-time initialization run when a send request object is created
 * by the object system (per-use fields are reset at start time). */
static void mca_pml_ob1_send_request_construct(mca_pml_ob1_send_request_t* req)
{
    req->req_send.req_base.req_type = MCA_PML_REQUEST_SEND;
    /* hook the OB1 entry points into the generic OMPI request */
    req->req_send.req_base.req_ompi.req_start = mca_pml_ob1_start;
    req->req_send.req_base.req_ompi.req_free = mca_pml_ob1_send_request_free;
    req->req_send.req_base.req_ompi.req_cancel = mca_pml_ob1_send_request_cancel;
    req->req_rdma_cnt = 0;
    req->req_throttle_sends = false;
    req->rdma_frag = NULL;
    /* list of scheduled send ranges and the lock that protects it */
    OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t);
    OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t);
}
 145 
/* Tear down the constructed members; any outstanding RDMA fragment must
 * already have been returned by the completion path. */
static void mca_pml_ob1_send_request_destruct(mca_pml_ob1_send_request_t* req)
{
    OBJ_DESTRUCT(&req->req_send_ranges);
    OBJ_DESTRUCT(&req->req_send_range_lock);
    assert( NULL == req->rdma_frag );
}
 152 
/* Register the send request class with the OPAL object system; the
 * constructor/destructor above run on instance creation/destruction. */
OBJ_CLASS_INSTANCE( mca_pml_ob1_send_request_t,
                    mca_pml_base_send_request_t,
                    mca_pml_ob1_send_request_construct,
                    mca_pml_ob1_send_request_destruct );
 157 
 158 /**
 159  * Completion of a short message - nothing left to schedule.
 160  */
 161 
/* Finish a short (match-protocol) send: emit the PERUSE transfer event,
 * mark the request complete at the PML level, and let other pending
 * requests make progress on this BTL. */
static inline void
mca_pml_ob1_match_completion_free_request( mca_bml_base_btl_t* bml_btl,
                                           mca_pml_ob1_send_request_t* sendreq )
{
    if( sendreq->req_send.req_bytes_packed > 0 ) {
        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
                                 &(sendreq->req_send.req_base), PERUSE_SEND );
    }

    /* signal request completion */
    send_request_pml_complete(sendreq);

    /* check for pending requests */
    MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
 177 
/* BTL completion callback for a short match-protocol send.  The request
 * and the BML BTL were stashed on the descriptor when it was built. */
static void
mca_pml_ob1_match_completion_free( struct mca_btl_base_module_t* btl,
                                   struct mca_btl_base_endpoint_t* ep,
                                   struct mca_btl_base_descriptor_t* des,
                                   int status )
{
    mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;

    /* check completion status; a failed eager send is currently fatal */
    if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
        /* TSW - FIX */
        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
        ompi_rte_abort(-1, NULL);
    }
    mca_pml_ob1_match_completion_free_request( bml_btl, sendreq );
}
 195 
/* Account for delivered rendezvous bytes and advance the request state
 * machine.  req_state was set to 2 at start (see start_buffered: "wait for
 * ack and completion"); each event decrements it by one. */
static inline void
mca_pml_ob1_rndv_completion_request( mca_bml_base_btl_t* bml_btl,
                                     mca_pml_ob1_send_request_t* sendreq,
                                     size_t req_bytes_delivered )
{
    if( sendreq->req_send.req_bytes_packed > 0 ) {
        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
                                 &(sendreq->req_send.req_base), PERUSE_SEND );
    }

    OPAL_THREAD_ADD_FETCH_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
    SPC_USER_OR_MPI(sendreq->req_send.req_base.req_ompi.req_status.MPI_TAG, (ompi_spc_value_t)req_bytes_delivered,
                    OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI);

    /* advance the request */
    OPAL_THREAD_ADD_FETCH32(&sendreq->req_state, -1);

    send_request_pml_complete_check(sendreq);

    /* check for pending requests */
    MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
 218 
 219 /*
 220  *  Completion of the first fragment of a long message that
 221  *  requires an acknowledgement
 222  */
/* BTL completion callback for the first fragment of a rendezvous send.
 * Computes the user payload actually carried by the fragment (segment
 * lengths minus the rendezvous header) and folds it into the request. */
static void
mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
                             struct mca_btl_base_endpoint_t* ep,
                             struct mca_btl_base_descriptor_t* des,
                             int status )
{
    mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
    size_t req_bytes_delivered;

    /* check completion status; a failed rendezvous send is currently fatal */
    if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
        /* TSW - FIX */
        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
        ompi_rte_abort(-1, NULL);
    }

    /* count bytes of user data actually delivered. As the rndv completion only
     * happens in one thread, the increase of the req_bytes_delivered does not
     * have to be atomic.
     */
    req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
                                                                   des->des_segment_count,
                                                                   sizeof(mca_pml_ob1_rendezvous_hdr_t));

    mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered );
}
 250 
 251 
 252 /**
 253  * Completion of a get request.
 254  */
 255 
/* Completion of (part of) a remote get.  rdma_length < 0 indicates no data
 * was delivered by this event; otherwise the bytes are subtracted from the
 * fragment's remaining count and, once the fragment drains, credited to the
 * request and the fragment is recycled. */
static void
mca_pml_ob1_rget_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length)
{
    mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
    size_t frag_remaining;

    /* count bytes of user data actually delivered and check for request completion */
    if (OPAL_LIKELY(0 < rdma_length)) {
        frag_remaining = OPAL_THREAD_SUB_FETCH_SIZE_T(&frag->rdma_bytes_remaining, (size_t)rdma_length);
        SPC_USER_OR_MPI(sendreq->req_send.req_base.req_ompi.req_status.MPI_TAG, (ompi_spc_value_t)rdma_length,
                        OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI);

        if( 0 == frag_remaining ) {  /* this frag is now completed. Update the request and be done */
            OPAL_THREAD_ADD_FETCH_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
            /* drop the request's reference to the frag (kept for the
             * get->put fallback) before returning it to the free list */
            if( sendreq->rdma_frag == frag )
                sendreq->rdma_frag = NULL;
            MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
        }
    }

    send_request_pml_complete_check(sendreq);

    MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
 281 
 282 
 283 /**
 284  * Completion of a control message - return resources.
 285  */
 286 
 287 static void
 288 mca_pml_ob1_send_ctl_completion( mca_btl_base_module_t* btl,
 289                                  struct mca_btl_base_endpoint_t* ep,
 290                                  struct mca_btl_base_descriptor_t* des,
 291                                  int status )
 292 {
 293     mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
 294 
 295     /* check for pending requests */
 296     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 297 }
 298 
 299 /**
 300  * Completion of additional fragments of a large message - may need
 301  * to schedule additional fragments.
 302  */
 303 
/* BTL completion callback for a pipeline fragment of a large message.
 * Releases a pipeline-depth slot, accounts delivered bytes, and schedules
 * more fragments if the request is not yet complete. */
static void
mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
                             struct mca_btl_base_endpoint_t* ep,
                             struct mca_btl_base_descriptor_t* des,
                             int status )
{
    mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
    size_t req_bytes_delivered;

    /* check completion status; a failed fragment send is currently fatal */
    if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
        /* TSW - FIX */
        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
        ompi_rte_abort(-1, NULL);
    }

    /* count bytes of user data actually delivered (segment lengths minus
     * the per-fragment header) */
    req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
                                                                   des->des_segment_count,
                                                                   sizeof(mca_pml_ob1_frag_hdr_t));

    /* one fewer fragment outstanding in the pipeline */
    OPAL_THREAD_ADD_FETCH32(&sendreq->req_pipeline_depth, -1);
    OPAL_THREAD_ADD_FETCH_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
    SPC_USER_OR_MPI(sendreq->req_send.req_base.req_ompi.req_status.MPI_TAG, (ompi_spc_value_t)req_bytes_delivered,
                    OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI);

    /* not complete yet: schedule the next batch of fragments */
    if(send_request_pml_complete_check(sendreq) == false) {
        mca_pml_ob1_send_request_schedule(sendreq);
    }

    /* check for pending requests */
    MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
 338 
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
/**
 * This function is called when the copy of the frag from the GPU buffer
 * to the internal buffer is complete.  Used to support asynchronous
 * copies from GPU to host buffers. Now the data can be sent.
 */
static void
mca_pml_ob1_copy_frag_completion( mca_btl_base_module_t* btl,
                                  struct mca_btl_base_endpoint_t* ep,
                                  struct mca_btl_base_descriptor_t* des,
                                  int status )
{
    int rc;
    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;

    /* switch the callback so the real network send completes through the
     * normal fragment-completion path */
    des->des_cbfunc = mca_pml_ob1_frag_completion;
    /* Reset the BTL ownership flag as the BTL can free it after completion. */
    des->des_flags |= MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
    OPAL_OUTPUT((-1, "copy_frag_completion FRAG frag=%p", (void *)des));
    /* Currently, we cannot support a failure in the send.  In the blocking
     * case, the counters tracking the fragments being sent are not adjusted
     * until the function returns success, so it handles the error by leaving
     * all the buffer counters intact.  In this case, it is too late so
     * we just abort.  In theory, a new queue could be created to hold this
     * fragment and then attempt to send it out on another BTL. */
    rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG);
    if(OPAL_UNLIKELY(rc < 0)) {
        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
        ompi_rte_abort(-1, NULL);
    }
}
#endif /* OPAL_CUDA_SUPPORT */
 371 
 372 /**
 373  *  Buffer the entire message and mark as complete.
 374  */
 375 
/**
 * Start a buffered (bsend-style) rendezvous: pack the first `size` bytes
 * into a BTL buffer behind a rendezvous header, copy the remainder of the
 * user message into an attached bsend buffer so the request can complete
 * at the MPI level immediately, then send the first fragment.
 *
 * @param sendreq  send request to start
 * @param bml_btl  BTL to send the rendezvous fragment on
 * @param size     bytes of user data to carry in the first fragment
 * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE, or a pack/send error
 */
int mca_pml_ob1_send_request_start_buffered(
    mca_pml_ob1_send_request_t* sendreq,
    mca_bml_base_btl_t* bml_btl,
    size_t size)
{
    mca_btl_base_descriptor_t* des;
    mca_btl_base_segment_t* segment;
    mca_pml_ob1_hdr_t* hdr;
    struct iovec iov;
    unsigned int iov_count;
    size_t max_data, req_bytes_delivered;
    int rc;

    /* allocate descriptor */
    mca_bml_base_alloc(bml_btl, &des,
                       MCA_BTL_NO_ORDER,
                       sizeof(mca_pml_ob1_rendezvous_hdr_t) + size,
                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
                       MCA_BTL_DES_FLAGS_SIGNAL);
    if( OPAL_UNLIKELY(NULL == des) ) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    segment = des->des_segments;

    /* pack the data into the BTL supplied buffer, just past the header */
    iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
                                    sizeof(mca_pml_ob1_rendezvous_hdr_t));
    iov.iov_len = size;
    iov_count = 1;
    max_data = size;
    if((rc = opal_convertor_pack( &sendreq->req_send.req_base.req_convertor,
                                  &iov,
                                  &iov_count,
                                  &max_data)) < 0) {
        mca_bml_base_free(bml_btl, des);
        return rc;
    }
    /* max_data now holds the bytes actually packed */
    req_bytes_delivered = max_data;

    /* build rendezvous header */
    hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
    mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, 0,
                                        sendreq->req_send.req_base.req_comm->c_contextid,
                                        sendreq->req_send.req_base.req_comm->c_my_rank,
                                        sendreq->req_send.req_base.req_tag,
                                        (uint16_t)sendreq->req_send.req_base.req_sequence,
                                        sendreq->req_send.req_bytes_packed, sendreq);

    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);

    /* update lengths */
    segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data;

    des->des_cbfunc = mca_pml_ob1_rndv_completion;
    des->des_cbdata = sendreq;

    /* buffer the remainder of the message if it is not buffered yet
     * (req_addr == req_base.req_addr means we are still pointing at the
     * original user buffer) */
    if( OPAL_LIKELY(sendreq->req_send.req_addr == sendreq->req_send.req_base.req_addr) ) {
        rc = mca_pml_base_bsend_request_alloc((ompi_request_t*)sendreq);
        if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
            mca_bml_base_free(bml_btl, des);
            return rc;
        }

        /* pack everything past the eager portion into the bsend buffer */
        iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)sendreq->req_send.req_addr) + max_data);
        iov.iov_len = max_data = sendreq->req_send.req_bytes_packed - max_data;

        if((rc = opal_convertor_pack( &sendreq->req_send.req_base.req_convertor,
                                      &iov,
                                      &iov_count,
                                      &max_data)) < 0) {
            mca_bml_base_free(bml_btl, des);
            return rc;
        }

        /* re-init convertor for packed data */
        opal_convertor_prepare_for_send( &sendreq->req_send.req_base.req_convertor,
                                         &(ompi_mpi_byte.dt.super),
                                         sendreq->req_send.req_bytes_packed,
                                         sendreq->req_send.req_addr );
    }

    /* wait for ack and completion */
    sendreq->req_state = 2;

    /* request is complete at mpi level */
    MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, true);

    /* send; rc == 1 means the BTL completed the send inline, so the
     * completion callback will not fire and we account for it here */
    rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RNDV);
    if( OPAL_LIKELY( rc >= 0 ) ) {
        if( OPAL_LIKELY( 1 == rc ) ) {
            mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered);
        }
        return OMPI_SUCCESS;
    }
    mca_bml_base_free(bml_btl, des );
    return rc;
}
 475 
 476 
 477 /**
 478  *  We work on a buffered request with a size smaller than the eager size
 479  *  or the BTL is not able to send the data IN_PLACE. Request a segment
 480  *  that is used for initial hdr and any eager data. This is used only
 481  *  from the _START macro.
 482  */
int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
                                         mca_bml_base_btl_t* bml_btl,
                                         size_t size )
{
    mca_btl_base_descriptor_t* des = NULL;
    mca_btl_base_segment_t* segment;
    mca_pml_ob1_hdr_t* hdr;
    struct iovec iov;
    unsigned int iov_count;
    size_t max_data = size;
    int rc;

    /* fast path: if the BTL offers an immediate-send entry point, try it
     * before allocating a descriptor */
    if(NULL != bml_btl->btl->btl_sendi) {
        mca_pml_ob1_match_hdr_t match;
        mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
                                       sendreq->req_send.req_base.req_comm->c_contextid,
                                       sendreq->req_send.req_base.req_comm->c_my_rank,
                                       sendreq->req_send.req_base.req_tag,
                                       (uint16_t)sendreq->req_send.req_base.req_sequence);

        ob1_hdr_hton (&match, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);

        /* try to send immediately */
        rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor,
                                 &match, OMPI_PML_OB1_MATCH_HDR_LEN,
                                 size, MCA_BTL_NO_ORDER,
                                 MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
                                 MCA_PML_OB1_HDR_TYPE_MATCH,
                                 &des);
        if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) {
            /* signal request completion */
            SPC_USER_OR_MPI(sendreq->req_send.req_base.req_ompi.req_status.MPI_TAG, (ompi_spc_value_t)size,
                            OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI);
            send_request_pml_complete(sendreq);
            return OMPI_SUCCESS;
        }

        /* just in case the btl changed the convertor, reset it */
        if (size > 0 && NULL != des) {
            MCA_PML_OB1_SEND_REQUEST_RESET(sendreq);
        }
    } else {
        /* allocate descriptor */
        mca_bml_base_alloc( bml_btl, &des,
                            MCA_BTL_NO_ORDER,
                            OMPI_PML_OB1_MATCH_HDR_LEN + size,
                            MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    }
    /* either sendi failed without handing back a descriptor, or the
     * allocation above failed */
    if( OPAL_UNLIKELY(NULL == des) ) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    segment = des->des_segments;

    if(size > 0) {
        /* pack the data into the supplied buffer */
        iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
                                       OMPI_PML_OB1_MATCH_HDR_LEN);
        iov.iov_len  = size;
        iov_count    = 1;
        /*
         * Before copying the user buffer, make the target part
         * accessible.
         */
        MEMCHECKER(
            memchecker_call(&opal_memchecker_base_mem_defined,
                            sendreq->req_send.req_base.req_addr,
                            sendreq->req_send.req_base.req_count,
                            sendreq->req_send.req_base.req_datatype);
        );
        (void)opal_convertor_pack( &sendreq->req_send.req_base.req_convertor,
                                   &iov, &iov_count, &max_data );
         /*
          *  Packing finished, make the user buffer inaccessible again.
          */
        MEMCHECKER(
            memchecker_call(&opal_memchecker_base_mem_noaccess,
                            sendreq->req_send.req_base.req_addr,
                            sendreq->req_send.req_base.req_count,
                            sendreq->req_send.req_base.req_datatype);
        );
    }


    /* build match header */
    hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
    mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
                                   sendreq->req_send.req_base.req_comm->c_contextid,
                                   sendreq->req_send.req_base.req_comm->c_my_rank,
                                   sendreq->req_send.req_base.req_tag,
                                   (uint16_t)sendreq->req_send.req_base.req_sequence);

    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);

    /* update lengths */
    segment->seg_len = OMPI_PML_OB1_MATCH_HDR_LEN + max_data;

    /* short message */
    des->des_cbdata = sendreq;
    des->des_cbfunc = mca_pml_ob1_match_completion_free;

    /* send; rc == 1 means the send completed inline, so the completion
     * callback will not fire and we complete the request here */
    rc = mca_bml_base_send_status(bml_btl, des, MCA_PML_OB1_HDR_TYPE_MATCH);
    SPC_USER_OR_MPI(sendreq->req_send.req_base.req_ompi.req_status.MPI_TAG, (ompi_spc_value_t)size,
                    OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI);
    if( OPAL_LIKELY( rc >= OPAL_SUCCESS ) ) {
        if( OPAL_LIKELY( 1 == rc ) ) {
            mca_pml_ob1_match_completion_free_request( bml_btl, sendreq );
        }
        return OMPI_SUCCESS;
    }

    if (OMPI_ERR_RESOURCE_BUSY == rc) {
        /* No more resources. Allow the upper level to queue the send */
        rc = OMPI_ERR_OUT_OF_RESOURCE;
    }

    mca_bml_base_free (bml_btl, des);

    return rc;
}
 604 
 605 /**
 606  *  BTL can send directly from user buffer so allow the BTL
 607  *  to prepare the segment list. Start sending a small message.
 608  */
 609 
int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
                                            mca_bml_base_btl_t* bml_btl,
                                            size_t size )
{
    mca_btl_base_descriptor_t* des;
    mca_btl_base_segment_t* segment;
    mca_pml_ob1_hdr_t* hdr;
    int rc;

    /* prepare descriptor: let the BTL build the segment list directly
     * from the user buffer, reserving room for the match header */
    mca_bml_base_prepare_src( bml_btl,
                              &sendreq->req_send.req_base.req_convertor,
                              MCA_BTL_NO_ORDER,
                              OMPI_PML_OB1_MATCH_HDR_LEN,
                              &size,
                              MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
                              &des );
    if( OPAL_UNLIKELY(NULL == des) ) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    segment = des->des_segments;

    /* build match header */
    hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
    mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
                                   sendreq->req_send.req_base.req_comm->c_contextid,
                                   sendreq->req_send.req_base.req_comm->c_my_rank,
                                   sendreq->req_send.req_base.req_tag,
                                   (uint16_t)sendreq->req_send.req_base.req_sequence);

    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);

    /* short message */
    des->des_cbfunc = mca_pml_ob1_match_completion_free;
    des->des_cbdata = sendreq;

    /* send; rc == 1 means the send completed inline, so the completion
     * callback will not fire and we complete the request here */
    rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_MATCH);
    SPC_USER_OR_MPI(sendreq->req_send.req_base.req_ompi.req_status.MPI_TAG, (ompi_spc_value_t)size,
                    OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI);
    if( OPAL_LIKELY( rc >= OPAL_SUCCESS ) ) {
        if( OPAL_LIKELY( 1 == rc ) ) {
            mca_pml_ob1_match_completion_free_request( bml_btl, sendreq );
        }
        return OMPI_SUCCESS;
    }
    mca_bml_base_free(bml_btl, des );
    return rc;
}
 659 
 660 
 661 /**
 662  *  We have contigous data that is registered - schedule across
 663  *  available nics.
 664  */
 665 
 666 int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
 667                                          mca_bml_base_btl_t* bml_btl,
 668                                          size_t size )
 669 {
 670     /*
 671      * When req_rdma array is constructed the first element of the array always
 672      * assigned different btl in round robin fashion (if there are more than
 673      * one RDMA capable BTLs). This way round robin distribution of RDMA
 674      * operation is achieved.
 675      */
 676     mca_btl_base_registration_handle_t *local_handle;
 677     mca_btl_base_descriptor_t *des;
 678     mca_pml_ob1_rdma_frag_t *frag;
 679     mca_pml_ob1_rget_hdr_t *hdr;
 680     size_t reg_size;
 681     void *data_ptr;
 682     int rc;
 683 
 684     bml_btl = sendreq->req_rdma[0].bml_btl;
 685     if (!(bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
 686         sendreq->rdma_frag = NULL;
 687         /* This BTL does not support get. Use rendezvous to start the RDMA operation using put instead. */
 688         return mca_pml_ob1_send_request_start_rndv (sendreq, bml_btl, 0, MCA_PML_OB1_HDR_FLAGS_CONTIG |
 689                                                     MCA_PML_OB1_HDR_FLAGS_PIN);
 690     }
 691 
 692     /* at this time ob1 does not support non-contiguous gets. the convertor represents a
 693      * contiguous block of memory */
 694     opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);
 695 
 696     local_handle = sendreq->req_rdma[0].btl_reg;
 697 
 698     /* allocate an rdma fragment to keep track of the request size for use in the fin message */
 699     MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
 700     if (OPAL_UNLIKELY(NULL == frag)) {
 701         return OPAL_ERR_OUT_OF_RESOURCE;
 702     }
 703 
 704     /* fill in necessary fragment data */
 705     frag->rdma_req = sendreq;
 706     frag->rdma_bml = bml_btl;
 707     frag->rdma_length = size;
 708     frag->rdma_bytes_remaining = size;
 709     frag->cbfunc = mca_pml_ob1_rget_completion;
 710     /* do not store the local handle in the fragment. it will be released by mca_pml_ob1_free_rdma_resources */
 711 
 712     reg_size = bml_btl->btl->btl_registration_handle_size;
 713 
 714     /* allocate space for get hdr + segment list */
 715     mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + reg_size,
 716                        MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
 717                        MCA_BTL_DES_FLAGS_SIGNAL);
 718     if( OPAL_UNLIKELY(NULL == des) ) {
 719         /* NTH: no need to reset the converter here. it will be reset before it is retried */
 720         MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
 721         return OMPI_ERR_OUT_OF_RESOURCE;
 722     }
 723 
 724     /* save the fragment for get->put fallback */
 725     sendreq->rdma_frag = frag;
 726 
 727     /* build match header */
 728     hdr = (mca_pml_ob1_rget_hdr_t *) des->des_segments->seg_addr.pval;
 729     /* TODO -- Add support for multiple segments for get */
 730     mca_pml_ob1_rget_hdr_prepare (hdr, MCA_PML_OB1_HDR_FLAGS_CONTIG | MCA_PML_OB1_HDR_FLAGS_PIN,
 731                                   sendreq->req_send.req_base.req_comm->c_contextid,
 732                                   sendreq->req_send.req_base.req_comm->c_my_rank,
 733                                   sendreq->req_send.req_base.req_tag,
 734                                   (uint16_t)sendreq->req_send.req_base.req_sequence,
 735                                   sendreq->req_send.req_bytes_packed, sendreq,
 736                                   frag, data_ptr, local_handle, reg_size);
 737 
 738     ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RGET, sendreq->req_send.req_base.req_proc);
 739 
 740     des->des_cbfunc = mca_pml_ob1_send_ctl_completion;
 741     des->des_cbdata = sendreq;
 742 
 743     /**
 744      * Well, it's a get so we will not know when the peer will get the data anyway.
 745      * If we generate the PERUSE event here, at least we will know when we
 746      * sent the GET message ...
 747      */
 748     if( sendreq->req_send.req_bytes_packed > 0 ) {
 749         PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
 750                                  &(sendreq->req_send.req_base), PERUSE_SEND );
 751     }
 752 
 753     /* send */
 754     rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RGET);
 755     if (OPAL_UNLIKELY(rc < 0)) {
 756         mca_bml_base_free(bml_btl, des);
 757         return rc;
 758     }
 759 
 760     return OMPI_SUCCESS;
 761 }
 762 
 763 
 764 /**
 765  *  Rendezvous is required. Not doing rdma so eager send up to
 766  *  the btls eager limit.
 767  */
 768 
 769 int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
 770                                          mca_bml_base_btl_t* bml_btl,
 771                                          size_t size,
 772                                          int flags )
 773 {
 774     mca_btl_base_descriptor_t* des;
 775     mca_btl_base_segment_t* segment;
 776     mca_pml_ob1_hdr_t* hdr;
 777     int rc;
 778 
 779     /* prepare descriptor */
 780     if(size == 0) {
 781         mca_bml_base_alloc( bml_btl,
 782                             &des,
 783                             MCA_BTL_NO_ORDER,
 784                             sizeof(mca_pml_ob1_rendezvous_hdr_t),
 785                             MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP );
 786     } else {
 787         MEMCHECKER(
 788             memchecker_call(&opal_memchecker_base_mem_defined,
 789                             sendreq->req_send.req_base.req_addr,
 790                             sendreq->req_send.req_base.req_count,
 791                             sendreq->req_send.req_base.req_datatype);
 792         );
 793         mca_bml_base_prepare_src( bml_btl,
 794                                   &sendreq->req_send.req_base.req_convertor,
 795                                   MCA_BTL_NO_ORDER,
 796                                   sizeof(mca_pml_ob1_rendezvous_hdr_t),
 797                                   &size,
 798                                   MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
 799                                   MCA_BTL_DES_FLAGS_SIGNAL,
 800                                   &des );
 801         MEMCHECKER(
 802             memchecker_call(&opal_memchecker_base_mem_noaccess,
 803                             sendreq->req_send.req_base.req_addr,
 804                             sendreq->req_send.req_base.req_count,
 805                             sendreq->req_send.req_base.req_datatype);
 806         );
 807     }
 808 
 809     if( OPAL_UNLIKELY(NULL == des) ) {
 810         return OMPI_ERR_OUT_OF_RESOURCE;
 811     }
 812     segment = des->des_segments;
 813 
 814     /* build hdr */
 815     hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
 816     mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, flags |
 817                                         MCA_PML_OB1_HDR_FLAGS_SIGNAL,
 818                                         sendreq->req_send.req_base.req_comm->c_contextid,
 819                                         sendreq->req_send.req_base.req_comm->c_my_rank,
 820                                         sendreq->req_send.req_base.req_tag,
 821                                         (uint16_t)sendreq->req_send.req_base.req_sequence,
 822                                         sendreq->req_send.req_bytes_packed, sendreq);
 823 
 824     ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);
 825 
 826     /* first fragment of a long message */
 827     des->des_cbdata = sendreq;
 828     des->des_cbfunc = mca_pml_ob1_rndv_completion;
 829 
 830     /* wait for ack and completion */
 831     sendreq->req_state = 2;
 832 
 833     /* send */
 834     rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RNDV);
 835     if( OPAL_LIKELY( rc >= 0 ) ) {
 836         if( OPAL_LIKELY( 1 == rc ) ) {
 837             mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, size );
 838         }
 839         return OMPI_SUCCESS;
 840     }
 841     mca_bml_base_free(bml_btl, des );
 842     return rc;
 843 }
 844 
 845 void mca_pml_ob1_send_request_copy_in_out( mca_pml_ob1_send_request_t *sendreq,
 846                                            uint64_t send_offset,
 847                                            uint64_t send_length )
 848 {
 849     mca_pml_ob1_send_range_t *sr;
 850     opal_free_list_item_t *i;
 851     mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint;
 852     int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
 853     int n;
 854     double weight_total = 0;
 855 
 856     if( OPAL_UNLIKELY(0 == send_length) )
 857         return;
 858 
 859     i = opal_free_list_wait (&mca_pml_ob1.send_ranges);
 860 
 861     sr = (mca_pml_ob1_send_range_t*)i;
 862 
 863     sr->range_send_offset = send_offset;
 864     sr->range_send_length = send_length;
 865     sr->range_btl_idx = 0;
 866 
 867     for(n = 0; n < num_btls && n < mca_pml_ob1.max_send_per_range; n++) {
 868         sr->range_btls[n].bml_btl =
 869             mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send);
 870         weight_total += sr->range_btls[n].bml_btl->btl_weight;
 871     }
 872 
 873     sr->range_btl_cnt = n;
 874     mca_pml_ob1_calc_weighted_length(sr->range_btls, n, send_length,
 875             weight_total);
 876 
 877     OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
 878     opal_list_append(&sendreq->req_send_ranges, (opal_list_item_t*)sr);
 879     OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
 880 }
 881 
 882 static inline mca_pml_ob1_send_range_t *
 883 get_send_range_nolock(mca_pml_ob1_send_request_t* sendreq)
 884 {
 885     opal_list_item_t *item;
 886 
 887     item = opal_list_get_first(&sendreq->req_send_ranges);
 888 
 889     if(opal_list_get_end(&sendreq->req_send_ranges) == item)
 890         return NULL;
 891 
 892     return (mca_pml_ob1_send_range_t*)item;
 893 }
 894 
 895 static inline mca_pml_ob1_send_range_t *
 896 get_send_range(mca_pml_ob1_send_request_t* sendreq)
 897 {
 898     mca_pml_ob1_send_range_t *range;
 899 
 900     OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
 901     range = get_send_range_nolock(sendreq);
 902     OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
 903 
 904     return range;
 905 }
 906 
 907 static inline mca_pml_ob1_send_range_t *
 908 get_next_send_range(mca_pml_ob1_send_request_t* sendreq,
 909         mca_pml_ob1_send_range_t *range)
 910 {
 911     OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
 912     opal_list_remove_item(&sendreq->req_send_ranges, (opal_list_item_t *)range);
 913     opal_free_list_return (&mca_pml_ob1.send_ranges, &range->base);
 914     range = get_send_range_nolock(sendreq);
 915     OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
 916 
 917     return range;
 918 }
 919 
 920 /**
 921  *  Schedule pipeline of send descriptors for the given request.
 922  *  Up to the rdma threshold. If this is a send based protocol,
 923  *  the rdma threshold is the end of the message. Otherwise, schedule
 924  *  fragments up to the threshold to overlap initial registration/setup
 925  *  costs of the rdma. Only one thread can be inside this function.
 926  */
 927 
 928 int
 929 mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq)
 930 {
 931     size_t prev_bytes_remaining = 0;
 932     mca_pml_ob1_send_range_t *range;
 933     int num_fail = 0;
 934 
 935     /* check pipeline_depth here before attempting to get any locks */
 936     if(true == sendreq->req_throttle_sends &&
 937        sendreq->req_pipeline_depth >= mca_pml_ob1.send_pipeline_depth)
 938         return OMPI_SUCCESS;
 939 
 940     range = get_send_range(sendreq);
 941 
 942     while(range && (false == sendreq->req_throttle_sends ||
 943           sendreq->req_pipeline_depth < mca_pml_ob1.send_pipeline_depth)) {
 944         mca_pml_ob1_frag_hdr_t* hdr;
 945         mca_btl_base_descriptor_t* des;
 946         int rc, btl_idx;
 947         size_t size, offset, data_remaining = 0;
 948         mca_bml_base_btl_t* bml_btl;
 949 
 950         assert(range->range_send_length != 0);
 951 
 952         if(prev_bytes_remaining == range->range_send_length)
 953             num_fail++;
 954         else
 955             num_fail = 0;
 956 
 957         prev_bytes_remaining = range->range_send_length;
 958 
 959         if( OPAL_UNLIKELY(num_fail == range->range_btl_cnt) ) {
 960             /*TODO : assert(sendreq->req_pending == MCA_PML_OB1_SEND_PENDING_NONE); */
 961             add_request_to_send_pending(sendreq,
 962                     MCA_PML_OB1_SEND_PENDING_SCHEDULE, true);
 963             /* Note that request remains locked. send_request_process_pending()
 964              * function will call shedule_exclusive() directly without taking
 965              * the lock */
 966             return OMPI_ERR_OUT_OF_RESOURCE;
 967         }
 968 
 969 cannot_pack:
 970         do {
 971             btl_idx = range->range_btl_idx;
 972             if(++range->range_btl_idx == range->range_btl_cnt)
 973                 range->range_btl_idx = 0;
 974         } while(!range->range_btls[btl_idx].length);
 975 
 976         bml_btl = range->range_btls[btl_idx].bml_btl;
 977         /* If there is a remaining data from another BTL that was too small
 978          * for converter to pack then send it through another BTL */
 979         range->range_btls[btl_idx].length += data_remaining;
 980         size = range->range_btls[btl_idx].length;
 981 
 982         /* makes sure that we don't exceed BTL max send size */
 983         if(bml_btl->btl->btl_max_send_size != 0) {
 984 #if OPAL_CUDA_SUPPORT
 985             size_t max_send_size;
 986             if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) && (bml_btl->btl->btl_cuda_max_send_size != 0)) {
 987                 max_send_size = bml_btl->btl->btl_cuda_max_send_size - sizeof(mca_pml_ob1_frag_hdr_t);
 988             } else {
 989                 max_send_size = bml_btl->btl->btl_max_send_size - sizeof(mca_pml_ob1_frag_hdr_t);
 990             }
 991 #else /* OPAL_CUDA_SUPPORT */
 992             size_t max_send_size = bml_btl->btl->btl_max_send_size -
 993                 sizeof(mca_pml_ob1_frag_hdr_t);
 994 #endif /* OPAL_CUDA_SUPPORT */
 995             if (size > max_send_size) {
 996                 size = max_send_size;
 997             }
 998         }
 999 
1000         /* pack into a descriptor */
1001         offset = (size_t)range->range_send_offset;
1002         opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor,
1003                                     &offset);
1004         range->range_send_offset = (uint64_t)offset;
1005 
1006         data_remaining = size;
1007         MEMCHECKER(
1008             memchecker_call(&opal_memchecker_base_mem_defined,
1009                             sendreq->req_send.req_base.req_addr,
1010                             sendreq->req_send.req_base.req_count,
1011                             sendreq->req_send.req_base.req_datatype);
1012         );
1013         mca_bml_base_prepare_src(bml_btl, &sendreq->req_send.req_base.req_convertor,
1014                                  MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_frag_hdr_t),
1015                                  &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
1016                                  MCA_BTL_DES_FLAGS_SIGNAL, &des);
1017         MEMCHECKER(
1018             memchecker_call(&opal_memchecker_base_mem_noaccess,
1019                             sendreq->req_send.req_base.req_addr,
1020                             sendreq->req_send.req_base.req_count,
1021                             sendreq->req_send.req_base.req_datatype);
1022         );
1023 
1024         if( OPAL_UNLIKELY(des == NULL || size == 0) ) {
1025             if(des) {
1026                 /* Converter can't pack this chunk. Append to another chunk
1027                  * from other BTL */
1028                 mca_bml_base_free(bml_btl, des);
1029                 range->range_btls[btl_idx].length -= data_remaining;
1030                 goto cannot_pack;
1031             }
1032             continue;
1033         }
1034 
1035         des->des_cbfunc = mca_pml_ob1_frag_completion;
1036         des->des_cbdata = sendreq;
1037 
1038         /* setup header */
1039         hdr = (mca_pml_ob1_frag_hdr_t*)des->des_segments->seg_addr.pval;
1040         mca_pml_ob1_frag_hdr_prepare (hdr, 0, range->range_send_offset, sendreq,
1041                                       sendreq->req_recv.lval);
1042 
1043         ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FRAG,
1044                 sendreq->req_send.req_base.req_proc);
1045 
1046 #if OMPI_WANT_PERUSE
1047          PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE,
1048                  &(sendreq->req_send.req_base), size, PERUSE_SEND);
1049 #endif  /* OMPI_WANT_PERUSE */
1050 
1051 #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
1052          /* At this point, check to see if the BTL is doing an asynchronous
1053           * copy.  This would have been initiated in the mca_bml_base_prepare_src
1054           * called above.  The flag is checked here as we let the hdr be
1055           * set up prior to checking.
1056           */
1057         if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) {
1058             OPAL_OUTPUT((-1, "Initiating async copy on FRAG frag=%p", (void *)des));
1059             /* Need to make sure BTL does not free frag after completion
1060              * of asynchronous copy as we still need to send the fragment. */
1061             des->des_flags &= ~MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
1062             /* Unclear that this flag needs to be set but to be sure, set it */
1063             des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
1064             des->des_cbfunc = mca_pml_ob1_copy_frag_completion;
1065             range->range_btls[btl_idx].length -= size;
1066             range->range_send_length -= size;
1067             range->range_send_offset += size;
1068             OPAL_THREAD_ADD_FETCH32(&sendreq->req_pipeline_depth, 1);
1069             if(range->range_send_length == 0) {
1070                 range = get_next_send_range(sendreq, range);
1071                 prev_bytes_remaining = 0;
1072             }
1073             continue;
1074         }
1075 #endif /* OPAL_CUDA_SUPPORT */
1076 
1077         /* initiate send - note that this may complete before the call returns */
1078         rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG);
1079         if( OPAL_LIKELY(rc >= 0) ) {
1080             /* update state */
1081             range->range_btls[btl_idx].length -= size;
1082             range->range_send_length -= size;
1083             range->range_send_offset += size;
1084             OPAL_THREAD_ADD_FETCH32(&sendreq->req_pipeline_depth, 1);
1085             if(range->range_send_length == 0) {
1086                 range = get_next_send_range(sendreq, range);
1087                 prev_bytes_remaining = 0;
1088             }
1089         } else {
1090             mca_bml_base_free(bml_btl,des);
1091         }
1092     }
1093 
1094     return OMPI_SUCCESS;
1095 }
1096 
1097 
1098 /**
1099  * A put fragment could not be started. Queue the fragment to be retried later or
1100  * fall back on send/recv.
1101  */
1102 static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
1103 {
1104     mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
1105     mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
1106 
1107     if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
1108         /* queue the frag for later if there was a resource error */
1109         OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
1110         opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
1111         OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
1112     } else {
1113         /* tell receiver to deregister memory */
1114         mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
1115                               frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
1116                               OPAL_ERR_TEMP_OUT_OF_RESOURCE);
1117 
1118         /* send fragment by copy in/out */
1119         mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
1120                                              frag->rdma_length);
1121         /* if a pointer to a receive request is not set it means that
1122          * ACK was not yet received. Don't schedule sends before ACK */
1123         if (NULL != sendreq->req_recv.pval)
1124             mca_pml_ob1_send_request_schedule (sendreq);
1125     }
1126 }
1127 
1128 /**
1129  *  An RDMA put operation has completed:
1130  *  (1) Update request status and if required set completed
1131  *  (2) Send FIN control message to the destination
1132  */
1133 
1134 static void mca_pml_ob1_put_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
1135                                         void *local_address, mca_btl_base_registration_handle_t *local_handle,
1136                                         void *context, void *cbdata, int status)
1137 {
1138     mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
1139     mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
1140     mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;
1141 
1142     /* check completion status */
1143     if( OPAL_UNLIKELY(OMPI_SUCCESS == status) ) {
1144         /* TODO -- read ordering */
1145         mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
1146                               frag->rdma_hdr.hdr_rdma.hdr_frag, frag->rdma_length,
1147                               0, 0);
1148 
1149         /* check for request completion */
1150         OPAL_THREAD_ADD_FETCH_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
1151         SPC_USER_OR_MPI(sendreq->req_send.req_base.req_ompi.req_status.MPI_TAG, (ompi_spc_value_t)frag->rdma_length,
1152                         OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI);
1153 
1154         send_request_pml_complete_check(sendreq);
1155     } else {
1156         /* try to fall back on send/recv */
1157         mca_pml_ob1_send_request_put_frag_failed (frag, status);
1158     }
1159 
1160     MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
1161 
1162     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
1163 }
1164 
/**
 * Issue the RDMA put for a single fragment: obtain (or reuse) a local
 * registration handle for the source region, then hand the put to the
 * BML.  On failure the fragment is queued for retry or falls back to
 * copy in/out via mca_pml_ob1_send_request_put_frag_failed().
 *
 * @param frag  fragment describing local/remote addresses and length
 * @return OMPI_SUCCESS or the error code returned by the put
 */
int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
{
    mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
    mca_btl_base_registration_handle_t *local_handle = NULL;
    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
    int rc;

    /* registration is only needed when the BTL requires it and the fragment
     * does not already own a handle */
    if (bml_btl->btl->btl_register_mem && NULL == frag->local_handle) {
        /* Check if the segment is already registered */
        for (size_t i = 0 ; i < sendreq->req_rdma_cnt ; ++i) {
            if (sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
                /* do not copy the handle to the fragment to avoid deregistring it twice */
                local_handle = sendreq->req_rdma[i].btl_reg;
                break;
            }
        }

        if (NULL == frag->local_handle) {
            /* Not already registered. Register the region with the BTL.
             * NOTE: this branch also runs when the loop above found a
             * request-owned handle (local_handle != NULL); in that case the
             * found handle is used and frag->local_handle stays NULL so the
             * fragment never deregisters it. */
            mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, 0,
                                       &frag->local_handle);

            if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
                mca_pml_ob1_send_request_put_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);

                return OMPI_ERR_OUT_OF_RESOURCE;
            }

            local_handle = frag->local_handle;
        }
    }

    PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
                                  &(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), frag->rdma_length, PERUSE_SEND );

    rc = mca_bml_base_put (bml_btl, frag->local_address, frag->remote_address, local_handle,
                           (mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
                           0, MCA_BTL_NO_ORDER, mca_pml_ob1_put_completion, frag);
    /* Count the bytes put even though they probably haven't been sent yet */
    SPC_RECORD(OMPI_SPC_BYTES_PUT, (ompi_spc_value_t)frag->rdma_length);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        mca_pml_ob1_send_request_put_frag_failed (frag, rc);
        return rc;
    }

    return OMPI_SUCCESS;
}
1212 
1213 /**
1214  *  Receiver has scheduled an RDMA operation:
1215  *  (1) Allocate an RDMA fragment to maintain the state of the operation
1216  *  (2) Call BTL prepare_src to pin/prepare source buffers
1217  *  (3) Queue the RDMA put
1218  */
1219 
1220 void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
1221                                    mca_btl_base_module_t* btl,
1222                                    mca_pml_ob1_rdma_hdr_t* hdr )
1223 {
1224     mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint;
1225     mca_pml_ob1_rdma_frag_t* frag;
1226 
1227     if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) {
1228         OPAL_THREAD_ADD_FETCH32(&sendreq->req_state, -1);
1229     }
1230 
1231     sendreq->req_recv.pval = hdr->hdr_recv_req.pval;
1232 
1233     if (NULL == sendreq->rdma_frag) {
1234         MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
1235 
1236         if( OPAL_UNLIKELY(NULL == frag) ) {
1237             /* TSW - FIX */
1238             OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
1239             ompi_rte_abort(-1, NULL);
1240         }
1241     } else {
1242         /* rget fallback on put */
1243         frag = sendreq->rdma_frag;
1244         sendreq->rdma_frag = NULL;
1245         sendreq->req_state = 0;
1246     }
1247 
1248     /* copy registration data */
1249     memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);
1250 
1251     frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
1252     frag->rdma_hdr.hdr_rdma = *hdr;
1253     frag->rdma_req = sendreq;
1254     frag->rdma_length = hdr->hdr_dst_size;
1255     frag->rdma_state = MCA_PML_OB1_RDMA_PUT;
1256     frag->remote_address = hdr->hdr_dst_ptr;
1257     frag->retries = 0;
1258 
1259     /* Get the address of the current offset. Note: at this time ob1 CAN NOT handle
1260      * non-contiguous RDMA. If that changes this code will be wrong. */
1261     opal_convertor_get_offset_pointer (&sendreq->req_send.req_base.req_convertor,
1262                                        hdr->hdr_rdma_offset, &frag->local_address);
1263 
1264     mca_pml_ob1_send_request_put_frag(frag);
1265 }

/* [<][>][^][v][top][bottom][index][help] */