root/ompi/mca/osc/rdma/osc_rdma_comm.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. ompi_osc_rdma_cleanup_rdma
  2. ompi_osc_get_data_complete
  3. ompi_osc_get_data_blocking
  4. ompi_osc_rdma_master_noncontig
  5. ompi_osc_rdma_master
  6. ompi_osc_rdma_copy_local
  7. ompi_osc_rdma_put_complete
  8. ompi_osc_rdma_put_complete_flush
  9. ompi_osc_rdma_put_real
  10. ompi_osc_rdma_put_contig
  11. ompi_osc_rdma_get_complete
  12. ompi_osc_rdma_get_partial
  13. ompi_osc_rdma_get_contig
  14. ompi_osc_rdma_put_w_req
  15. ompi_osc_rdma_get_w_req
  16. ompi_osc_rdma_put
  17. ompi_osc_rdma_rput
  18. ompi_osc_rdma_get
  19. ompi_osc_rdma_rget

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2014-2018 Los Alamos National Security, LLC.  All rights
   4  *                         reserved.
   5  * Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
   6  * Copyright (c) 2017      Research Organization for Information Science
   7  *                         and Technology (RIST). All rights reserved.
   8  * Copyright (c) 2017      IBM Corporation. All rights reserved.
   9  * $COPYRIGHT$
  10  *
  11  * Additional copyrights may follow
  12  *
  13  * $HEADER$
  14  */
  15 
  16 #include "osc_rdma_comm.h"
  17 #include "osc_rdma_sync.h"
  18 #include "osc_rdma_request.h"
  19 #include "osc_rdma_dynamic.h"
  20 
  21 #include "ompi/mca/osc/base/osc_base_obj_convert.h"
  22 #include "opal/align.h"
  23 
  24 /* helper functions */
  25 static inline void ompi_osc_rdma_cleanup_rdma (ompi_osc_rdma_sync_t *sync, bool dec_always, ompi_osc_rdma_frag_t *frag,
  26                                                mca_btl_base_registration_handle_t *handle, ompi_osc_rdma_request_t *request)
  27 {
  28     if (frag) {
  29         ompi_osc_rdma_frag_complete (frag);
  30     } else {
  31         ompi_osc_rdma_deregister (sync->module, handle);
  32     }
  33 
  34     if (request) {
  35         (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1);
  36     }
  37 
  38     if (dec_always) {
  39         ompi_osc_rdma_sync_rdma_dec_always (sync);
  40     } else {
  41         ompi_osc_rdma_sync_rdma_dec (sync);
  42     }
  43 }
  44 
/* forward declaration: ompi_osc_rdma_get_partial() calls ompi_osc_rdma_get_contig() before the
 * latter is defined further down in this file */
static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
                                     mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
                                     ompi_osc_rdma_request_t *request);
  48 
  49 static void ompi_osc_get_data_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
  50                                         void *local_address, mca_btl_base_registration_handle_t *local_handle,
  51                                         void *context, void *data, int status)
  52 {
  53     assert (OPAL_SUCCESS == status);
  54     ((bool *) context)[0]  = true;
  55 }
  56 
  57 int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
  58                                 uint64_t source_address, mca_btl_base_registration_handle_t *source_handle,
  59                                 void *data, size_t len)
  60 {
  61     const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
  62     mca_btl_base_registration_handle_t *local_handle = NULL;
  63     ompi_osc_rdma_frag_t *frag = NULL;
  64     volatile bool read_complete = false;
  65     size_t aligned_len, offset;
  66     uint64_t aligned_addr = source_address & ~btl_alignment_mask;
  67     char *ptr = data;
  68     int ret;
  69 
  70     offset = source_address & btl_alignment_mask;
  71     aligned_len = (len + offset + btl_alignment_mask) & ~btl_alignment_mask;
  72 
  73     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "reading data from endpoint %p. source: 0x%" PRIx64 " (aligned: 0x%" PRIx64
  74                      "), len: %lu (aligned: %lu)", (void *) endpoint, source_address, aligned_addr, (unsigned long) len,
  75                      (unsigned long) aligned_len);
  76 
  77     if (module->selected_btl->btl_register_mem && len >= module->selected_btl->btl_get_local_registration_threshold) {
  78         do {
  79             ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
  80             if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret)) {
  81                 ompi_osc_rdma_progress (module);
  82             }
  83         } while (OMPI_ERR_OUT_OF_RESOURCE == ret);
  84 
  85         if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
  86             OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "error allocating temporary buffer");
  87             return ret;
  88         }
  89 
  90         local_handle = frag->handle;
  91         OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocated temporary buffer %p in fragment %p", (void*)ptr,
  92                          (void *) frag);
  93     }
  94 
  95     assert (!(source_address & ALIGNMENT_MASK(module->selected_btl->btl_get_alignment)));
  96 
  97     do {
  98         ret = module->selected_btl->btl_get (module->selected_btl, endpoint, ptr, aligned_addr,
  99                                              local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER,
 100                                              ompi_osc_get_data_complete, (void *) &read_complete, NULL);
 101         if (!ompi_osc_rdma_oor (ret)) {
 102             break;
 103         }
 104 
 105         ompi_osc_rdma_progress (module);
 106     } while (1);
 107 
 108     if (OPAL_UNLIKELY(OMPI_SUCCESS > ret)) {
 109         OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "btl get failed with opal error code %d", ret);
 110 
 111         if (frag) {
 112             ompi_osc_rdma_frag_complete (frag);
 113         }
 114 
 115         return ret;
 116     }
 117 
 118     /* block until the callback is called */
 119     while (!read_complete) {
 120         ompi_osc_rdma_progress (module);
 121     }
 122 
 123     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "finished reading state data from endpoint %p", (void *) endpoint);
 124 
 125     opal_memchecker_base_mem_defined (ptr, len);
 126 
 127     if (frag) {
 128         memcpy (data, ptr + offset, len);
 129 
 130         /* done with the fragment */
 131         ompi_osc_rdma_frag_complete (frag);
 132     }
 133 
 134     return OMPI_SUCCESS;
 135 }
 136 
/**
 * @brief function signature for the rdma transfer function used by ompi_osc_rdma_master_noncontig()
 *
 * @param[in] sync            osc rdma sync object the transfer is issued under
 * @param[in] peer            peer object for remote peer
 * @param[in] remote_address  base of remote region (destination for put, source for get)
 * @param[in] remote_handle   btl registration handle for remote region (must be valid for the entire region)
 * @param[in] local_address   base of local region (source for put, destination for get)
 * @param[in] size            number of bytes to transfer
 * @param[in] request         osc rdma request if used (can be NULL)
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_OUT_OF_RESOURCE on temporary error
 * @returns other OMPI error on fatal error
 *
 * This function does the work of scheduling a contiguous transfer between the local and remote regions.
 */
typedef int (*ompi_osc_rdma_fn_t) (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t remote_address,
                                   mca_btl_base_registration_handle_t *remote_handle, void *local_address, size_t size,
                                   ompi_osc_rdma_request_t *request);
 157 
 158 /**
 159  * @brief break down rdma transaction into contiguous regions
 160  *
 161  * @param[in] local_address    base of local region (source for put, destination for get)
 162  * @param[in] local_count      number of elements in local region
 163  * @param[in] local_datatype   datatype of local region
 164  * @param[in] peer             peer object for remote peer
 165  * @param[in] remote_address   base of remote region (destination for put, source for get)
 166  * @param[in] remote_handle    btl registration handle for remote region (must be valid for the entire region)
 167  * @param[in] remote_count     number of elements in remote region
 168  * @param[in] remote_datatype  datatype of remote region
 169  * @param[in] module           osc rdma module
 170  * @param[in] request          osc rdma request if used (can be NULL)
 171  * @param[in] max_rdma_len     maximum length of an rdma request (usually btl limitation)
 172  * @param[in] rdma_fn          function to use for contiguous rdma operations
 173  * @param[in] alloc_reqs       true if rdma_fn requires a valid request object (any allocated objects will be marked internal)
 174  *
 175  * This function does the work of breaking a non-contiguous rdma transfer into contiguous components. It will
 176  * continue to submit rdma transfers until the entire region is transferred or a fatal error occurs.
 177  */
 178 static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *local_address, int local_count, ompi_datatype_t *local_datatype,
 179                                            ompi_osc_rdma_peer_t *peer, uint64_t remote_address,
 180                                            mca_btl_base_registration_handle_t *remote_handle, int remote_count,
 181                                            ompi_datatype_t *remote_datatype, ompi_osc_rdma_request_t *request, const size_t max_rdma_len,
 182                                            const ompi_osc_rdma_fn_t rdma_fn, const bool alloc_reqs)
 183 {
 184     ompi_osc_rdma_module_t *module = sync->module;
 185     struct iovec local_iovec[OMPI_OSC_RDMA_DECODE_MAX], remote_iovec[OMPI_OSC_RDMA_DECODE_MAX];
 186     opal_convertor_t local_convertor, remote_convertor;
 187     uint32_t local_iov_count, remote_iov_count;
 188     uint32_t local_iov_index, remote_iov_index;
 189     /* needed for opal_convertor_raw but not used */
 190     size_t local_size, remote_size, rdma_len;
 191     ompi_osc_rdma_request_t *subreq;
 192     int ret;
 193     bool done;
 194 
 195     subreq = NULL;
 196 
 197     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "scheduling rdma on non-contiguous datatype(s) or large region");
 198 
 199     /* prepare convertors for the source and target. these convertors will be used to determine the
 200      * contiguous segments within the source and target. */
 201     OBJ_CONSTRUCT(&remote_convertor, opal_convertor_t);
 202     ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &remote_datatype->super, remote_count,
 203                                                     (void *) (intptr_t) remote_address, 0, &remote_convertor);
 204     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 205         return ret;
 206     }
 207 
 208     OBJ_CONSTRUCT(&local_convertor, opal_convertor_t);
 209     ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &local_datatype->super, local_count,
 210                                                     local_address, 0, &local_convertor);
 211     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 212         return ret;
 213     }
 214 
 215     if (request) {
 216         /* keep the request from completing until all the transfers have started */
 217         request->outstanding_requests = 1;
 218     }
 219 
 220     local_iov_index = 0;
 221     local_iov_count = 0;
 222 
 223     do {
 224         /* decode segments of the remote data */
 225         remote_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
 226         remote_iov_index = 0;
 227 
 228         /* opal_convertor_raw returns true when it has reached the end of the data */
 229         done = opal_convertor_raw (&remote_convertor, remote_iovec, &remote_iov_count, &remote_size);
 230 
 231         /* loop on the target segments until we have exhaused the decoded source data */
 232         while (remote_iov_index != remote_iov_count) {
 233             if (local_iov_index == local_iov_count) {
 234                 /* decode segments of the target buffer */
 235                 local_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
 236                 local_iov_index = 0;
 237                 (void) opal_convertor_raw (&local_convertor, local_iovec, &local_iov_count, &local_size);
 238             }
 239 
 240             /* we already checked that the target was large enough. this should be impossible */
 241             assert (0 != local_iov_count);
 242 
 243             /* determine how much to transfer in this operation */
 244             rdma_len = min(min(local_iovec[local_iov_index].iov_len, remote_iovec[remote_iov_index].iov_len), max_rdma_len);
 245 
 246             /* execute the get */
 247             if (!subreq && alloc_reqs) {
 248                 OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
 249                 subreq->internal = true;
 250                 subreq->type = OMPI_OSC_RDMA_TYPE_RDMA;
 251                 subreq->parent_request = request;
 252 
 253                 if (request) {
 254                     (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, 1);
 255                 }
 256             } else if (!alloc_reqs) {
 257                 subreq = request;
 258             }
 259 
 260             OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing rdma on contiguous region. local: %p, remote: %p, len: %lu",
 261                              local_iovec[local_iov_index].iov_base, remote_iovec[remote_iov_index].iov_base,
 262                              (unsigned long) remote_iovec[remote_iov_index].iov_len);
 263 
 264             ret = rdma_fn (sync, peer, (uint64_t) (intptr_t) remote_iovec[remote_iov_index].iov_base, remote_handle,
 265                            local_iovec[local_iov_index].iov_base, rdma_len, subreq);
 266             if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 267                 if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) {
 268                     if (request) {
 269                         ompi_osc_rdma_request_deref (request);
 270                     }
 271 
 272                     if (alloc_reqs) {
 273                         OMPI_OSC_RDMA_REQUEST_RETURN(subreq);
 274                     }
 275 
 276                     /* something bad happened. need to figure out best way to handle rma errors */
 277                     return ret;
 278                 }
 279 
 280                 /* progress and try again */
 281                 ompi_osc_rdma_progress (module);
 282                 continue;
 283             }
 284             subreq = NULL;
 285 
 286             /* adjust io vectors */
 287             local_iovec[local_iov_index].iov_len -= rdma_len;
 288             remote_iovec[remote_iov_index].iov_len -= rdma_len;
 289             local_iovec[local_iov_index].iov_base = (void *)((intptr_t) local_iovec[local_iov_index].iov_base + rdma_len);
 290             remote_iovec[remote_iov_index].iov_base = (void *)((intptr_t) remote_iovec[remote_iov_index].iov_base + rdma_len);
 291 
 292             local_iov_index += (0 == local_iovec[local_iov_index].iov_len);
 293             remote_iov_index += (0 == remote_iovec[remote_iov_index].iov_len);
 294         }
 295     } while (!done);
 296 
 297     if (request) {
 298         /* release our reference so the request can complete */
 299         ompi_osc_rdma_request_deref (request);
 300     }
 301 
 302     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "finished scheduling rdma on non-contiguous datatype(s)");
 303 
 304     /* clean up convertors */
 305     opal_convertor_cleanup (&local_convertor);
 306     OBJ_DESTRUCT(&local_convertor);
 307     opal_convertor_cleanup (&remote_convertor);
 308     OBJ_DESTRUCT(&remote_convertor);
 309 
 310     return OMPI_SUCCESS;
 311 }
 312 
 313 static inline int ompi_osc_rdma_master (ompi_osc_rdma_sync_t *sync, void *local_address, int local_count,
 314                                         ompi_datatype_t *local_datatype, ompi_osc_rdma_peer_t *peer,
 315                                         uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
 316                                         int remote_count, ompi_datatype_t *remote_datatype,
 317                                         ompi_osc_rdma_request_t *request, const size_t max_rdma_len,
 318                                         const ompi_osc_rdma_fn_t rdma_fn, const bool alloc_reqs)
 319 {
 320     size_t rdma_len;
 321     ptrdiff_t lb, extent;
 322     int ret;
 323 
 324     rdma_len = local_datatype->super.size * local_count;
 325 
 326     /* fast path for contiguous rdma */
 327     if (OPAL_LIKELY(ompi_datatype_is_contiguous_memory_layout (local_datatype, local_count) &&
 328                     ompi_datatype_is_contiguous_memory_layout (remote_datatype, remote_count) &&
 329                     rdma_len <= max_rdma_len)) {
 330         if (NULL == request && alloc_reqs) {
 331             ompi_osc_rdma_module_t *module = sync->module;
 332             OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request);
 333             request->internal = true;
 334             request->type = OMPI_OSC_RDMA_TYPE_RDMA;
 335         }
 336 
 337         /* ignore failure here */
 338         (void) ompi_datatype_get_true_extent (local_datatype, &lb, &extent);
 339         local_address = (void *)((intptr_t) local_address + lb);
 340 
 341         (void) ompi_datatype_get_true_extent (remote_datatype, &lb, &extent);
 342         remote_address += lb;
 343 
 344         OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing rdma on contiguous region. local: %p, "
 345                          "remote: 0x%lx, length: %lu", local_address, (unsigned long) remote_address,
 346                          rdma_len);
 347 
 348         do {
 349             ret = rdma_fn (sync, peer, remote_address, remote_handle, local_address, rdma_len, request);
 350             if (OPAL_LIKELY(OPAL_SUCCESS == ret)) {
 351                 return OMPI_SUCCESS;
 352             }
 353 
 354             ompi_osc_rdma_progress (sync->module);
 355         } while (1);
 356     }
 357 
 358     return ompi_osc_rdma_master_noncontig (sync, local_address, local_count, local_datatype, peer, remote_address,
 359                                            remote_handle, remote_count, remote_datatype, request,
 360                                            max_rdma_len, rdma_fn, alloc_reqs);
 361 }
 362 
 363 static int ompi_osc_rdma_copy_local (const void *source, int source_count, ompi_datatype_t *source_datatype,
 364                                      void *target, int target_count, ompi_datatype_t *target_datatype,
 365                                      ompi_osc_rdma_request_t *request)
 366 {
 367     int ret;
 368 
 369     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing local copy from %p -> %p", source, target);
 370 
 371     opal_atomic_mb ();
 372     ret = ompi_datatype_sndrcv (source, source_count, source_datatype, target, target_count, target_datatype);
 373 
 374     if (request) {
 375         ompi_osc_rdma_request_complete (request, ret);
 376     }
 377 
 378     return ret;
 379 }
 380 
 381 static void ompi_osc_rdma_put_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
 382                                         void *local_address, mca_btl_base_registration_handle_t *local_handle,
 383                                         void *context, void *data, int status)
 384 {
 385     ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context;
 386 
 387     assert (OPAL_SUCCESS == status);
 388 
 389     /* the lowest bit is used as a flag indicating this put operation has a request */
 390     if ((intptr_t) context & 0x1) {
 391         ompi_osc_rdma_request_t *request = request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1);
 392         sync = request->sync;
 393 
 394         /* NTH -- TODO: better error handling */
 395         ompi_osc_rdma_request_complete (request, status);
 396     }
 397 
 398     OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "btl put complete on sync %p. local "
 399                      "address %p. opal status %d", (void *) sync, local_address, status);
 400 
 401     if (data) {
 402         ompi_osc_rdma_frag_complete ((ompi_osc_rdma_frag_t *) data);
 403     } else if (local_handle) {
 404         ompi_osc_rdma_deregister (sync->module, local_handle);
 405     }
 406 
 407     ompi_osc_rdma_sync_rdma_dec (sync);
 408 }
 409 
 410 static void ompi_osc_rdma_put_complete_flush (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
 411                                               void *local_address, mca_btl_base_registration_handle_t *local_handle,
 412                                               void *context, void *data, int status)
 413 {
 414     ompi_osc_rdma_module_t *module = (ompi_osc_rdma_module_t *) context;
 415 
 416     assert (OPAL_SUCCESS == status);
 417 
 418     /* the lowest bit is used as a flag indicating this put operation has a request */
 419     if ((intptr_t) context & 0x1) {
 420         ompi_osc_rdma_request_t *request = request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1);
 421         module = request->module;
 422 
 423         /* NTH -- TODO: better error handling */
 424         ompi_osc_rdma_request_complete (request, status);
 425     }
 426 
 427     OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "btl put complete on module %p. local "
 428                      "address %p. opal status %d", (void *) module, local_address, status);
 429 
 430     if (data) {
 431         ompi_osc_rdma_frag_complete ((ompi_osc_rdma_frag_t *) data);
 432     } else if (local_handle) {
 433         ompi_osc_rdma_deregister (module, local_handle);
 434     }
 435 }
 436 
 437 static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
 438                                    mca_btl_base_registration_handle_t *target_handle, void *ptr,
 439                                    mca_btl_base_registration_handle_t *local_handle, size_t size,
 440                                    mca_btl_base_rdma_completion_fn_t cb, void *context, void *cbdata) {
 441     ompi_osc_rdma_module_t *module = sync->module;
 442     int ret;
 443 
 444     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating btl put of %lu bytes to remote address %" PRIx64 ", sync "
 445                      "object %p...", (unsigned long) size, target_address, (void *) sync);
 446 
 447     /* flag outstanding rma requests */
 448     ompi_osc_rdma_sync_rdma_inc (sync);
 449 
 450     do {
 451         ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, ptr, target_address,
 452                                              local_handle, target_handle, size, 0, MCA_BTL_NO_ORDER,
 453                                              cb, context, cbdata);
 454         if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
 455             return OMPI_SUCCESS;
 456         }
 457 
 458         ++module->put_retry_count;
 459 
 460         if (!ompi_osc_rdma_oor (ret)) {
 461             break;
 462         }
 463 
 464         /* spin a bit on progress */
 465         ompi_osc_rdma_progress (module);
 466     } while (1);
 467 
 468     OSC_RDMA_VERBOSE(10, "btl put failed with opal error code %d", ret);
 469 
 470     return ret;
 471 }
 472 
 473 int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
 474                               mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size,
 475                               ompi_osc_rdma_request_t *request)
 476 {
 477     ompi_osc_rdma_module_t *module = sync->module;
 478     mca_btl_base_registration_handle_t *local_handle = NULL;
 479     mca_btl_base_rdma_completion_fn_t cbfunc = NULL;
 480     ompi_osc_rdma_frag_t *frag = NULL;
 481     char *ptr = source_buffer;
 482     void *cbcontext;
 483     int ret;
 484 
 485     if (module->selected_btl->btl_register_mem && size > module->selected_btl->btl_put_local_registration_threshold) {
 486         ret = ompi_osc_rdma_frag_alloc (module, size, &frag, &ptr);
 487         if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 488             ret = ompi_osc_rdma_register (module, peer->data_endpoint, source_buffer, size, 0, &local_handle);
 489             if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 490                 return ret;
 491             }
 492         } else {
 493             memcpy (ptr, source_buffer, size);
 494             local_handle = frag->handle;
 495         }
 496     }
 497 
 498     if (ompi_osc_rdma_use_btl_flush (module)) {
 499         /* NTH: when using the btl_flush function there is no guarantee that the callback will happen
 500          * before the flush is complete. because of this there is a chance that the sync object will be
 501          * released before there is a callback. to handle this case we call different callback that doesn't
 502          * use the sync object. its possible the btl sematics will change in the future and the callback
 503          * will happen *before* flush is considered complete. if that is the case this workaround can be
 504          * removed */
 505         cbcontext = (void *) module;
 506         if (request || local_handle || frag) {
 507             cbfunc = ompi_osc_rdma_put_complete_flush;
 508         }
 509         /* else the callback function is a no-op so do not bother specifying one */
 510     } else {
 511         cbcontext = (void *) sync;
 512         cbfunc = ompi_osc_rdma_put_complete;
 513     }
 514 
 515     /* increment the outstanding request counter in the request object */
 516     if (request) {
 517         (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, 1);
 518         cbcontext = (void *) ((intptr_t) request | 1);
 519         request->sync = sync;
 520     }
 521 
 522     ret = ompi_osc_rdma_put_real (sync, peer, target_address, target_handle, ptr, local_handle, size, cbfunc,
 523                                   cbcontext, frag);
 524     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 525         ompi_osc_rdma_cleanup_rdma (sync, false, frag, local_handle, request);
 526     }
 527 
 528     return ret;
 529 }
 530 
 531 static void ompi_osc_rdma_get_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
 532                                         void *local_address, mca_btl_base_registration_handle_t *local_handle,
 533                                         void *context, void *data, int status)
 534 {
 535     ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context;
 536     intptr_t source = (intptr_t) local_address + request->offset;
 537     ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data;
 538     ompi_osc_rdma_sync_t *sync = request->sync;
 539     void *origin_addr = request->origin_addr;
 540 
 541     OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "btl get complete on sync %p. local "
 542                      "address %p. origin %p. opal status %d", (void *) sync, local_address, origin_addr, status);
 543 
 544     assert (OPAL_SUCCESS == status);
 545 
 546     if (request->buffer || frag) {
 547         if (OPAL_LIKELY(OMPI_SUCCESS == status)) {
 548             memcpy (origin_addr, (void *) source, request->len);
 549         }
 550     }
 551 
 552     if (NULL == request->buffer) {
 553         /* completion detection can handle this case without the counter when using btl_flush */
 554         ompi_osc_rdma_sync_rdma_dec (sync);
 555     } else {
 556         /* the counter was needed to keep track of the number of outstanding operations */
 557         ompi_osc_rdma_sync_rdma_dec_always (sync);
 558     }
 559 
 560     if (NULL != frag) {
 561         ompi_osc_rdma_frag_complete (frag);
 562     } else {
 563         ompi_osc_rdma_deregister (sync->module, local_handle);
 564     }
 565 
 566     ompi_osc_rdma_request_complete (request, status);
 567 }
 568 
 569 static int ompi_osc_rdma_get_partial (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
 570                                       mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
 571                                       ompi_osc_rdma_request_t *request) {
 572     ompi_osc_rdma_module_t *module = sync->module;
 573     ompi_osc_rdma_request_t *subreq;
 574     int ret;
 575 
 576     OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
 577     subreq->internal = true;
 578     subreq->type = OMPI_OSC_RDMA_TYPE_RDMA;
 579     subreq->parent_request = request;
 580     (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, 1);
 581 
 582     ret = ompi_osc_rdma_get_contig (sync, peer, source_address, source_handle, target_buffer, size, subreq);
 583     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 584         OMPI_OSC_RDMA_REQUEST_RETURN(subreq);
 585         ompi_osc_rdma_request_deref (request);
 586     }
 587 
 588     return ret;
 589 }
 590 
 591 static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
 592                                      mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
 593                                      ompi_osc_rdma_request_t *request)
 594 {
 595     ompi_osc_rdma_module_t *module = sync->module;
 596     const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
 597     mca_btl_base_registration_handle_t *local_handle = NULL;
 598     ompi_osc_rdma_frag_t *frag = NULL;
 599     osc_rdma_size_t aligned_len;
 600     osc_rdma_base_t aligned_source_base, aligned_source_bound;
 601     char *ptr = target_buffer;
 602     bool counter_needs_inc = false;
 603     int ret;
 604 
 605     aligned_source_base = source_address & ~btl_alignment_mask;
 606     aligned_source_bound = (source_address + size + btl_alignment_mask) & ~btl_alignment_mask;
 607     aligned_len = aligned_source_bound - aligned_source_base;
 608 
 609     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating get of %lu bytes from remote ptr %" PRIx64 " to local ptr %p",
 610                      size, source_address, target_buffer);
 611 
 612     if ((module->selected_btl->btl_register_mem && size > module->selected_btl->btl_get_local_registration_threshold) ||
 613         (((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
 614 
 615         ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
 616         if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 617             if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) {
 618                 /* region is too large for a buffered read */
 619                 size_t subsize;
 620 
 621                 if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) {
 622                     /* remote region has the same alignment but the base is not aligned. perform a small
 623                      * buffered get of the beginning of the remote region */
 624                     aligned_source_base = OPAL_ALIGN(source_address, module->selected_btl->btl_get_alignment, osc_rdma_base_t);
 625                     subsize = (size_t) (aligned_source_base - source_address);
 626 
 627                     ret = ompi_osc_rdma_get_partial (sync, peer, source_address, source_handle, target_buffer, subsize, request);
 628                     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 629                         return ret;
 630                     }
 631 
 632                     source_address += subsize;
 633                     target_buffer = (void *) ((intptr_t) target_buffer + subsize);
 634                     size -= subsize;
 635 
 636                     aligned_len = aligned_source_bound - aligned_source_base;
 637                 }
 638 
 639                 if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) &&
 640                     (size & btl_alignment_mask)) {
 641                     /* remote region bases are aligned but the bounds are not. perform a
 642                      * small buffered get of the end of the remote region */
 643                     aligned_len = size & ~btl_alignment_mask;
 644                     subsize = size - aligned_len;
 645                     size = aligned_len;
 646                     ret = ompi_osc_rdma_get_partial (sync, peer, source_address + aligned_len, source_handle,
 647                                                      (void *) ((intptr_t) target_buffer + aligned_len), subsize, request);
 648                     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 649                         return ret;
 650                     }
 651                 }
 652                 /* (remaining) user request is now correctly aligned */
 653             }
 654 
 655             if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
 656                 /* local and remote alignments differ */
 657                 request->buffer = ptr = malloc (aligned_len);
 658             } else {
 659                 ptr = target_buffer;
 660             }
 661 
 662             if (NULL != ptr) {
 663                 (void) ompi_osc_rdma_register (module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE,
 664                                                &local_handle);
 665             }
 666 
 667             if (OPAL_UNLIKELY(NULL == local_handle)) {
 668                 free (request->buffer);
 669                 request->buffer = NULL;
 670                 return ret;
 671             }
 672         } else {
 673             OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx",
 674                              (void*)ptr, (void *) frag, (unsigned long) aligned_len, (unsigned long) aligned_source_base);
 675             local_handle = frag->handle;
 676         }
 677     }
 678 
 679     request->offset = source_address - aligned_source_base;
 680     request->len = size;
 681     request->origin_addr = target_buffer;
 682     request->sync = sync;
 683 
 684     if (request->buffer) {
 685         /* always increment the outstanding RDMA counter as the btl_flush function does not guarantee callback completion,
 686          * just operation completion. */
 687         counter_needs_inc = true;
 688         ompi_osc_rdma_sync_rdma_inc_always (sync);
 689     } else {
 690         /* if this operation is being buffered with a frag then ompi_osc_rdma_sync_rdma_complete() can use the number
 691          * of pending operations on the rdma_frag as an indicator as to whether the operation is complete. this can
 692          * only be done since there is only one rdma frag per module. if that changes this logic will need to be changed
 693          * as well. this path also covers the case where the get operation is not buffered. */
 694         ompi_osc_rdma_sync_rdma_inc (sync);
 695     }
 696 
 697     do {
 698         ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr,
 699                                              aligned_source_base, local_handle, source_handle,
 700                                              aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete,
 701                                              request, frag);
 702         if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
 703             return OMPI_SUCCESS;
 704         }
 705 
 706         ++module->get_retry_count;
 707 
 708         if (!ompi_osc_rdma_oor (ret)) {
 709             break;
 710         }
 711 
 712         /* spin a bit on progress */
 713         for (int i = 0 ; i < 10 ; ++i) {
 714             ompi_osc_rdma_progress (module);
 715         }
 716     } while (1);
 717 
 718     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "btl get failed with opal error code %d", ret);
 719 
 720     ompi_osc_rdma_cleanup_rdma (sync, counter_needs_inc, frag, local_handle, request);
 721 
 722     return ret;
 723 }
 724 
 725 static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count,
 726                                            ompi_datatype_t *origin_datatype, ompi_osc_rdma_peer_t *peer,
 727                                            ptrdiff_t target_disp, int target_count,
 728                                            ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request)
 729 {
 730     ompi_osc_rdma_module_t *module = sync->module;
 731     mca_btl_base_registration_handle_t *target_handle;
 732     uint64_t target_address;
 733     int ret;
 734 
 735     /* short-circuit case */
 736     if (0 == origin_count || 0 == target_count) {
 737         if (request) {
 738             ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
 739         }
 740 
 741         return OMPI_SUCCESS;
 742     }
 743 
 744     ptrdiff_t len, offset;
 745     // a buffer defined by (buf, count, dt)
 746     // will have data starting at buf+offset and ending len bytes later:
 747     len = opal_datatype_span(&target_datatype->super, target_count, &offset);
 748 
 749     // the below function wants arg4 to be the number of bytes after
 750     // source_disp that the data ends, which is offset+len
 751     ret = osc_rdma_get_remote_segment (module, peer, target_disp, offset+len,
 752                                        &target_address, &target_handle);
 753     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 754         return ret;
 755     }
 756 
 757     /* optimize communication with peers that we can do direct load and store operations on */
 758     if (ompi_osc_rdma_peer_local_base (peer)) {
 759         return ompi_osc_rdma_copy_local (origin_addr, origin_count, origin_datatype, (void *) (intptr_t) target_address,
 760                                          target_count, target_datatype, request);
 761     }
 762 
 763     return ompi_osc_rdma_master (sync, (void *) origin_addr, origin_count, origin_datatype, peer, target_address, target_handle,
 764                                  target_count, target_datatype, request, module->selected_btl->btl_put_limit,
 765                                  ompi_osc_rdma_put_contig, false);
 766 }
 767 
 768 static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype,
 769                                            ompi_osc_rdma_peer_t *peer, ptrdiff_t source_disp, int source_count,
 770                                            ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request)
 771 {
 772     ompi_osc_rdma_module_t *module = sync->module;
 773     mca_btl_base_registration_handle_t *source_handle;
 774     uint64_t source_address;
 775     ptrdiff_t source_span, source_lb;
 776     int ret;
 777 
 778     /* short-circuit case */
 779     if (0 == origin_count || 0 == source_count) {
 780         if (request) {
 781             ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
 782         }
 783 
 784         return OMPI_SUCCESS;
 785     }
 786 
 787     // a buffer defined by (buf, count, dt)
 788     // will have data starting at buf+offset and ending len bytes later:
 789     source_span = opal_datatype_span(&source_datatype->super, source_count, &source_lb);
 790 
 791     ret = osc_rdma_get_remote_segment (module, peer, source_disp, source_span+source_lb,
 792                                        &source_address, &source_handle);
 793     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 794         return ret;
 795     }
 796 
 797     /* optimize self/local communication */
 798     if (ompi_osc_rdma_peer_local_base (peer)) {
 799         return ompi_osc_rdma_copy_local ((void *) (intptr_t) source_address, source_count, source_datatype,
 800                                          origin_addr, origin_count, origin_datatype, request);
 801     }
 802 
 803     return ompi_osc_rdma_master (sync, origin_addr, origin_count, origin_datatype, peer, source_address,
 804                                  source_handle, source_count, source_datatype, request,
 805                                  module->selected_btl->btl_get_limit, ompi_osc_rdma_get_contig, true);
 806 }
 807 int ompi_osc_rdma_put (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype,
 808                        int target_rank, ptrdiff_t target_disp, int target_count,
 809                        ompi_datatype_t *target_datatype, ompi_win_t *win)
 810 {
 811     ompi_osc_rdma_module_t *module = GET_MODULE(win);
 812     ompi_osc_rdma_peer_t *peer;
 813     ompi_osc_rdma_sync_t *sync;
 814 
 815     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "put: 0x%lx, %d, %s, %d, %d, %d, %s, %s", (unsigned long) origin_addr,
 816                      origin_count, origin_datatype->name, target_rank, (int) target_disp, target_count,
 817                      target_datatype->name, win->w_name);
 818 
 819     sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
 820     if (OPAL_UNLIKELY(NULL == sync)) {
 821         return OMPI_ERR_RMA_SYNC;
 822     }
 823 
 824     return ompi_osc_rdma_put_w_req (sync, origin_addr, origin_count, origin_datatype, peer, target_disp,
 825                                     target_count, target_datatype, NULL);
 826 }
 827 
 828 int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype,
 829                         int target_rank, ptrdiff_t target_disp, int target_count,
 830                         ompi_datatype_t *target_datatype, ompi_win_t *win,
 831                         ompi_request_t **request)
 832 {
 833     ompi_osc_rdma_module_t *module = GET_MODULE(win);
 834     ompi_osc_rdma_peer_t *peer;
 835     ompi_osc_rdma_request_t *rdma_request;
 836     ompi_osc_rdma_sync_t *sync;
 837     int ret;
 838 
 839     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "rput: 0x%lx, %d, %s, %d, %d, %d, %s, %s", (unsigned long) origin_addr, origin_count,
 840                      origin_datatype->name, target_rank, (int) target_disp, target_count, target_datatype->name, win->w_name);
 841 
 842     sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
 843     if (OPAL_UNLIKELY(NULL == sync)) {
 844         return OMPI_ERR_RMA_SYNC;
 845     }
 846 
 847     OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, rdma_request);
 848 
 849     rdma_request->type = OMPI_OSC_RDMA_TYPE_PUT;
 850 
 851     ret = ompi_osc_rdma_put_w_req (sync, origin_addr, origin_count, origin_datatype, peer, target_disp,
 852                                    target_count, target_datatype, rdma_request);
 853     if (OMPI_SUCCESS != ret) {
 854         OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request);
 855         return ret;
 856     }
 857 
 858     *request = (ompi_request_t *) rdma_request;
 859 
 860     return OMPI_SUCCESS;
 861 }
 862 
 863 int ompi_osc_rdma_get (void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype,
 864                        int source_rank, ptrdiff_t source_disp, int source_count,
 865                        ompi_datatype_t *source_datatype, ompi_win_t *win)
 866 {
 867     ompi_osc_rdma_module_t *module = GET_MODULE(win);
 868     ompi_osc_rdma_peer_t *peer;
 869     ompi_osc_rdma_sync_t *sync;
 870 
 871     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "get: 0x%lx, %d, %s, %d, %d, %d, %s, %s", (unsigned long) origin_addr,
 872                      origin_count, origin_datatype->name, source_rank, (int) source_disp, source_count,
 873                      source_datatype->name, win->w_name);
 874 
 875     sync = ompi_osc_rdma_module_sync_lookup (module, source_rank, &peer);
 876     if (OPAL_UNLIKELY(NULL == sync)) {
 877         return OMPI_ERR_RMA_SYNC;
 878     }
 879 
 880     return ompi_osc_rdma_get_w_req (sync, origin_addr, origin_count, origin_datatype, peer,
 881                                     source_disp, source_count, source_datatype, NULL);
 882 }
 883 
 884 int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype,
 885                         int source_rank, ptrdiff_t source_disp, int source_count,
 886                         ompi_datatype_t *source_datatype, ompi_win_t *win,
 887                         ompi_request_t **request)
 888 {
 889     ompi_osc_rdma_module_t *module = GET_MODULE(win);
 890     ompi_osc_rdma_peer_t *peer;
 891     ompi_osc_rdma_request_t *rdma_request;
 892     ompi_osc_rdma_sync_t *sync;
 893     int ret;
 894 
 895     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "rget: 0x%lx, %d, %s, %d, %d, %d, %s, %s", (unsigned long) origin_addr,
 896                      origin_count, origin_datatype->name, source_rank, (int) source_disp, source_count,
 897                      source_datatype->name, win->w_name);
 898 
 899     sync = ompi_osc_rdma_module_sync_lookup (module, source_rank, &peer);
 900     if (OPAL_UNLIKELY(NULL == sync)) {
 901         return OMPI_ERR_RMA_SYNC;
 902     }
 903 
 904     OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, rdma_request);
 905 
 906     rdma_request->type = OMPI_OSC_RDMA_TYPE_GET;
 907     ret = ompi_osc_rdma_get_w_req (sync, origin_addr, origin_count, origin_datatype, peer,
 908                                    source_disp, source_count, source_datatype, rdma_request);
 909     if (OMPI_SUCCESS != ret) {
 910         OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request);
 911         return ret;
 912     }
 913 
 914     *request = (ompi_request_t *) rdma_request;
 915 
 916     return OMPI_SUCCESS;
 917 }

/* [<][>][^][v][top][bottom][index][help] */