root/ompi/mca/pml/ob1/pml_ob1_cuda.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mca_pml_ob1_send_request_start_cuda
  2. mca_pml_ob1_rdma_cuda_btls
  3. mca_pml_ob1_cuda_need_buffers
  4. mca_pml_ob1_cuda_add_ipc_support

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2008 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
  14  * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
  15  * Copyright (c) 2012-2015 NVIDIA Corporation.  All rights reserved.
  16  * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights
  17  *                         reserved.
  18  * $COPYRIGHT$
  19  *
  20  * Additional copyrights may follow
  21  *
  22  * $HEADER$
  23  */
  24 
  25 
  26 #include "ompi_config.h"
  27 #include "opal/prefetch.h"
  28 #include "opal/runtime/opal_params.h"
  29 #include "opal/mca/btl/btl.h"
  30 #include "opal/mca/mpool/mpool.h"
  31 #include "ompi/constants.h"
  32 #include "ompi/mca/pml/pml.h"
  33 #include "pml_ob1.h"
  34 #include "pml_ob1_hdr.h"
  35 #include "pml_ob1_rdmafrag.h"
  36 #include "pml_ob1_recvreq.h"
  37 #include "pml_ob1_sendreq.h"
  38 #include "ompi/mca/bml/base/base.h"
  39 #include "ompi/memchecker.h"
  40 
  41 size_t mca_pml_ob1_rdma_cuda_btls(
  42     mca_bml_base_endpoint_t* bml_endpoint,
  43     unsigned char* base,
  44     size_t size,
  45     mca_pml_ob1_com_btl_t* rdma_btls);
  46 
  47 int mca_pml_ob1_cuda_need_buffers(void * rreq,
  48                                   mca_btl_base_module_t* btl);
  49 
  50 void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t flags,
  51                                       ompi_proc_t* errproc, char* btlinfo);
  52 
  53 /**
  54  * Handle the CUDA buffer.
  55  */
  56 int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
  57                                         mca_bml_base_btl_t* bml_btl,
  58                                         size_t size) {
  59     int rc;
  60 #if OPAL_CUDA_GDR_SUPPORT
  61     /* With some BTLs, switch to RNDV from RGET at large messages */
  62     if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
  63         (sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) {
  64         return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
  65     }
  66 #endif /* OPAL_CUDA_GDR_SUPPORT */
  67 
  68     sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
  69     if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
  70         unsigned char *base;
  71         opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
  72         /* Set flag back */
  73         sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
  74         if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls(
  75                                                                            sendreq->req_endpoint,
  76                                                                            base,
  77                                                                            sendreq->req_send.req_bytes_packed,
  78                                                                            sendreq->req_rdma))) {
  79             rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
  80                                                      sendreq->req_send.req_bytes_packed);
  81             if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
  82                 mca_pml_ob1_free_rdma_resources(sendreq);
  83             }
  84         } else {
  85             if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_PUT) {
  86                 rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size,
  87                                                          MCA_PML_OB1_HDR_FLAGS_CONTIG);
  88             } else {
  89                 rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
  90             }
  91         }
  92     } else {
  93         /* Do not send anything with first rendezvous message as copying GPU
  94          * memory into RNDV message is expensive. */
  95         sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
  96         rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
  97     }
  98     return rc;
  99 }
 100 
 101 
 102 
 103 size_t mca_pml_ob1_rdma_cuda_btls(
 104     mca_bml_base_endpoint_t* bml_endpoint,
 105     unsigned char* base,
 106     size_t size,
 107     mca_pml_ob1_com_btl_t* rdma_btls)
 108 {
 109     int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
 110     double weight_total = 0;
 111     int num_btls_used = 0, n;
 112 
 113     /* shortcut when there are no rdma capable btls */
 114     if(num_btls == 0) {
 115         return 0;
 116     }
 117 
 118     /* check to see if memory is registered */
 119     for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
 120             n++) {
 121         mca_bml_base_btl_t* bml_btl =
 122             mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);
 123 
 124         if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
 125             mca_btl_base_registration_handle_t *handle = NULL;
 126 
 127             if( NULL != bml_btl->btl->btl_register_mem ) {
 128                 /* register the memory */
 129                 handle = bml_btl->btl->btl_register_mem (bml_btl->btl, bml_btl->btl_endpoint,
 130                                                          base, size, MCA_BTL_REG_FLAG_CUDA_GPU_MEM |
 131                                                          MCA_BTL_REG_FLAG_REMOTE_READ);
 132             }
 133 
 134             if(NULL == handle)
 135                 continue;
 136 
 137             rdma_btls[num_btls_used].bml_btl = bml_btl;
 138             rdma_btls[num_btls_used].btl_reg = handle;
 139             weight_total += bml_btl->btl_weight;
 140             num_btls_used++;
 141         }
 142     }
 143 
 144     /* if we don't use leave_pinned and all BTLs that already have this memory
 145      * registered amount to less then half of available bandwidth - fall back to
 146      * pipeline protocol */
 147     if(0 == num_btls_used || (!opal_leave_pinned && weight_total < 0.5))
 148         return 0;
 149 
 150     mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
 151                                      weight_total);
 152 
 153     return num_btls_used;
 154 }
 155 
 156 int mca_pml_ob1_cuda_need_buffers(void * rreq,
 157                                   mca_btl_base_module_t* btl)
 158 {
 159     mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)rreq;
 160     mca_bml_base_endpoint_t* bml_endpoint = mca_bml_base_get_endpoint (recvreq->req_recv.req_base.req_proc);
 161     mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_send, btl);
 162 
 163     /* A btl could be in the rdma list but not in the send list so check there also */
 164     if (NULL == bml_btl) {
 165         bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
 166     }
 167     /* We should always be able to find back the bml_btl based on the btl */
 168     assert(NULL != bml_btl);
 169 
 170     if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
 171         (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) {
 172         recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
 173         if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) {
 174             recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA;
 175             return true;
 176         } else {
 177             recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA;
 178             return false;
 179         }
 180     }
 181     return true;
 182 }
 183 
 184 /*
 185  * This function enables us to start using RDMA get protocol with GPU buffers.
 186  * We do this by adjusting the flags in the BML structure.  This is not the
 187  * best thing, but this may go away if CUDA IPC is supported everywhere in the
 188  * future. */
 189 void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t flags,
 190                                       ompi_proc_t* errproc, char* btlinfo)
 191 {
 192     mca_bml_base_endpoint_t* ep;
 193     int btl_verbose_stream = 0;
 194     int i;
 195 
 196     assert(NULL != errproc);
 197     assert(NULL != errproc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]);
 198     if (NULL != btlinfo) {
 199         btl_verbose_stream = *(int *)btlinfo;
 200     }
 201     ep = (mca_bml_base_endpoint_t*)errproc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
 202 
 203     /* Find the corresponding bml and adjust the flag to support CUDA get */
 204     for( i = 0; i < (int)ep->btl_send.arr_size; i++ ) {
 205         if( ep->btl_send.bml_btls[i].btl == btl ) {
 206             ep->btl_send.bml_btls[i].btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
 207             opal_output_verbose(5, btl_verbose_stream,
 208                         "BTL %s: rank=%d enabling CUDA IPC "
 209                         "to rank=%d on node=%s \n",
 210                         btl->btl_component->btl_version.mca_component_name,
 211                         OMPI_PROC_MY_NAME->vpid,
 212                         ((ompi_process_name_t*)&errproc->super.proc_name)->vpid,
 213                         errproc->super.proc_hostname);
 214         }
 215     }
 216 }

/* [<][>][^][v][top][bottom][index][help] */