opal/mca/common/cuda/common

/* [<][>][^][v][top][bottom][index][help] */
This source file includes following definitions.
mca_common_cuda_register_mca_variables
mca_common_cuda_stage_one_init
mca_common_cuda_stage_two_init
mca_common_cuda_stage_three_init
mca_common_cuda_fini
mca_common_cuda_register
mca_common_cuda_unregister
cuda_getmemhandle
cuda_ungetmemhandle
cuda_openmemhandle
cuda_closememhandle
mca_common_cuda_construct_event_and_handle
mca_common_cuda_destruct_event
mca_common_wait_stream_synchronize
mca_common_cuda_memcpy
mca_common_cuda_record_dtoh_event
mca_common_cuda_record_htod_event
mca_common_cuda_get_dtoh_stream
mca_common_cuda_get_htod_stream
progress_one_cuda_ipc_event
progress_one_cuda_dtoh_event
progress_one_cuda_htod_event
mca_common_cuda_memhandle_matches
cuda_dump_memhandle
cuda_dump_evthandle
mydifftime
mca_common_cuda_is_gpu_buffer
mca_common_cuda_cu_memcpy_async
mca_common_cuda_cu_memcpy
mca_common_cuda_memmove
mca_common_cuda_get_device
mca_common_cuda_device_can_access_peer
mca_common_cuda_get_address_range
mca_common_cuda_previously_freed_memory
mca_common_cuda_get_buffer_id
   1 /*
   2  * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2014 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2006 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2011-2015 NVIDIA Corporation.  All rights reserved.
  13  * Copyright (c) 2015      Cisco Systems, Inc.  All rights reserved.
  14  * Copyright (c) 2015      Research Organization for Information Science
  15  *                         and Technology (RIST). All rights reserved.
  16  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
  17  * $COPYRIGHT$
  18  *
  19  * Additional copyrights may follow
  20  *
  21  * $HEADER$
  22  */
  23 
  24 /**
  25  * This file contains various support functions for doing CUDA
  26  * operations.
  27  */
  28 #include "opal_config.h"
  29 
  30 #include <errno.h>
  31 #include <unistd.h>
  32 #include <cuda.h>
  33 
  34 #include "opal/align.h"
  35 #include "opal/datatype/opal_convertor.h"
  36 #include "opal/datatype/opal_datatype_cuda.h"
  37 #include "opal/util/output.h"
  38 #include "opal/util/show_help.h"
  39 #include "opal/util/proc.h"
  40 #include "opal/util/argv.h"
  41 #include "opal/util/printf.h"
  42 
  43 #include "opal/mca/rcache/base/base.h"
  44 #include "opal/runtime/opal_params.h"
  45 #include "opal/mca/timer/base/base.h"
  46 #include "opal/mca/dl/base/base.h"
  47 
  48 #include "common_cuda.h"
  49 
  50 /**
  51  * Since function names can get redefined in cuda.h file, we need to do this
  52  * stringifying to get the latest function name from the header file.  For
  53  * example, cuda.h may have something like this:
  54  * #define cuMemFree cuMemFree_v2
  55  * We want to make sure we find cuMemFree_v2, not cuMemFree.
  56  */
  57 #define STRINGIFY2(x) #x
  58 #define STRINGIFY(x) STRINGIFY2(x)
  59 
  60 #define OPAL_CUDA_DLSYM(libhandle, funcName)                                         \
  61 do {                                                                                 \
  62  char *err_msg;                                                                      \
  63  void *ptr;                                                                          \
  64  if (OPAL_SUCCESS !=                                                                 \
  65      opal_dl_lookup(libhandle, STRINGIFY(funcName), &ptr, &err_msg)) {               \
  66         opal_show_help("help-mpi-common-cuda.txt", "dlsym failed", true,             \
  67                        STRINGIFY(funcName), err_msg);                                \
  68         return 1;                                                                    \
  69     } else {                                                                         \
  70         *(void **)(&cuFunc.funcName) = ptr;                                          \
  71         opal_output_verbose(15, mca_common_cuda_output,                              \
  72                             "CUDA: successful dlsym of %s",                          \
  73                             STRINGIFY(funcName));                                    \
  74     }                                                                                \
  75 } while (0)
  76 
  77 /* Structure to hold CUDA function pointers that get dynamically loaded. */
  78 struct cudaFunctionTable {
  79     int (*cuPointerGetAttribute)(void *, CUpointer_attribute, CUdeviceptr);
  80     int (*cuMemcpyAsync)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
  81     int (*cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t);
  82     int (*cuMemAlloc)(CUdeviceptr *, unsigned int);
  83     int (*cuMemFree)(CUdeviceptr buf);
  84     int (*cuCtxGetCurrent)(void *cuContext);
  85     int (*cuStreamCreate)(CUstream *, int);
  86     int (*cuEventCreate)(CUevent *, int);
  87     int (*cuEventRecord)(CUevent, CUstream);
  88     int (*cuMemHostRegister)(void *, size_t, unsigned int);
  89     int (*cuMemHostUnregister)(void *);
  90     int (*cuEventQuery)(CUevent);
  91     int (*cuEventDestroy)(CUevent);
  92     int (*cuStreamWaitEvent)(CUstream, CUevent, unsigned int);
  93     int (*cuMemGetAddressRange)(CUdeviceptr*, size_t*, CUdeviceptr);
  94     int (*cuIpcGetEventHandle)(CUipcEventHandle*, CUevent);
  95     int (*cuIpcOpenEventHandle)(CUevent*, CUipcEventHandle);
  96     int (*cuIpcOpenMemHandle)(CUdeviceptr*, CUipcMemHandle, unsigned int);
  97     int (*cuIpcCloseMemHandle)(CUdeviceptr);
  98     int (*cuIpcGetMemHandle)(CUipcMemHandle*, CUdeviceptr);
  99     int (*cuCtxGetDevice)(CUdevice *);
 100     int (*cuDeviceCanAccessPeer)(int *, CUdevice, CUdevice);
 101     int (*cuDeviceGet)(CUdevice *, int);
 102 #if OPAL_CUDA_GDR_SUPPORT
 103     int (*cuPointerSetAttribute)(const void *, CUpointer_attribute, CUdeviceptr);
 104 #endif /* OPAL_CUDA_GDR_SUPPORT */
 105     int (*cuCtxSetCurrent)(CUcontext);
 106     int (*cuEventSynchronize)(CUevent);
 107     int (*cuStreamSynchronize)(CUstream);
 108     int (*cuStreamDestroy)(CUstream);
 109 #if OPAL_CUDA_GET_ATTRIBUTES
 110     int (*cuPointerGetAttributes)(unsigned int, CUpointer_attribute *, void **, CUdeviceptr);
 111 #endif /* OPAL_CUDA_GET_ATTRIBUTES */
 112 };
 113 typedef struct cudaFunctionTable cudaFunctionTable_t;
 114 static cudaFunctionTable_t cuFunc;
 115 
/* Number of calls made so far to mca_common_cuda_stage_one_init(); only the
 * call that raises it to 1 performs the real initialization. */
static int stage_one_init_ref_count = 0;
/* True once stage three init has been attempted; checked under
 * common_cuda_init_lock so a second thread does not repeat the work. */
static bool stage_three_init_complete = false;
/* When stage three is marked complete, tells a late caller whether that
 * earlier attempt succeeded (true) or failed (false). */
static bool common_cuda_initialized = false;
/* Set to true by mca_common_cuda_register_mca_variables(). */
static bool common_cuda_mca_parames_registered = false;
/* Storage for the "verbose" MCA variable; applied to mca_common_cuda_output. */
static int mca_common_cuda_verbose;
/* opal_output stream id; opened in stage one init. */
static int mca_common_cuda_output = 0;
/* Set in stage three init: true only when cuCtxGetCurrent() succeeds and
 * returns a non-NULL context. */
bool mca_common_cuda_enabled = false;
/* Storage for the "register_memory" MCA variable; forced to false in stage
 * three init when no usable CUDA context is found. */
static bool mca_common_cuda_register_memory = true;
/* Storage for the "warning" MCA variable (reset to true at registration). */
static bool mca_common_cuda_warning = false;
/* List constructed in stage one init.
 * NOTE(review): presumably holds common_cuda_mem_regs_t items - confirm. */
static opal_list_t common_cuda_memory_registrations;
/* NOTE(review): the four streams below are created and used outside the
 * code visible here; their roles are inferred from their names only. */
static CUstream ipcStream = NULL;
static CUstream dtohStream = NULL;
static CUstream htodStream = NULL;
static CUstream memcpyStream = NULL;
/* Storage for the "gpu_mem_check_workaround" MCA variable; defaults to 1
 * when built against CUDA_VERSION <= 7000 and to 0 otherwise. */
static int mca_common_cuda_gpu_mem_check_workaround = (CUDA_VERSION > 7000) ? 0 : 1;
/* Locks constructed in stage one init.  common_cuda_init_lock is held for
 * the duration of stage three init.
 * NOTE(review): the htod/dtoh/ipc locks are used outside this excerpt. */
static opal_mutex_t common_cuda_init_lock;
static opal_mutex_t common_cuda_htod_lock;
static opal_mutex_t common_cuda_dtoh_lock;
static opal_mutex_t common_cuda_ipc_lock;
 135 
 136 /* Functions called by opal layer - plugged into opal function table */
 137 static int mca_common_cuda_is_gpu_buffer(const void*, opal_convertor_t*);
 138 static int mca_common_cuda_memmove(void*, void*, size_t);
 139 static int mca_common_cuda_cu_memcpy_async(void*, const void*, size_t, opal_convertor_t*);
 140 static int mca_common_cuda_cu_memcpy(void*, const void*, size_t);
 141 
 142 /* Function that gets plugged into opal layer */
 143 static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *);
 144 
/* Structure to hold memory registrations that are delayed until the first
 * call to send or receive a GPU pointer.
 *
 * NOTE(review): the fields are not used in the part of the file visible
 * here; the per-field notes below are inferred from names and types and
 * should be confirmed against the register/unregister code. */
struct common_cuda_mem_regs_t {
    opal_list_item_t super;  /* list linkage; must stay the first member */
    void *ptr;               /* presumably the start of the region to register */
    size_t amount;           /* presumably the size of that region in bytes */
    char *msg;               /* presumably a caller-supplied description string */
};
typedef struct common_cuda_mem_regs_t common_cuda_mem_regs_t;
OBJ_CLASS_DECLARATION(common_cuda_mem_regs_t);
/* Class derived from opal_list_item_t; the two NULL arguments are the
 * constructor and destructor slots (presumably meaning "none" - confirm). */
OBJ_CLASS_INSTANCE(common_cuda_mem_regs_t,
                   opal_list_item_t,
                   NULL,
                   NULL);
 159 
 160 static int mca_common_cuda_async = 1;
 161 static int mca_common_cuda_cumemcpy_async;
 162 #if OPAL_ENABLE_DEBUG
 163 static int mca_common_cuda_cumemcpy_timing;
 164 #endif /* OPAL_ENABLE_DEBUG */
 165 
 166 /* Array of CUDA events to be queried for IPC stream, sending side and
 167  * receiving side. */
 168 CUevent *cuda_event_ipc_array = NULL;
 169 CUevent *cuda_event_dtoh_array = NULL;
 170 CUevent *cuda_event_htod_array = NULL;
 171 
 172 /* Array of fragments currently being moved by cuda async non-blocking
 173  * operations */
 174 struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array = NULL;
 175 struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array = NULL;
 176 struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array = NULL;
 177 
 178 /* First free/available location in cuda_event_status_array */
 179 static int cuda_event_ipc_first_avail, cuda_event_dtoh_first_avail, cuda_event_htod_first_avail;
 180 
 181 /* First currently-being used location in the cuda_event_status_array */
 182 static int cuda_event_ipc_first_used, cuda_event_dtoh_first_used, cuda_event_htod_first_used;
 183 
 184 /* Number of status items currently in use */
 185 static int cuda_event_ipc_num_used, cuda_event_dtoh_num_used, cuda_event_htod_num_used;
 186 
 187 /* Size of array holding events */
 188 int cuda_event_max = 400;
 189 static int cuda_event_ipc_most = 0;
 190 static int cuda_event_dtoh_most = 0;
 191 static int cuda_event_htod_most = 0;
 192 
 193 /* Handle to libcuda.so */
 194 opal_dl_handle_t *libcuda_handle = NULL;
 195 
 196 /* Unused variable that we register at init time and unregister at fini time.
 197  * This is used to detect if user has done a device reset prior to MPI_Finalize.
 198  * This is a workaround to avoid SEGVs.
 199  */
 200 static int checkmem;
 201 static int ctx_ok = 1;
 202 
 203 #define CUDA_COMMON_TIMING 0
 204 #if OPAL_ENABLE_DEBUG
 205 /* Some timing support structures.  Enable this to help analyze
 206  * internal performance issues. */
 207 static opal_timer_t ts_start;
 208 static opal_timer_t ts_end;
 209 static double accum;
 210 #define THOUSAND  1000L
 211 #define MILLION   1000000L
 212 static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end);
 213 #endif /* OPAL_ENABLE_DEBUG */
 214 
 215 /* These functions are typically unused in the optimized builds. */
 216 static void cuda_dump_evthandle(int, void *, char *) __opal_attribute_unused__ ;
 217 static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
 218 #if OPAL_ENABLE_DEBUG
 219 #define CUDA_DUMP_MEMHANDLE(a) cuda_dump_memhandle a
 220 #define CUDA_DUMP_EVTHANDLE(a) cuda_dump_evthandle a
 221 #else
 222 #define CUDA_DUMP_MEMHANDLE(a)
 223 #define CUDA_DUMP_EVTHANDLE(a)
 224 #endif /* OPAL_ENABLE_DEBUG */
 225 
 226 /* This is a seperate function so we can see these variables with ompi_info and
 227  * also set them with the tools interface */
 228 void mca_common_cuda_register_mca_variables(void)
 229 {
 230 
 231     if (false == common_cuda_mca_parames_registered) {
 232         common_cuda_mca_parames_registered = true;
 233     }
 234     /* Set different levels of verbosity in the cuda related code. */
 235     mca_common_cuda_verbose = 0;
 236     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "verbose",
 237                                  "Set level of common cuda verbosity",
 238                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 239                                  OPAL_INFO_LVL_9,
 240                                  MCA_BASE_VAR_SCOPE_READONLY,
 241                                  &mca_common_cuda_verbose);
 242 
 243     /* Control whether system buffers get CUDA pinned or not.  Allows for
 244      * performance analysis. */
 245     mca_common_cuda_register_memory = true;
 246     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "register_memory",
 247                                  "Whether to cuMemHostRegister preallocated BTL buffers",
 248                                  MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
 249                                  OPAL_INFO_LVL_9,
 250                                  MCA_BASE_VAR_SCOPE_READONLY,
 251                                  &mca_common_cuda_register_memory);
 252 
 253     /* Control whether we see warnings when CUDA memory registration fails.  This is
 254      * useful when CUDA support is configured in, but we are running a regular MPI
 255      * application without CUDA. */
 256     mca_common_cuda_warning = true;
 257     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "warning",
 258                                  "Whether to print warnings when CUDA registration fails",
 259                                  MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
 260                                  OPAL_INFO_LVL_9,
 261                                  MCA_BASE_VAR_SCOPE_READONLY,
 262                                  &mca_common_cuda_warning);
 263 
 264     /* Use this flag to test async vs sync copies */
 265     mca_common_cuda_async = 1;
 266     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "memcpy_async",
 267                                  "Set to 0 to force CUDA sync copy instead of async",
 268                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 269                                  OPAL_INFO_LVL_9,
 270                                  MCA_BASE_VAR_SCOPE_READONLY,
 271                                  &mca_common_cuda_async);
 272 
 273     /* Use this parameter to increase the number of outstanding events allows */
 274     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "event_max",
 275                                  "Set number of oustanding CUDA events",
 276                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 277                                  OPAL_INFO_LVL_9,
 278                                  MCA_BASE_VAR_SCOPE_READONLY,
 279                                  &cuda_event_max);
 280 
 281     /* Use this flag to test cuMemcpyAsync vs cuMemcpy */
 282     mca_common_cuda_cumemcpy_async = 1;
 283     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_async",
 284                                  "Set to 0 to force CUDA cuMemcpy instead of cuMemcpyAsync/cuStreamSynchronize",
 285                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 286                                  OPAL_INFO_LVL_5,
 287                                  MCA_BASE_VAR_SCOPE_READONLY,
 288                                  &mca_common_cuda_cumemcpy_async);
 289 
 290 #if OPAL_ENABLE_DEBUG
 291     /* Use this flag to dump out timing of cumempcy sync and async */
 292     mca_common_cuda_cumemcpy_timing = 0;
 293     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_timing",
 294                                  "Set to 1 to dump timing of eager copies",
 295                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 296                                  OPAL_INFO_LVL_5,
 297                                  MCA_BASE_VAR_SCOPE_READONLY,
 298                                  &mca_common_cuda_cumemcpy_timing);
 299 #endif /* OPAL_ENABLE_DEBUG */
 300 
 301     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "gpu_mem_check_workaround",
 302                                  "Set to 0 to disable GPU memory check workaround. A user would rarely have to do this.",
 303                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 304                                  OPAL_INFO_LVL_9,
 305                                  MCA_BASE_VAR_SCOPE_READONLY,
 306                                  &mca_common_cuda_gpu_mem_check_workaround);
 307 }
 308 
 309 /**
 310  * This is the first stage of initialization.  This function is called
 311  * explicitly by any BTLs that can support CUDA-aware. It is called during
 312  * the component open phase of initialization. This fuction will look for
 313  * the SONAME of the library which is libcuda.so.1. In most cases, this will
 314  * result in the library found.  However, there are some setups that require
 315  * the extra steps for searching. This function will then load the symbols
 316  * needed from the CUDA driver library. Any failure will result in this
 317  * initialization failing and status will be set showing that.
 318  */
 319 int mca_common_cuda_stage_one_init(void)
 320 {
 321     int retval, i, j;
 322     char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL};
 323     char *searchpaths[] = {"", "/usr/lib64", NULL};
 324     char **errmsgs = NULL;
 325     char *errmsg = NULL;
 326     int errsize;
 327     bool stage_one_init_passed = false;
 328 
 329     stage_one_init_ref_count++;
 330     if (stage_one_init_ref_count > 1) {
 331         opal_output_verbose(10, mca_common_cuda_output,
 332                             "CUDA: stage_one_init_ref_count is now %d, no need to init",
 333                             stage_one_init_ref_count);
 334         return OPAL_SUCCESS;
 335     }
 336 
 337     /* This is a no-op in most cases as the parameters were registered earlier */
 338     mca_common_cuda_register_mca_variables();
 339 
 340     OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t);
 341     OBJ_CONSTRUCT(&common_cuda_htod_lock, opal_mutex_t);
 342     OBJ_CONSTRUCT(&common_cuda_dtoh_lock, opal_mutex_t);
 343     OBJ_CONSTRUCT(&common_cuda_ipc_lock, opal_mutex_t);
 344 
 345     mca_common_cuda_output = opal_output_open(NULL);
 346     opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose);
 347 
 348     opal_output_verbose(10, mca_common_cuda_output,
 349                         "CUDA: stage_one_init_ref_count is now %d, initializing",
 350                         stage_one_init_ref_count);
 351 
 352     /* First check if the support is enabled.  In the case that the user has
 353      * turned it off, we do not need to continue with any CUDA specific
 354      * initialization.  Do this after MCA parameter registration. */
 355     if (!opal_cuda_support) {
 356         return 1;
 357     }
 358 
 359     if (!OPAL_HAVE_DL_SUPPORT) {
 360         opal_show_help("help-mpi-common-cuda.txt", "dlopen disabled", true);
 361         return 1;
 362     }
 363 
 364     /* Now walk through all the potential names libcuda and find one
 365      * that works.  If it does, all is good.  If not, print out all
 366      * the messages about why things failed.  This code was careful
 367      * to try and save away all error messages if the loading ultimately
 368      * failed to help with debugging.
 369      *
 370      * NOTE: On the first loop we just utilize the default loading
 371      * paths from the system.  For the second loop, set /usr/lib64 to
 372      * the search path and try again.  This is done to handle the case
 373      * where we have both 32 and 64 bit libcuda.so libraries
 374      * installed.  Even when running in 64-bit mode, the /usr/lib
 375      * directory is searched first and we may find a 32-bit
 376      * libcuda.so.1 library.  Loading of this library will fail as the
 377      * OPAL DL framework does not handle having the wrong ABI in the
 378      * search path (unlike ld or ld.so).  Note that we only set this
 379      * search path after the original search.  This is so that
 380      * LD_LIBRARY_PATH and run path settings are respected.  Setting
 381      * this search path overrides them (rather then being
 382      * appended). */
 383     j = 0;
 384     while (searchpaths[j] != NULL) {
 385         i = 0;
 386         while (cudalibs[i] != NULL) {
 387             char *filename = NULL;
 388             char *str = NULL;
 389 
 390             /* If there's a non-empty search path, prepend it
 391                to the library filename */
 392             if (strlen(searchpaths[j]) > 0) {
 393                 opal_asprintf(&filename, "%s/%s", searchpaths[j], cudalibs[i]);
 394             } else {
 395                 filename = strdup(cudalibs[i]);
 396             }
 397             if (NULL == filename) {
 398                 opal_show_help("help-mpi-common-cuda.txt", "No memory",
 399                                true, OPAL_PROC_MY_HOSTNAME);
 400                 return 1;
 401             }
 402 
 403             retval = opal_dl_open(filename, false, false,
 404                                   &libcuda_handle, &str);
 405             if (OPAL_SUCCESS != retval || NULL == libcuda_handle) {
 406                 if (NULL != str) {
 407                     opal_argv_append(&errsize, &errmsgs, str);
 408                 } else {
 409                     opal_argv_append(&errsize, &errmsgs,
 410                                      "opal_dl_open() returned NULL.");
 411                 }
 412                 opal_output_verbose(10, mca_common_cuda_output,
 413                                     "CUDA: Library open error: %s",
 414                                     errmsgs[errsize-1]);
 415             } else {
 416                 opal_output_verbose(10, mca_common_cuda_output,
 417                                     "CUDA: Library successfully opened %s",
 418                                     cudalibs[i]);
 419                 stage_one_init_passed = true;
 420                 break;
 421             }
 422             i++;
 423 
 424             free(filename);
 425         }
 426         if (true == stage_one_init_passed) {
 427             break; /* Break out of outer loop */
 428         }
 429         j++;
 430     }
 431 
 432     if (true != stage_one_init_passed) {
 433         errmsg = opal_argv_join(errmsgs, '\n');
 434         if (opal_warn_on_missing_libcuda) {
 435             opal_show_help("help-mpi-common-cuda.txt", "dlopen failed", true,
 436                            errmsg);
 437         }
 438         opal_cuda_support = 0;
 439     }
 440     opal_argv_free(errmsgs);
 441     free(errmsg);
 442 
 443     if (true != stage_one_init_passed) {
 444         return 1;
 445     }
 446     opal_cuda_add_initialization_function(&mca_common_cuda_stage_two_init);
 447     OBJ_CONSTRUCT(&common_cuda_memory_registrations, opal_list_t);
 448 
 449     /* Map in the functions that we need.  Note that if there is an error
 450      * the macro OPAL_CUDA_DLSYM will print an error and call return.  */
 451     OPAL_CUDA_DLSYM(libcuda_handle, cuStreamCreate);
 452     OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetCurrent);
 453     OPAL_CUDA_DLSYM(libcuda_handle, cuEventCreate);
 454     OPAL_CUDA_DLSYM(libcuda_handle, cuEventRecord);
 455     OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostRegister);
 456     OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostUnregister);
 457     OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttribute);
 458     OPAL_CUDA_DLSYM(libcuda_handle, cuEventQuery);
 459     OPAL_CUDA_DLSYM(libcuda_handle, cuEventDestroy);
 460     OPAL_CUDA_DLSYM(libcuda_handle, cuStreamWaitEvent);
 461     OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpyAsync);
 462     OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpy);
 463     OPAL_CUDA_DLSYM(libcuda_handle, cuMemFree);
 464     OPAL_CUDA_DLSYM(libcuda_handle, cuMemAlloc);
 465     OPAL_CUDA_DLSYM(libcuda_handle, cuMemGetAddressRange);
 466     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetEventHandle);
 467     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenEventHandle);
 468     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenMemHandle);
 469     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcCloseMemHandle);
 470     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetMemHandle);
 471     OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetDevice);
 472     OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceCanAccessPeer);
 473     OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceGet);
 474 #if OPAL_CUDA_GDR_SUPPORT
 475     OPAL_CUDA_DLSYM(libcuda_handle, cuPointerSetAttribute);
 476 #endif /* OPAL_CUDA_GDR_SUPPORT */
 477     OPAL_CUDA_DLSYM(libcuda_handle, cuCtxSetCurrent);
 478     OPAL_CUDA_DLSYM(libcuda_handle, cuEventSynchronize);
 479     OPAL_CUDA_DLSYM(libcuda_handle, cuStreamSynchronize);
 480     OPAL_CUDA_DLSYM(libcuda_handle, cuStreamDestroy);
 481 #if OPAL_CUDA_GET_ATTRIBUTES
 482     OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttributes);
 483 #endif /* OPAL_CUDA_GET_ATTRIBUTES */
 484     return 0;
 485 }
 486 
 487 /**
 488  * This function is registered with the OPAL CUDA support.  In that way,
 489  * these function pointers will be loaded into the OPAL CUDA code when
 490  * the first convertor is initialized.  This does not trigger any CUDA
 491  * specific initialization as this may just be a host buffer that is
 492  * triggering this call.
 493  */
 494 static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *ftable)
 495 {
 496     if (OPAL_UNLIKELY(!opal_cuda_support)) {
 497         return OPAL_ERROR;
 498     }
 499 
 500     ftable->gpu_is_gpu_buffer = &mca_common_cuda_is_gpu_buffer;
 501     ftable->gpu_cu_memcpy_async = &mca_common_cuda_cu_memcpy_async;
 502     ftable->gpu_cu_memcpy = &mca_common_cuda_cu_memcpy;
 503     ftable->gpu_memmove = &mca_common_cuda_memmove;
 504 
 505     opal_output_verbose(30, mca_common_cuda_output,
 506                         "CUDA: support functions initialized");
 507     return OPAL_SUCCESS;
 508 }
 509 
 510 /**
 511  * This is the last phase of initialization.  This is triggered when we examine
 512  * a buffer pointer and determine it is a GPU buffer.  We then assume the user
 513  * has selected their GPU and we can go ahead with all the CUDA related
 514  * initializations.  If we get an error, just return.  Cleanup of resources
 515  * will happen when fini is called.
 516  */
 517 static int mca_common_cuda_stage_three_init(void)
 518 {
 519     int i, s, rc;
 520     CUresult res;
 521     CUcontext cuContext;
 522     common_cuda_mem_regs_t *mem_reg;
 523 
 524     OPAL_THREAD_LOCK(&common_cuda_init_lock);
 525     opal_output_verbose(20, mca_common_cuda_output,
 526                         "CUDA: entering stage three init");
 527 
 528 /* Compiled without support or user disabled support */
 529     if (OPAL_UNLIKELY(!opal_cuda_support)) {
 530         opal_output_verbose(20, mca_common_cuda_output,
 531                             "CUDA: No mpi cuda support, exiting stage three init");
 532         stage_three_init_complete = true;
 533         OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
 534         return OPAL_ERROR;
 535     }
 536 
 537     /* In case another thread snuck in and completed the initialization */
 538     if (true == stage_three_init_complete) {
 539         if (common_cuda_initialized) {
 540             opal_output_verbose(20, mca_common_cuda_output,
 541                                 "CUDA: Stage three already complete, exiting stage three init");
 542             OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
 543             return OPAL_SUCCESS;
 544         } else {
 545             opal_output_verbose(20, mca_common_cuda_output,
 546                                 "CUDA: Stage three already complete, failed during the init");
 547             OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
 548             return OPAL_ERROR;
 549         }
 550     }
 551 
 552     /* Check to see if this process is running in a CUDA context.  If
 553      * so, all is good.  If not, then disable registration of memory. */
 554     res = cuFunc.cuCtxGetCurrent(&cuContext);
 555     if (CUDA_SUCCESS != res) {
 556         if (mca_common_cuda_warning) {
 557             /* Check for the not initialized error since we can make suggestions to
 558              * user for this error. */
 559             if (CUDA_ERROR_NOT_INITIALIZED == res) {
 560                 opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent failed not initialized",
 561                                true);
 562             } else {
 563                 opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent failed",
 564                                true, res);
 565             }
 566         }
 567         mca_common_cuda_enabled = false;
 568         mca_common_cuda_register_memory = false;
 569     } else if ((CUDA_SUCCESS == res) && (NULL == cuContext)) {
 570         if (mca_common_cuda_warning) {
 571             opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent returned NULL",
 572                            true);
 573         }
 574         mca_common_cuda_enabled = false;
 575         mca_common_cuda_register_memory = false;
 576     } else {
 577         /* All is good.  mca_common_cuda_register_memory will retain its original
 578          * value.  Normally, that is 1, but the user can override it to disable
 579          * registration of the internal buffers. */
 580         mca_common_cuda_enabled = true;
 581         opal_output_verbose(20, mca_common_cuda_output,
 582                             "CUDA: cuCtxGetCurrent succeeded");
 583     }
 584 
 585     /* No need to go on at this point.  If we cannot create a context and we are at
 586      * the point where we are making MPI calls, it is time to fully disable
 587      * CUDA support.
 588      */
 589     if (false == mca_common_cuda_enabled) {
 590         OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
 591         return OPAL_ERROR;
 592     }
 593 
 594     if (true == mca_common_cuda_enabled) {
 595         /* Set up an array to store outstanding IPC async copy events */
 596         cuda_event_ipc_num_used = 0;
 597         cuda_event_ipc_first_avail = 0;
 598         cuda_event_ipc_first_used = 0;
 599 
 600         cuda_event_ipc_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
 601         if (NULL == cuda_event_ipc_array) {
 602             opal_show_help("help-mpi-common-cuda.txt", "No memory",
 603                            true, OPAL_PROC_MY_HOSTNAME);
 604             rc = OPAL_ERROR;
 605             goto cleanup_and_error;
 606         }
 607 
 608         /* Create the events since they can be reused. */
 609         for (i = 0; i < cuda_event_max; i++) {
 610             res = cuFunc.cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING);
 611             if (CUDA_SUCCESS != res) {
 612                 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
 613                                true, OPAL_PROC_MY_HOSTNAME, res);
 614                 rc = OPAL_ERROR;
 615                 goto cleanup_and_error;
 616             }
 617         }
 618 
 619         /* The first available status index is 0.  Make an empty frag
 620            array. */
 621         cuda_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **)
 622             malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
 623         if (NULL == cuda_event_ipc_frag_array) {
 624             opal_show_help("help-mpi-common-cuda.txt", "No memory",
 625                            true, OPAL_PROC_MY_HOSTNAME);
 626             rc = OPAL_ERROR;
 627             goto cleanup_and_error;
 628         }
 629     }
 630 
 631     if (true == mca_common_cuda_enabled) {
 632         /* Set up an array to store outstanding async dtoh events.  Used on the
 633          * sending side for asynchronous copies. */
 634         cuda_event_dtoh_num_used = 0;
 635         cuda_event_dtoh_first_avail = 0;
 636         cuda_event_dtoh_first_used = 0;
 637 
 638         cuda_event_dtoh_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
 639         if (NULL == cuda_event_dtoh_array) {
 640             opal_show_help("help-mpi-common-cuda.txt", "No memory",
 641                            true, OPAL_PROC_MY_HOSTNAME);
 642             rc = OPAL_ERROR;
 643             goto cleanup_and_error;
 644         }
 645 
 646         /* Create the events since they can be reused. */
 647         for (i = 0; i < cuda_event_max; i++) {
 648             res = cuFunc.cuEventCreate(&cuda_event_dtoh_array[i], CU_EVENT_DISABLE_TIMING);
 649             if (CUDA_SUCCESS != res) {
 650                 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
 651                                true, OPAL_PROC_MY_HOSTNAME, res);
 652                 rc = OPAL_ERROR;
 653                 goto cleanup_and_error;
 654             }
 655         }
 656 
 657         /* The first available status index is 0.  Make an empty frag
 658            array. */
 659         cuda_event_dtoh_frag_array = (struct mca_btl_base_descriptor_t **)
 660             malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
 661         if (NULL == cuda_event_dtoh_frag_array) {
 662             opal_show_help("help-mpi-common-cuda.txt", "No memory",
 663                            true, OPAL_PROC_MY_HOSTNAME);
 664             rc = OPAL_ERROR;
 665             goto cleanup_and_error;
 666         }
 667 
 668         /* Set up an array to store outstanding async htod events.  Used on the
 669          * receiving side for asynchronous copies. */
 670         cuda_event_htod_num_used = 0;
 671         cuda_event_htod_first_avail = 0;
 672         cuda_event_htod_first_used = 0;
 673 
 674         cuda_event_htod_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
 675         if (NULL == cuda_event_htod_array) {
 676             opal_show_help("help-mpi-common-cuda.txt", "No memory",
 677                            true, OPAL_PROC_MY_HOSTNAME);
 678            rc = OPAL_ERROR;
 679            goto cleanup_and_error;
 680         }
 681 
 682         /* Create the events since they can be reused. */
 683         for (i = 0; i < cuda_event_max; i++) {
 684             res = cuFunc.cuEventCreate(&cuda_event_htod_array[i], CU_EVENT_DISABLE_TIMING);
 685             if (CUDA_SUCCESS != res) {
 686                 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
 687                                true, OPAL_PROC_MY_HOSTNAME, res);
 688                rc = OPAL_ERROR;
 689                goto cleanup_and_error;
 690             }
 691         }
 692 
 693         /* The first available status index is 0.  Make an empty frag
 694            array. */
 695         cuda_event_htod_frag_array = (struct mca_btl_base_descriptor_t **)
 696             malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
 697         if (NULL == cuda_event_htod_frag_array) {
 698             opal_show_help("help-mpi-common-cuda.txt", "No memory",
 699                            true, OPAL_PROC_MY_HOSTNAME);
 700            rc = OPAL_ERROR;
 701            goto cleanup_and_error;
 702         }
 703     }
 704 
 705     s = opal_list_get_size(&common_cuda_memory_registrations);
 706     for(i = 0; i < s; i++) {
 707         mem_reg = (common_cuda_mem_regs_t *)
 708             opal_list_remove_first(&common_cuda_memory_registrations);
 709         if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
 710             res = cuFunc.cuMemHostRegister(mem_reg->ptr, mem_reg->amount, 0);
 711             if (res != CUDA_SUCCESS) {
 712                 /* If registering the memory fails, print a message and continue.
 713                  * This is not a fatal error. */
 714                 opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister during init failed",
 715                                true, mem_reg->ptr, mem_reg->amount,
 716                                OPAL_PROC_MY_HOSTNAME, res, mem_reg->msg);
 717             } else {
 718                 opal_output_verbose(20, mca_common_cuda_output,
 719                                     "CUDA: cuMemHostRegister OK on rcache %s: "
 720                                     "address=%p, bufsize=%d",
 721                                     mem_reg->msg, mem_reg->ptr, (int)mem_reg->amount);
 722             }
 723         }
 724         free(mem_reg->msg);
 725         OBJ_RELEASE(mem_reg);
 726     }
 727 
 728     /* Create stream for use in ipc asynchronous copies */
 729     res = cuFunc.cuStreamCreate(&ipcStream, 0);
 730     if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
 731         opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
 732                        true, OPAL_PROC_MY_HOSTNAME, res);
 733         rc = OPAL_ERROR;
 734         goto cleanup_and_error;
 735     }
 736 
 737     /* Create stream for use in dtoh asynchronous copies */
 738     res = cuFunc.cuStreamCreate(&dtohStream, 0);
 739     if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
 740         opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
 741                        true, OPAL_PROC_MY_HOSTNAME, res);
 742         rc = OPAL_ERROR;
 743         goto cleanup_and_error;
 744     }
 745 
 746     /* Create stream for use in htod asynchronous copies */
 747     res = cuFunc.cuStreamCreate(&htodStream, 0);
 748     if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
 749         opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
 750                        true, OPAL_PROC_MY_HOSTNAME, res);
 751         rc = OPAL_ERROR;
 752         goto cleanup_and_error;
 753     }
 754 
 755     if (mca_common_cuda_cumemcpy_async) {
 756         /* Create stream for use in cuMemcpyAsync synchronous copies */
 757         res = cuFunc.cuStreamCreate(&memcpyStream, 0);
 758         if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
 759             opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
 760                            true, OPAL_PROC_MY_HOSTNAME, res);
 761             rc = OPAL_ERROR;
 762             goto cleanup_and_error;
 763         }
 764     }
 765 
 766     res = cuFunc.cuMemHostRegister(&checkmem, sizeof(int), 0);
 767     if (res != CUDA_SUCCESS) {
 768         /* If registering the memory fails, print a message and continue.
 769          * This is not a fatal error. */
 770         opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister during init failed",
 771                        true, &checkmem, sizeof(int),
 772                        OPAL_PROC_MY_HOSTNAME, res, "checkmem");
 773 
 774     } else {
 775         opal_output_verbose(20, mca_common_cuda_output,
 776                             "CUDA: cuMemHostRegister OK on test region");
 777     }
 778 
 779     opal_output_verbose(20, mca_common_cuda_output,
 780                         "CUDA: the extra gpu memory check is %s", (mca_common_cuda_gpu_mem_check_workaround == 1) ? "on":"off");
 781 
 782     opal_output_verbose(30, mca_common_cuda_output,
 783                         "CUDA: initialized");
 784     opal_atomic_mb();  /* Make sure next statement does not get reordered */
 785     common_cuda_initialized = true;
 786     stage_three_init_complete = true;
 787     OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
 788     return OPAL_SUCCESS;
 789 
 790     /* If we are here, something went wrong.  Cleanup and return an error. */
 791  cleanup_and_error:
 792     opal_atomic_mb(); /* Make sure next statement does not get reordered */
 793     stage_three_init_complete = true;
 794     OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
 795     return rc;
 796 }
 797 
 798 /**
 799  * Cleanup all CUDA resources.
 800  *
 801  * Note: Still figuring out how to get cuMemHostUnregister called from the smcuda sm
 802  * rcache.  Looks like with the memory pool from openib (grdma), the unregistering is
 803  * called as the free list is destructed.  Not true for the sm mpool.  This means we
 804  * are currently still leaking some host memory we registered with CUDA.
 805  */
 806 void mca_common_cuda_fini(void)
 807 {
 808     int i;
 809     CUresult res;
 810 
 811     if (false == common_cuda_initialized) {
 812         stage_one_init_ref_count--;
 813         opal_output_verbose(20, mca_common_cuda_output,
 814                             "CUDA: mca_common_cuda_fini, never completed initialization so "
 815                             "skipping fini, ref_count is now %d", stage_one_init_ref_count);
 816         return;
 817     }
 818 
 819     if (0 == stage_one_init_ref_count) {
 820         opal_output_verbose(20, mca_common_cuda_output,
 821                             "CUDA: mca_common_cuda_fini, ref_count=%d, fini is already complete",
 822                             stage_one_init_ref_count);
 823         return;
 824     }
 825 
 826     if (1 == stage_one_init_ref_count) {
 827         opal_output_verbose(20, mca_common_cuda_output,
 828                             "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up started",
 829                             stage_one_init_ref_count);
 830 
 831         /* This call is in here to make sure the context is still valid.
 832          * This was the one way of checking which did not cause problems
 833          * while calling into the CUDA library.  This check will detect if
 834          * a user has called cudaDeviceReset prior to MPI_Finalize. If so,
 835          * then this call will fail and we skip cleaning up CUDA resources. */
 836         res = cuFunc.cuMemHostUnregister(&checkmem);
 837         if (CUDA_SUCCESS != res) {
 838             ctx_ok = 0;
 839         }
 840         opal_output_verbose(20, mca_common_cuda_output,
 841                             "CUDA: mca_common_cuda_fini, cuMemHostUnregister returned %d, ctx_ok=%d",
 842                             res, ctx_ok);
 843 
 844         if (NULL != cuda_event_ipc_array) {
 845             if (ctx_ok) {
 846                 for (i = 0; i < cuda_event_max; i++) {
 847                     if (NULL != cuda_event_ipc_array[i]) {
 848                         cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
 849                     }
 850                 }
 851             }
 852             free(cuda_event_ipc_array);
 853         }
 854         if (NULL != cuda_event_htod_array) {
 855             if (ctx_ok) {
 856                 for (i = 0; i < cuda_event_max; i++) {
 857                     if (NULL != cuda_event_htod_array[i]) {
 858                         cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
 859                     }
 860                 }
 861             }
 862             free(cuda_event_htod_array);
 863         }
 864 
 865         if (NULL != cuda_event_dtoh_array) {
 866             if (ctx_ok) {
 867                 for (i = 0; i < cuda_event_max; i++) {
 868                     if (NULL != cuda_event_dtoh_array[i]) {
 869                         cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
 870                     }
 871                 }
 872             }
 873             free(cuda_event_dtoh_array);
 874         }
 875 
 876         if (NULL != cuda_event_ipc_frag_array) {
 877             free(cuda_event_ipc_frag_array);
 878         }
 879         if (NULL != cuda_event_htod_frag_array) {
 880             free(cuda_event_htod_frag_array);
 881         }
 882         if (NULL != cuda_event_dtoh_frag_array) {
 883             free(cuda_event_dtoh_frag_array);
 884         }
 885         if ((NULL != ipcStream) && ctx_ok) {
 886             cuFunc.cuStreamDestroy(ipcStream);
 887         }
 888         if ((NULL != dtohStream) && ctx_ok) {
 889             cuFunc.cuStreamDestroy(dtohStream);
 890         }
 891         if ((NULL != htodStream) && ctx_ok) {
 892             cuFunc.cuStreamDestroy(htodStream);
 893         }
 894         if ((NULL != memcpyStream) && ctx_ok) {
 895             cuFunc.cuStreamDestroy(memcpyStream);
 896         }
 897         OBJ_DESTRUCT(&common_cuda_init_lock);
 898         OBJ_DESTRUCT(&common_cuda_htod_lock);
 899         OBJ_DESTRUCT(&common_cuda_dtoh_lock);
 900         OBJ_DESTRUCT(&common_cuda_ipc_lock);
 901         if (NULL != libcuda_handle) {
 902             opal_dl_close(libcuda_handle);
 903         }
 904 
 905         opal_output_verbose(20, mca_common_cuda_output,
 906                             "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up all done",
 907                             stage_one_init_ref_count);
 908 
 909         opal_output_close(mca_common_cuda_output);
 910 
 911     } else {
 912         opal_output_verbose(20, mca_common_cuda_output,
 913                             "CUDA: mca_common_cuda_fini, ref_count=%d, cuda still in use",
 914                             stage_one_init_ref_count);
 915     }
 916     stage_one_init_ref_count--;
 917 }
 918 
 919 /**
 920  * Call the CUDA register function so we pin the memory in the CUDA
 921  * space.
 922  */
 923 void mca_common_cuda_register(void *ptr, size_t amount, char *msg) {
 924     int res;
 925 
 926     /* Always first check if the support is enabled.  If not, just return */
 927     if (!opal_cuda_support)
 928         return;
 929 
 930     if (!common_cuda_initialized) {
 931         OPAL_THREAD_LOCK(&common_cuda_init_lock);
 932         if (!common_cuda_initialized) {
 933             common_cuda_mem_regs_t *regptr;
 934             regptr = OBJ_NEW(common_cuda_mem_regs_t);
 935             regptr->ptr = ptr;
 936             regptr->amount = amount;
 937             regptr->msg = strdup(msg);
 938             opal_list_append(&common_cuda_memory_registrations,
 939                              (opal_list_item_t*)regptr);
 940             OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
 941             return;
 942         }
 943         OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
 944     }
 945 
 946     if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
 947         res = cuFunc.cuMemHostRegister(ptr, amount, 0);
 948         if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
 949             /* If registering the memory fails, print a message and continue.
 950              * This is not a fatal error. */
 951             opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister failed",
 952                            true, ptr, amount,
 953                            OPAL_PROC_MY_HOSTNAME, res, msg);
 954         } else {
 955             opal_output_verbose(20, mca_common_cuda_output,
 956                                 "CUDA: cuMemHostRegister OK on rcache %s: "
 957                                 "address=%p, bufsize=%d",
 958                                 msg, ptr, (int)amount);
 959         }
 960     }
 961 }
 962 
 963 /**
 964  * Call the CUDA unregister function so we unpin the memory in the CUDA
 965  * space.
 966  */
 967 void mca_common_cuda_unregister(void *ptr, char *msg) {
 968     int res, i, s;
 969     common_cuda_mem_regs_t *mem_reg;
 970 
 971     /* This can happen if memory was queued up to be registered, but
 972      * no CUDA operations happened, so it never was registered.
 973      * Therefore, just release any of the resources. */
 974     if (!common_cuda_initialized) {
 975         s = opal_list_get_size(&common_cuda_memory_registrations);
 976         for(i = 0; i < s; i++) {
 977             mem_reg = (common_cuda_mem_regs_t *)
 978                 opal_list_remove_first(&common_cuda_memory_registrations);
 979             free(mem_reg->msg);
 980             OBJ_RELEASE(mem_reg);
 981         }
 982         return;
 983     }
 984 
 985     if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
 986         res = cuFunc.cuMemHostUnregister(ptr);
 987         if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
 988             /* If unregistering the memory fails, just continue.  This is during
 989              * shutdown.  Only print when running in verbose mode. */
 990             opal_output_verbose(20, mca_common_cuda_output,
 991                                 "CUDA: cuMemHostUnregister failed: ptr=%p, res=%d, rcache=%s",
 992                                 ptr, res, msg);
 993 
 994         } else {
 995             opal_output_verbose(20, mca_common_cuda_output,
 996                                 "CUDA: cuMemHostUnregister OK on rcache %s: "
 997                                 "address=%p",
 998                                 msg, ptr);
 999         }
1000     }
1001 }
1002 
1003 /*
1004  * Get the memory handle of a local section of memory that can be sent
1005  * to the remote side so it can access the memory.  This is the
1006  * registration function for the sending side of a message transfer.
1007  */
1008 int cuda_getmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg,
1009                       mca_rcache_base_registration_t *hdrreg)
1010 
1011 {
1012     CUmemorytype memType;
1013     CUresult result;
1014     CUipcMemHandle *memHandle;
1015     CUdeviceptr pbase;
1016     size_t psize;
1017 
1018     mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t*)newreg;
1019     memHandle = (CUipcMemHandle *)cuda_reg->data.memHandle;
1020 
1021     /* We should only be there if this is a CUDA device pointer */
1022     result = cuFunc.cuPointerGetAttribute(&memType,
1023                                           CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)base);
1024     assert(CUDA_SUCCESS == result);
1025     assert(CU_MEMORYTYPE_DEVICE == memType);
1026 
1027     /* Get the memory handle so we can send it to the remote process. */
1028     result = cuFunc.cuIpcGetMemHandle(memHandle, (CUdeviceptr)base);
1029     CUDA_DUMP_MEMHANDLE((100, memHandle, "GetMemHandle-After"));
1030 
1031     if (CUDA_SUCCESS != result) {
1032         opal_show_help("help-mpi-common-cuda.txt", "cuIpcGetMemHandle failed",
1033                        true, result, base);
1034         return OPAL_ERROR;
1035     } else {
1036         opal_output_verbose(20, mca_common_cuda_output,
1037                             "CUDA: cuIpcGetMemHandle passed: base=%p size=%d",
1038                             base, (int)size);
1039     }
1040 
1041     /* Need to get the real base and size of the memory handle.  This is
1042      * how the remote side saves the handles in a cache. */
1043     result = cuFunc.cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr)base);
1044     if (CUDA_SUCCESS != result) {
1045         opal_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed",
1046                        true, result, base);
1047         return OPAL_ERROR;
1048     } else {
1049         opal_output_verbose(10, mca_common_cuda_output,
1050                             "CUDA: cuMemGetAddressRange passed: addr=%p, size=%d, pbase=%p, psize=%d ",
1051                             base, (int)size, (void *)pbase, (int)psize);
1052     }
1053 
1054     /* Store all the information in the registration */
1055     cuda_reg->base.base = (void *)pbase;
1056     cuda_reg->base.bound = (unsigned char *)pbase + psize - 1;
1057     cuda_reg->data.memh_seg_addr.pval = (void *) pbase;
1058     cuda_reg->data.memh_seg_len = psize;
1059 
1060 #if OPAL_CUDA_SYNC_MEMOPS
1061     /* With CUDA 6.0, we can set an attribute on the memory pointer that will
1062      * ensure any synchronous copies are completed prior to any other access
1063      * of the memory region.  This means we do not need to record an event
1064      * and send to the remote side.
1065      */
1066     memType = 1; /* Just use this variable since we already have it */
1067     result = cuFunc.cuPointerSetAttribute(&memType, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
1068                                           (CUdeviceptr)base);
1069     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1070         opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed",
1071                        true, OPAL_PROC_MY_HOSTNAME, result, base);
1072         return OPAL_ERROR;
1073     }
1074 #else
1075     /* Need to record the event to ensure that any memcopies into the
1076      * device memory have completed.  The event handle associated with
1077      * this event is sent to the remote process so that it will wait
1078      * on this event prior to copying data out of the device memory.
1079      * Note that this needs to be the NULL stream to make since it is
1080      * unknown what stream any copies into the device memory were done
1081      * with. */
1082     result = cuFunc.cuEventRecord((CUevent)cuda_reg->data.event, 0);
1083     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1084         opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1085                        true, result, base);
1086         return OPAL_ERROR;
1087     }
1088 #endif /* OPAL_CUDA_SYNC_MEMOPS */
1089 
1090     return OPAL_SUCCESS;
1091 }
1092 
1093 /*
1094  * This function is called by the local side that called the cuda_getmemhandle.
1095  * There is nothing to be done so just return.
1096  */
1097 int cuda_ungetmemhandle(void *reg_data, mca_rcache_base_registration_t *reg)
1098 {
1099     opal_output_verbose(10, mca_common_cuda_output,
1100                         "CUDA: cuda_ungetmemhandle (no-op): base=%p", reg->base);
1101     CUDA_DUMP_MEMHANDLE((100, ((mca_rcache_common_cuda_reg_t *)reg)->data.memHandle, "cuda_ungetmemhandle"));
1102 
1103     return OPAL_SUCCESS;
1104 }
1105 
1106 /*
1107  * Open a memory handle that refers to remote memory so we can get an address
1108  * that works on the local side.  This is the registration function for the
1109  * remote side of a transfer.  newreg contains the new handle.  hdrreg contains
1110  * the memory handle that was received from the remote side.
1111  */
1112 int cuda_openmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg,
1113                        mca_rcache_base_registration_t *hdrreg)
1114 {
1115     CUresult result;
1116     CUipcMemHandle *memHandle;
1117     mca_rcache_common_cuda_reg_t *cuda_newreg = (mca_rcache_common_cuda_reg_t*)newreg;
1118 
1119     /* Save in local variable to avoid ugly casting */
1120     memHandle = (CUipcMemHandle *)cuda_newreg->data.memHandle;
1121     CUDA_DUMP_MEMHANDLE((100, memHandle, "Before call to cuIpcOpenMemHandle"));
1122 
1123     /* Open the memory handle and store it into the registration structure. */
1124     result = cuFunc.cuIpcOpenMemHandle((CUdeviceptr *)&newreg->alloc_base, *memHandle,
1125                                        CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
1126 
1127     /* If there are some stale entries in the cache, they can cause other
1128      * registrations to fail.  Let the caller know that so that can attempt
1129      * to clear them out. */
1130     if (CUDA_ERROR_ALREADY_MAPPED == result) {
1131         opal_output_verbose(10, mca_common_cuda_output,
1132                             "CUDA: cuIpcOpenMemHandle returned CUDA_ERROR_ALREADY_MAPPED for "
1133                             "p=%p,size=%d: notify memory pool\n", base, (int)size);
1134         return OPAL_ERR_WOULD_BLOCK;
1135     }
1136     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1137         opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenMemHandle failed",
1138                        true, OPAL_PROC_MY_HOSTNAME, result, base);
1139         /* Currently, this is a non-recoverable error */
1140         return OPAL_ERROR;
1141     } else {
1142         opal_output_verbose(10, mca_common_cuda_output,
1143                             "CUDA: cuIpcOpenMemHandle passed: base=%p (remote base=%p,size=%d)",
1144                             newreg->alloc_base, base, (int)size);
1145         CUDA_DUMP_MEMHANDLE((200, memHandle, "cuIpcOpenMemHandle"));
1146     }
1147 
1148     return OPAL_SUCCESS;
1149 }
1150 
1151 /*
1152  * Close a memory handle that refers to remote memory.
1153  */
1154 int cuda_closememhandle(void *reg_data, mca_rcache_base_registration_t *reg)
1155 {
1156     CUresult result;
1157     mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t*)reg;
1158 
1159     /* Only attempt to close if we have valid context.  This can change if a call
1160      * to the fini function is made and we discover context is gone. */
1161     if (ctx_ok) {
1162         result = cuFunc.cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base);
1163         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1164             if (CUDA_ERROR_DEINITIALIZED != result) {
1165                 opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed",
1166                 true, result, cuda_reg->base.alloc_base);
1167             }
1168             /* We will just continue on and hope things continue to work. */
1169         } else {
1170             opal_output_verbose(10, mca_common_cuda_output,
1171                                 "CUDA: cuIpcCloseMemHandle passed: base=%p",
1172                                 cuda_reg->base.alloc_base);
1173             CUDA_DUMP_MEMHANDLE((100, cuda_reg->data.memHandle, "cuIpcCloseMemHandle"));
1174         }
1175     }
1176 
1177     return OPAL_SUCCESS;
1178 }
1179 
1180 void mca_common_cuda_construct_event_and_handle(uintptr_t *event, void *handle)
1181 {
1182     CUresult result;
1183 
1184     result = cuFunc.cuEventCreate((CUevent *)event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
1185     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1186         opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
1187                        true, OPAL_PROC_MY_HOSTNAME, result);
1188     }
1189 
1190     result = cuFunc.cuIpcGetEventHandle((CUipcEventHandle *)handle, (CUevent)*event);
1191     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1192         opal_show_help("help-mpi-common-cuda.txt", "cuIpcGetEventHandle failed",
1193                        true, result);
1194     }
1195 
1196     CUDA_DUMP_EVTHANDLE((10, handle, "construct_event_and_handle"));
1197 
1198 }
1199 
1200 void mca_common_cuda_destruct_event(uintptr_t event)
1201 {
1202     CUresult result;
1203 
1204     /* Only attempt to destroy if we have valid context.  This can change if a call
1205      * to the fini function is made and we discover context is gone. */
1206     if (ctx_ok) {
1207         result = cuFunc.cuEventDestroy((CUevent)event);
1208         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1209             opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
1210                            true, result);
1211         }
1212     }
1213 }
1214 
1215 
1216 /*
1217  * Put remote event on stream to ensure that the start of the
1218  * copy does not start until the completion of the event.
1219  */
1220 void mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t *rget_reg)
1221 {
1222 #if OPAL_CUDA_SYNC_MEMOPS
1223     /* No need for any of this with SYNC_MEMOPS feature */
1224     return;
1225 #else /* OPAL_CUDA_SYNC_MEMOPS */
1226     CUipcEventHandle evtHandle;
1227     CUevent event;
1228     CUresult result;
1229 
1230     memcpy(&evtHandle, rget_reg->data.evtHandle, sizeof(evtHandle));
1231     CUDA_DUMP_EVTHANDLE((100, &evtHandle, "stream_synchronize"));
1232 
1233     result = cuFunc.cuIpcOpenEventHandle(&event, evtHandle);
1234     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1235         opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenEventHandle failed",
1236                        true, result);
1237     }
1238 
1239     /* BEGIN of Workaround - There is a bug in CUDA 4.1 RC2 and earlier
1240      * versions.  Need to record an event on the stream, even though
1241      * it is not used, to make sure we do not short circuit our way
1242      * out of the cuStreamWaitEvent test.
1243      */
1244     result = cuFunc.cuEventRecord(event, 0);
1245     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1246         opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1247                        true, OPAL_PROC_MY_HOSTNAME, result);
1248     }
1249     /* END of Workaround */
1250 
1251     result = cuFunc.cuStreamWaitEvent(0, event, 0);
1252     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1253         opal_show_help("help-mpi-common-cuda.txt", "cuStreamWaitEvent failed",
1254                        true, result);
1255     }
1256 
1257     /* All done with this event. */
1258     result = cuFunc.cuEventDestroy(event);
1259     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1260         opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
1261                        true, result);
1262     }
1263 #endif /* OPAL_CUDA_SYNC_MEMOPS */
1264 }
1265 
1266 /*
1267  * Start the asynchronous copy.  Then record and save away an event that will
1268  * be queried to indicate the copy has completed.
1269  */
1270 int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
1271                            struct mca_btl_base_descriptor_t *frag, int *done)
1272 {
1273     CUresult result;
1274     int iter;
1275 
1276     OPAL_THREAD_LOCK(&common_cuda_ipc_lock);
1277     /* First make sure there is room to store the event.  If not, then
1278      * return an error.  The error message will tell the user to try and
1279      * run again, but with a larger array for storing events. */
1280     if (cuda_event_ipc_num_used == cuda_event_max) {
1281         opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
1282                        true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
1283         OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1284         return OPAL_ERR_OUT_OF_RESOURCE;
1285     }
1286 
1287     if (cuda_event_ipc_num_used > cuda_event_ipc_most) {
1288         cuda_event_ipc_most = cuda_event_ipc_num_used;
1289         /* Just print multiples of 10 */
1290         if (0 == (cuda_event_ipc_most % 10)) {
1291             opal_output_verbose(20, mca_common_cuda_output,
1292                                 "Maximum ipc events used is now %d", cuda_event_ipc_most);
1293         }
1294     }
1295 
1296     /* This is the standard way to run.  Running with synchronous copies is available
1297      * to measure the advantages of asynchronous copies. */
1298     if (OPAL_LIKELY(mca_common_cuda_async)) {
1299         result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
1300         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1301             opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1302                            true, dst, src, amount, result);
1303             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1304             return OPAL_ERROR;
1305         } else {
1306             opal_output_verbose(20, mca_common_cuda_output,
1307                                 "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
1308                                 dst, src, (int)amount);
1309         }
1310         result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
1311         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1312             opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1313                            true, OPAL_PROC_MY_HOSTNAME, result);
1314             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1315             return OPAL_ERROR;
1316         }
1317         cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
1318 
1319         /* Bump up the first available slot and number used by 1 */
1320         cuda_event_ipc_first_avail++;
1321         if (cuda_event_ipc_first_avail >= cuda_event_max) {
1322             cuda_event_ipc_first_avail = 0;
1323         }
1324         cuda_event_ipc_num_used++;
1325 
1326         *done = 0;
1327     } else {
1328         /* Mimic the async function so they use the same memcpy call. */
1329         result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
1330         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1331             opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1332                            true, dst, src, amount, result);
1333             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1334             return OPAL_ERROR;
1335         } else {
1336             opal_output_verbose(20, mca_common_cuda_output,
1337                                 "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
1338                                 dst, src, (int)amount);
1339         }
1340 
1341         /* Record an event, then wait for it to complete with calls to cuEventQuery */
1342         result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
1343         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1344             opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1345                            true, OPAL_PROC_MY_HOSTNAME, result);
1346             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1347             return OPAL_ERROR;
1348         }
1349 
1350         cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
1351 
1352         /* Bump up the first available slot and number used by 1 */
1353         cuda_event_ipc_first_avail++;
1354         if (cuda_event_ipc_first_avail >= cuda_event_max) {
1355             cuda_event_ipc_first_avail = 0;
1356         }
1357         cuda_event_ipc_num_used++;
1358 
1359         result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
1360         if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
1361             opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1362                            true, result);
1363             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1364             return OPAL_ERROR;
1365         }
1366 
1367         iter = 0;
1368         while (CUDA_ERROR_NOT_READY == result) {
1369             if (0 == (iter % 10)) {
1370                 opal_output(-1, "EVENT NOT DONE (iter=%d)", iter);
1371             }
1372             result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
1373             if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
1374                 opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1375                                true, result);
1376             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1377                 return OPAL_ERROR;
1378             }
1379             iter++;
1380         }
1381 
1382         --cuda_event_ipc_num_used;
1383         ++cuda_event_ipc_first_used;
1384         if (cuda_event_ipc_first_used >= cuda_event_max) {
1385             cuda_event_ipc_first_used = 0;
1386         }
1387         *done = 1;
1388     }
1389     OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1390     return OPAL_SUCCESS;
1391 }
1392 
1393 /*
1394  * Record an event and save the frag.  This is called by the sending side and
1395  * is used to queue an event when a htod copy has been initiated.
1396  */
1397 int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_t *frag)
1398 {
1399     CUresult result;
1400 
1401     /* First make sure there is room to store the event.  If not, then
1402      * return an error.  The error message will tell the user to try and
1403      * run again, but with a larger array for storing events. */
1404     OPAL_THREAD_LOCK(&common_cuda_dtoh_lock);
1405     if (cuda_event_dtoh_num_used == cuda_event_max) {
1406         opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
1407                        true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
1408         return OPAL_ERR_OUT_OF_RESOURCE;
1409     }
1410 
1411     if (cuda_event_dtoh_num_used > cuda_event_dtoh_most) {
1412         cuda_event_dtoh_most = cuda_event_dtoh_num_used;
1413         /* Just print multiples of 10 */
1414         if (0 == (cuda_event_dtoh_most % 10)) {
1415             opal_output_verbose(20, mca_common_cuda_output,
1416                                 "Maximum DtoH events used is now %d", cuda_event_dtoh_most);
1417         }
1418     }
1419 
1420     result = cuFunc.cuEventRecord(cuda_event_dtoh_array[cuda_event_dtoh_first_avail], dtohStream);
1421     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1422         opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1423                        true, OPAL_PROC_MY_HOSTNAME, result);
1424         OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1425         return OPAL_ERROR;
1426     }
1427     cuda_event_dtoh_frag_array[cuda_event_dtoh_first_avail] = frag;
1428 
1429     /* Bump up the first available slot and number used by 1 */
1430     cuda_event_dtoh_first_avail++;
1431     if (cuda_event_dtoh_first_avail >= cuda_event_max) {
1432         cuda_event_dtoh_first_avail = 0;
1433     }
1434     cuda_event_dtoh_num_used++;
1435 
1436     OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1437     return OPAL_SUCCESS;
1438 }
1439 
1440 /*
1441  * Record an event and save the frag.  This is called by the receiving side and
1442  * is used to queue an event when a dtoh copy has been initiated.
1443  */
1444 int mca_common_cuda_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag)
1445 {
1446     CUresult result;
1447 
1448     OPAL_THREAD_LOCK(&common_cuda_htod_lock);
1449     /* First make sure there is room to store the event.  If not, then
1450      * return an error.  The error message will tell the user to try and
1451      * run again, but with a larger array for storing events. */
1452     if (cuda_event_htod_num_used == cuda_event_max) {
1453         opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
1454                        true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
1455         OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1456         return OPAL_ERR_OUT_OF_RESOURCE;
1457     }
1458 
1459     if (cuda_event_htod_num_used > cuda_event_htod_most) {
1460         cuda_event_htod_most = cuda_event_htod_num_used;
1461         /* Just print multiples of 10 */
1462         if (0 == (cuda_event_htod_most % 10)) {
1463             opal_output_verbose(20, mca_common_cuda_output,
1464                                 "Maximum HtoD events used is now %d", cuda_event_htod_most);
1465         }
1466     }
1467 
1468     result = cuFunc.cuEventRecord(cuda_event_htod_array[cuda_event_htod_first_avail], htodStream);
1469     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1470         opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1471                        true, OPAL_PROC_MY_HOSTNAME, result);
1472         OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1473         return OPAL_ERROR;
1474     }
1475     cuda_event_htod_frag_array[cuda_event_htod_first_avail] = frag;
1476 
1477    /* Bump up the first available slot and number used by 1 */
1478     cuda_event_htod_first_avail++;
1479     if (cuda_event_htod_first_avail >= cuda_event_max) {
1480         cuda_event_htod_first_avail = 0;
1481     }
1482     cuda_event_htod_num_used++;
1483 
1484     OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1485     return OPAL_SUCCESS;
1486 }
1487 
1488 /**
1489  * Used to get the dtoh stream for initiating asynchronous copies.
1490  */
1491 void *mca_common_cuda_get_dtoh_stream(void) {
1492     return (void *)dtohStream;
1493 }
1494 
1495 /**
1496  * Used to get the htod stream for initiating asynchronous copies.
1497  */
1498 void *mca_common_cuda_get_htod_stream(void) {
1499     return (void *)htodStream;
1500 }
1501 
1502 /*
1503  * Function is called every time progress is called with the sm BTL.  If there
1504  * are outstanding events, check to see if one has completed.  If so, hand
1505  * back the fragment for further processing.
1506  */
1507 int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t **frag) {
1508     CUresult result;
1509 
1510     OPAL_THREAD_LOCK(&common_cuda_ipc_lock);
1511     if (cuda_event_ipc_num_used > 0) {
1512         opal_output_verbose(20, mca_common_cuda_output,
1513                            "CUDA: progress_one_cuda_ipc_event, outstanding_events=%d",
1514                             cuda_event_ipc_num_used);
1515 
1516         result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
1517 
1518         /* We found an event that is not ready, so return. */
1519         if (CUDA_ERROR_NOT_READY == result) {
1520             opal_output_verbose(20, mca_common_cuda_output,
1521                                 "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
1522             *frag = NULL;
1523             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1524             return 0;
1525         } else if (CUDA_SUCCESS != result) {
1526             opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1527                            true, result);
1528             *frag = NULL;
1529             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1530             return OPAL_ERROR;
1531         }
1532 
1533         *frag = cuda_event_ipc_frag_array[cuda_event_ipc_first_used];
1534         opal_output_verbose(10, mca_common_cuda_output,
1535                             "CUDA: cuEventQuery returned %d", result);
1536 
1537         /* Bump counters, loop around the circular buffer if necessary */
1538         --cuda_event_ipc_num_used;
1539         ++cuda_event_ipc_first_used;
1540         if (cuda_event_ipc_first_used >= cuda_event_max) {
1541             cuda_event_ipc_first_used = 0;
1542         }
1543         /* A return value of 1 indicates an event completed and a frag was returned */
1544         OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1545         return 1;
1546     }
1547     OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1548     return 0;
1549 }
1550 
1551 /**
1552  * Progress any dtoh event completions.
1553  */
1554 int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t **frag) {
1555     CUresult result;
1556 
1557     OPAL_THREAD_LOCK(&common_cuda_dtoh_lock);
1558     if (cuda_event_dtoh_num_used > 0) {
1559         opal_output_verbose(30, mca_common_cuda_output,
1560                            "CUDA: progress_one_cuda_dtoh_event, outstanding_events=%d",
1561                             cuda_event_dtoh_num_used);
1562 
1563         result = cuFunc.cuEventQuery(cuda_event_dtoh_array[cuda_event_dtoh_first_used]);
1564 
1565         /* We found an event that is not ready, so return. */
1566         if (CUDA_ERROR_NOT_READY == result) {
1567             opal_output_verbose(30, mca_common_cuda_output,
1568                                 "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
1569             *frag = NULL;
1570             OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1571             return 0;
1572         } else if (CUDA_SUCCESS != result) {
1573             opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1574                            true, result);
1575             *frag = NULL;
1576             OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1577             return OPAL_ERROR;
1578         }
1579 
1580         *frag = cuda_event_dtoh_frag_array[cuda_event_dtoh_first_used];
1581         opal_output_verbose(30, mca_common_cuda_output,
1582                             "CUDA: cuEventQuery returned %d", result);
1583 
1584         /* Bump counters, loop around the circular buffer if necessary */
1585         --cuda_event_dtoh_num_used;
1586         ++cuda_event_dtoh_first_used;
1587         if (cuda_event_dtoh_first_used >= cuda_event_max) {
1588             cuda_event_dtoh_first_used = 0;
1589         }
1590         /* A return value of 1 indicates an event completed and a frag was returned */
1591         OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1592         return 1;
1593     }
1594     OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1595     return 0;
1596 }
1597 
1598 /**
1599  * Progress any dtoh event completions.
1600  */
1601 int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) {
1602     CUresult result;
1603 
1604     OPAL_THREAD_LOCK(&common_cuda_htod_lock);
1605     if (cuda_event_htod_num_used > 0) {
1606         opal_output_verbose(30, mca_common_cuda_output,
1607                            "CUDA: progress_one_cuda_htod_event, outstanding_events=%d",
1608                             cuda_event_htod_num_used);
1609 
1610         result = cuFunc.cuEventQuery(cuda_event_htod_array[cuda_event_htod_first_used]);
1611 
1612         /* We found an event that is not ready, so return. */
1613         if (CUDA_ERROR_NOT_READY == result) {
1614             opal_output_verbose(30, mca_common_cuda_output,
1615                                 "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
1616             *frag = NULL;
1617             OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1618             return 0;
1619         } else if (CUDA_SUCCESS != result) {
1620             opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1621                            true, result);
1622             *frag = NULL;
1623             OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1624             return OPAL_ERROR;
1625         }
1626 
1627         *frag = cuda_event_htod_frag_array[cuda_event_htod_first_used];
1628         opal_output_verbose(30, mca_common_cuda_output,
1629                             "CUDA: cuEventQuery returned %d", result);
1630 
1631         /* Bump counters, loop around the circular buffer if necessary */
1632         --cuda_event_htod_num_used;
1633         ++cuda_event_htod_first_used;
1634         if (cuda_event_htod_first_used >= cuda_event_max) {
1635             cuda_event_htod_first_used = 0;
1636         }
1637         /* A return value of 1 indicates an event completed and a frag was returned */
1638         OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1639         return 1;
1640     }
1641     OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1642     return OPAL_ERR_RESOURCE_BUSY;
1643 }
1644 
1645 
1646 /**
1647  * Need to make sure the handle we are retrieving from the cache is still
1648  * valid.  Compare the cached handle to the one received.
1649  */
1650 int mca_common_cuda_memhandle_matches(mca_rcache_common_cuda_reg_t *new_reg,
1651                                       mca_rcache_common_cuda_reg_t *old_reg)
1652 {
1653 
1654     if (0 == memcmp(new_reg->data.memHandle, old_reg->data.memHandle, sizeof(new_reg->data.memHandle))) {
1655         return 1;
1656     } else {
1657         return 0;
1658     }
1659 
1660 }
1661 
1662 /*
1663  * Function to dump memory handle information.  This is based on
1664  * definitions from cuiinterprocess_private.h.
1665  */
1666 static void cuda_dump_memhandle(int verbose, void *memHandle, char *str) {
1667 
1668     struct InterprocessMemHandleInternal
1669     {
1670         /* The first two entries are the CUinterprocessCtxHandle */
1671         int64_t ctxId; /* unique (within a process) id of the sharing context */
1672         int     pid;   /* pid of sharing context */
1673 
1674         int64_t size;
1675         int64_t blocksize;
1676         int64_t offset;
1677         int     gpuId;
1678         int     subDeviceIndex;
1679         int64_t serial;
1680     } memH;
1681 
1682     if (NULL == str) {
1683         str = "CUDA";
1684     }
1685     memcpy(&memH, memHandle, sizeof(memH));
1686     opal_output_verbose(verbose, mca_common_cuda_output,
1687                         "%s:ctxId=0x%" PRIx64 ", pid=%d, size=%" PRIu64 ", blocksize=%" PRIu64 ", offset=%"
1688                         PRIu64 ", gpuId=%d, subDeviceIndex=%d, serial=%" PRIu64,
1689                         str, memH.ctxId, memH.pid, memH.size, memH.blocksize, memH.offset,
1690                         memH.gpuId, memH.subDeviceIndex, memH.serial);
1691 }
1692 
1693 /*
1694  * Function to dump memory handle information.  This is based on
1695  * definitions from cuiinterprocess_private.h.
1696  */
1697 static void cuda_dump_evthandle(int verbose, void *evtHandle, char *str) {
1698 
1699     struct InterprocessEventHandleInternal
1700     {
1701         unsigned long pid;
1702         unsigned long serial;
1703         int index;
1704     } evtH;
1705 
1706     if (NULL == str) {
1707         str = "CUDA";
1708     }
1709     memcpy(&evtH, evtHandle, sizeof(evtH));
1710     opal_output_verbose(verbose, mca_common_cuda_output,
1711                         "CUDA: %s:pid=%lu, serial=%lu, index=%d",
1712                         str, evtH.pid, evtH.serial, evtH.index);
1713 }
1714 
1715 
/* Return the elapsed time between two timestamps, i.e. ts_end - ts_start,
 * as a float.  With timestamps taken from opal_timer_base_get_usec(), as
 * done in mca_common_cuda_cu_memcpy(), the result is printed as
 * microseconds.  Microseconds are relevant when trying to understand the
 * fixed overhead of the communication.  Used when trying to time various
 * functions.
 *
 * Cut and paste the following to get timings where wanted.
 *
 *   ts_start = opal_timer_base_get_usec();
 *   FUNCTION OF INTEREST
 *   ts_end = opal_timer_base_get_usec();
 *   accum = mydifftime(ts_start, ts_end);
 *   opal_output(0, "Function took   %7.2f usecs\n", accum);
 *
 */
#if OPAL_ENABLE_DEBUG
static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end) {
    float elapsed = ts_end - ts_start;

    return elapsed;
}
#endif /* OPAL_ENABLE_DEBUG */
1734 
1735 /* Routines that get plugged into the opal datatype code */
1736 static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t *convertor)
1737 {
1738     int res;
1739     CUmemorytype memType = 0;
1740     CUdeviceptr dbuf = (CUdeviceptr)pUserBuf;
1741     CUcontext ctx = NULL, memCtx = NULL;
1742 #if OPAL_CUDA_GET_ATTRIBUTES
1743     uint32_t isManaged = 0;
1744     /* With CUDA 7.0, we can get multiple attributes with a single call */
1745     CUpointer_attribute attributes[3] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
1746                                          CU_POINTER_ATTRIBUTE_CONTEXT,
1747                                          CU_POINTER_ATTRIBUTE_IS_MANAGED};
1748     void *attrdata[] = {(void *)&memType, (void *)&memCtx, (void *)&isManaged};
1749 
1750     res = cuFunc.cuPointerGetAttributes(3, attributes, attrdata, dbuf);
1751     OPAL_OUTPUT_VERBOSE((101, mca_common_cuda_output,
1752                         "dbuf=%p, memType=%d, memCtx=%p, isManaged=%d, res=%d",
1753                          (void *)dbuf, (int)memType, (void *)memCtx, isManaged, res));
1754 
1755     /* Mark unified memory buffers with a flag.  This will allow all unified
1756      * memory to be forced through host buffers.  Note that this memory can
1757      * be either host or device so we need to set this flag prior to that check. */
1758     if (1 == isManaged) {
1759         if (NULL != convertor) {
1760             convertor->flags |= CONVERTOR_CUDA_UNIFIED;
1761         }
1762     }
1763     if (res != CUDA_SUCCESS) {
1764         /* If we cannot determine it is device pointer,
1765          * just assume it is not. */
1766         return 0;
1767     } else if (memType == CU_MEMORYTYPE_HOST) {
1768         /* Host memory, nothing to do here */
1769         return 0;
1770     } else if (memType == 0) {
1771         /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
1772         return 0;
1773     }
1774     /* Must be a device pointer */
1775     assert(memType == CU_MEMORYTYPE_DEVICE);
1776 #else /* OPAL_CUDA_GET_ATTRIBUTES */
1777     res = cuFunc.cuPointerGetAttribute(&memType,
1778                                        CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf);
1779     if (res != CUDA_SUCCESS) {
1780         /* If we cannot determine it is device pointer,
1781          * just assume it is not. */
1782         return 0;
1783     } else if (memType == CU_MEMORYTYPE_HOST) {
1784         /* Host memory, nothing to do here */
1785         return 0;
1786     }
1787     /* Must be a device pointer */
1788     assert(memType == CU_MEMORYTYPE_DEVICE);
1789 #endif /* OPAL_CUDA_GET_ATTRIBUTES */
1790 
1791     /* This piece of code was added in to handle in a case involving
1792      * OMP threads.  The user had initialized CUDA and then spawned
1793      * two threads.  The first thread had the CUDA context, but the
1794      * second thread did not.  We therefore had no context to act upon
1795      * and future CUDA driver calls would fail.  Therefore, if we have
1796      * GPU memory, but no context, get the context from the GPU memory
1797      * and set the current context to that.  It is rare that we will not
1798      * have a context. */
1799     res = cuFunc.cuCtxGetCurrent(&ctx);
1800     if (OPAL_UNLIKELY(NULL == ctx)) {
1801         if (CUDA_SUCCESS == res) {
1802 #if !OPAL_CUDA_GET_ATTRIBUTES
1803             res = cuFunc.cuPointerGetAttribute(&memCtx,
1804                                                CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
1805             if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
1806                 opal_output(0, "CUDA: error calling cuPointerGetAttribute: "
1807                             "res=%d, ptr=%p aborting...", res, pUserBuf);
1808                 return OPAL_ERROR;
1809             }
1810 #endif /* OPAL_CUDA_GET_ATTRIBUTES */
1811             res = cuFunc.cuCtxSetCurrent(memCtx);
1812             if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
1813                 opal_output(0, "CUDA: error calling cuCtxSetCurrent: "
1814                             "res=%d, ptr=%p aborting...", res, pUserBuf);
1815                 return OPAL_ERROR;
1816             } else {
1817                 OPAL_OUTPUT_VERBOSE((10, mca_common_cuda_output,
1818                                      "CUDA: cuCtxSetCurrent passed: ptr=%p", pUserBuf));
1819             }
1820         } else {
1821             /* Print error and proceed */
1822             opal_output(0, "CUDA: error calling cuCtxGetCurrent: "
1823                         "res=%d, ptr=%p aborting...", res, pUserBuf);
1824             return OPAL_ERROR;
1825         }
1826     }
1827 
1828     /* WORKAROUND - They are times when the above code determines a pice of memory
1829      * is GPU memory, but it actually is not.  That has been seen on multi-GPU systems
1830      * with 6 or 8 GPUs on them. Therefore, we will do this extra check.  Note if we
1831      * made it this far, then the assumption at this point is we have GPU memory.
1832      * Unfotunately, this extra call is costing us another 100 ns almost doubling
1833      * the cost of this entire function. */
1834     if (OPAL_LIKELY(mca_common_cuda_gpu_mem_check_workaround)) {
1835         CUdeviceptr pbase;
1836         size_t psize;
1837         res = cuFunc.cuMemGetAddressRange(&pbase, &psize, dbuf);
1838         if (CUDA_SUCCESS != res) {
1839             opal_output_verbose(5, mca_common_cuda_output,
1840                                 "CUDA: cuMemGetAddressRange failed on this pointer: res=%d, buf=%p "
1841                                 "Overriding check and setting to host pointer. ",
1842                               res, (void *)dbuf);
1843             /* This cannot be GPU memory if the previous call failed */
1844             return 0;
1845         }
1846     }
1847 
1848     /* First access on a device pointer finalizes CUDA support initialization.
1849      * If initialization fails, disable support. */
1850     if (!stage_three_init_complete) {
1851         if (0 != mca_common_cuda_stage_three_init()) {
1852             opal_cuda_support = 0;
1853         }
1854     }
1855 
1856     return 1;
1857 }
1858 
1859 static int mca_common_cuda_cu_memcpy_async(void *dest, const void *src, size_t size,
1860                                          opal_convertor_t* convertor)
1861 {
1862     return cuFunc.cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size,
1863                                 (CUstream)convertor->stream);
1864 }
1865 
1866 /**
1867  * This function is plugged into various areas where a cuMemcpy would be called.
1868  * This is a synchronous operation that will not return until the copy is complete.
1869  */
1870 static int mca_common_cuda_cu_memcpy(void *dest, const void *src, size_t size)
1871 {
1872     CUresult result;
1873 #if OPAL_ENABLE_DEBUG
1874     CUmemorytype memTypeSrc, memTypeDst;
1875     if (OPAL_UNLIKELY(mca_common_cuda_cumemcpy_timing)) {
1876         /* Nice to know type of source and destination for timing output. Do
1877          * not care about return code as memory type will just be set to 0 */
1878         result = cuFunc.cuPointerGetAttribute(&memTypeDst,
1879                                               CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)dest);
1880         result = cuFunc.cuPointerGetAttribute(&memTypeSrc,
1881                                               CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)src);
1882         ts_start = opal_timer_base_get_usec();
1883     }
1884 #endif
1885     if (mca_common_cuda_cumemcpy_async) {
1886         result = cuFunc.cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size, memcpyStream);
1887         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1888             opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1889                            true, dest, src, size, result);
1890             return OPAL_ERROR;
1891         }
1892         result = cuFunc.cuStreamSynchronize(memcpyStream);
1893         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1894             opal_show_help("help-mpi-common-cuda.txt", "cuStreamSynchronize failed",
1895                            true, OPAL_PROC_MY_HOSTNAME, result);
1896             return OPAL_ERROR;
1897         }
1898     } else {
1899          result = cuFunc.cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
1900          if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1901              opal_show_help("help-mpi-common-cuda.txt", "cuMemcpy failed",
1902                             true, OPAL_PROC_MY_HOSTNAME, result);
1903              return OPAL_ERROR;
1904          }
1905     }
1906 #if OPAL_ENABLE_DEBUG
1907     if (OPAL_UNLIKELY(mca_common_cuda_cumemcpy_timing)) {
1908         ts_end = opal_timer_base_get_usec();
1909         accum = mydifftime(ts_start, ts_end);
1910         if (mca_common_cuda_cumemcpy_async) {
1911             opal_output(0, "cuMemcpyAsync took   %7.2f usecs, size=%d, (src=%p (%d), dst=%p (%d))\n",
1912                         accum, (int)size, src, memTypeSrc, dest, memTypeDst);
1913         } else {
1914             opal_output(0, "cuMemcpy took   %7.2f usecs, size=%d,  (src=%p (%d), dst=%p (%d))\n",
1915                         accum, (int)size, src, memTypeSrc, dest, memTypeDst);
1916         }
1917     }
1918 #endif
1919     return OPAL_SUCCESS;
1920 }
1921 
1922 static int mca_common_cuda_memmove(void *dest, void *src, size_t size)
1923 {
1924     CUdeviceptr tmp;
1925     int result;
1926 
1927     result = cuFunc.cuMemAlloc(&tmp,size);
1928     if (mca_common_cuda_cumemcpy_async) {
1929         result = cuFunc.cuMemcpyAsync(tmp, (CUdeviceptr)src, size, memcpyStream);
1930         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1931             opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1932                            true, tmp, src, size, result);
1933             return OPAL_ERROR;
1934         }
1935         result = cuFunc.cuMemcpyAsync((CUdeviceptr)dest, tmp, size, memcpyStream);
1936         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1937             opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1938                            true, dest, tmp, size, result);
1939             return OPAL_ERROR;
1940         }
1941         result = cuFunc.cuStreamSynchronize(memcpyStream);
1942         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1943             opal_show_help("help-mpi-common-cuda.txt", "cuStreamSynchronize failed",
1944                            true, OPAL_PROC_MY_HOSTNAME, result);
1945             return OPAL_ERROR;
1946         }
1947     } else {
1948         result = cuFunc.cuMemcpy(tmp, (CUdeviceptr)src, size);
1949         if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
1950             opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
1951                         result, (void *)tmp, src, (int)size);
1952             return OPAL_ERROR;
1953         }
1954         result = cuFunc.cuMemcpy((CUdeviceptr)dest, tmp, size);
1955         if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
1956             opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
1957                         result, dest, (void *)tmp, (int)size);
1958             return OPAL_ERROR;
1959         }
1960     }
1961     cuFunc.cuMemFree(tmp);
1962     return OPAL_SUCCESS;
1963 }
1964 
1965 int mca_common_cuda_get_device(int *devicenum)
1966 {
1967     CUdevice cuDev;
1968     int res;
1969 
1970     res = cuFunc.cuCtxGetDevice(&cuDev);
1971     if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
1972         opal_output(0, "CUDA: cuCtxGetDevice failed: res=%d",
1973                     res);
1974         return res;
1975     }
1976     *devicenum = cuDev;
1977     return 0;
1978 }
1979 
1980 int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2)
1981 {
1982     int res;
1983     res = cuFunc.cuDeviceCanAccessPeer(access, (CUdevice)dev1, (CUdevice)dev2);
1984     if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
1985         opal_output(0, "CUDA: cuDeviceCanAccessPeer failed: res=%d",
1986                     res);
1987         return res;
1988     }
1989     return 0;
1990 }
1991 
1992 int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base)
1993 {
1994     CUresult result;
1995     result = cuFunc.cuMemGetAddressRange((CUdeviceptr *)pbase, psize, (CUdeviceptr)base);
1996     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1997         opal_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed 2",
1998                        true, OPAL_PROC_MY_HOSTNAME, result, base);
1999         return OPAL_ERROR;
2000     } else {
2001         opal_output_verbose(50, mca_common_cuda_output,
2002                             "CUDA: cuMemGetAddressRange passed: addr=%p, pbase=%p, psize=%lu ",
2003                             base, *(char **)pbase, *psize);
2004     }
2005     return 0;
2006 }
2007 
2008 #if OPAL_CUDA_GDR_SUPPORT
2009 /* Check to see if the memory was freed between the time it was stored in
2010  * the registration cache and now.  Return true if the memory was previously
2011  * freed.  This is indicated by the BUFFER_ID value in the registration cache
2012  * not matching the BUFFER_ID of the buffer we are checking.  Return false
2013  * if the registration is still good.
2014  */
2015 bool mca_common_cuda_previously_freed_memory(mca_rcache_base_registration_t *reg)
2016 {
2017     int res;
2018     unsigned long long bufID;
2019     unsigned char *dbuf = reg->base;
2020 
2021     res = cuFunc.cuPointerGetAttribute(&bufID, CU_POINTER_ATTRIBUTE_BUFFER_ID,
2022                                        (CUdeviceptr)dbuf);
2023     /* If we cannot determine the BUFFER_ID, then print a message and default
2024      * to forcing the registration to be kicked out. */
2025     if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
2026         opal_show_help("help-mpi-common-cuda.txt", "bufferID failed",
2027                        true, OPAL_PROC_MY_HOSTNAME, res);
2028         return true;
2029     }
2030     opal_output_verbose(50, mca_common_cuda_output,
2031                         "CUDA: base=%p, bufID=%llu, reg->gpu_bufID=%llu, %s", dbuf, bufID, reg->gpu_bufID,
2032                         (reg->gpu_bufID == bufID ? "BUFFER_ID match":"BUFFER_ID do not match"));
2033     if (bufID != reg->gpu_bufID) {
2034         return true;
2035     } else {
2036         return false;
2037     }
2038 }
2039 
2040 /*
2041  * Get the buffer ID from the memory and store it in the registration.
2042  * This is needed to ensure the cached registration is not stale.  If
2043  * we fail to get buffer ID, print an error and set buffer ID to 0.
2044  * Also set SYNC_MEMOPS on any GPU registration to ensure that
2045  * synchronous copies complete before the buffer is accessed.
2046  */
2047 void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg)
2048 {
2049     int res;
2050     unsigned long long bufID = 0;
2051     unsigned char *dbuf = reg->base;
2052     int enable = 1;
2053 
2054     res = cuFunc.cuPointerGetAttribute(&bufID, CU_POINTER_ATTRIBUTE_BUFFER_ID,
2055                                        (CUdeviceptr)dbuf);
2056     if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
2057         opal_show_help("help-mpi-common-cuda.txt", "bufferID failed",
2058                        true, OPAL_PROC_MY_HOSTNAME, res);
2059     }
2060     reg->gpu_bufID = bufID;
2061 
2062     res = cuFunc.cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
2063                                        (CUdeviceptr)dbuf);
2064     if (OPAL_UNLIKELY(CUDA_SUCCESS != res)) {
2065         opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed",
2066                        true, OPAL_PROC_MY_HOSTNAME, res, dbuf);
2067     }
2068 }
2069 #endif /* OPAL_CUDA_GDR_SUPPORT */
/* [<][>][^][v][top][bottom][index][help] */
root/opal/mca/common/cuda/common_cuda.c

DEFINITIONS