This source file includes the following definitions.
- mca_common_cuda_register_mca_variables
- mca_common_cuda_stage_one_init
- mca_common_cuda_stage_two_init
- mca_common_cuda_stage_three_init
- mca_common_cuda_fini
- mca_common_cuda_register
- mca_common_cuda_unregister
- cuda_getmemhandle
- cuda_ungetmemhandle
- cuda_openmemhandle
- cuda_closememhandle
- mca_common_cuda_construct_event_and_handle
- mca_common_cuda_destruct_event
- mca_common_wait_stream_synchronize
- mca_common_cuda_memcpy
- mca_common_cuda_record_dtoh_event
- mca_common_cuda_record_htod_event
- mca_common_cuda_get_dtoh_stream
- mca_common_cuda_get_htod_stream
- progress_one_cuda_ipc_event
- progress_one_cuda_dtoh_event
- progress_one_cuda_htod_event
- mca_common_cuda_memhandle_matches
- cuda_dump_memhandle
- cuda_dump_evthandle
- mydifftime
- mca_common_cuda_is_gpu_buffer
- mca_common_cuda_cu_memcpy_async
- mca_common_cuda_cu_memcpy
- mca_common_cuda_memmove
- mca_common_cuda_get_device
- mca_common_cuda_device_can_access_peer
- mca_common_cuda_get_address_range
- mca_common_cuda_previously_freed_memory
- mca_common_cuda_get_buffer_id
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28 #include "opal_config.h"
29
30 #include <errno.h>
31 #include <unistd.h>
32 #include <cuda.h>
33
34 #include "opal/align.h"
35 #include "opal/datatype/opal_convertor.h"
36 #include "opal/datatype/opal_datatype_cuda.h"
37 #include "opal/util/output.h"
38 #include "opal/util/show_help.h"
39 #include "opal/util/proc.h"
40 #include "opal/util/argv.h"
41 #include "opal/util/printf.h"
42
43 #include "opal/mca/rcache/base/base.h"
44 #include "opal/runtime/opal_params.h"
45 #include "opal/mca/timer/base/base.h"
46 #include "opal/mca/dl/base/base.h"
47
48 #include "common_cuda.h"
49
50
51
52
53
54
55
56
/* Two-level stringification: the argument is macro-expanded by STRINGIFY
 * before STRINGIFY2 turns it into a string literal. */
#define STRINGIFY2(x) #x
#define STRINGIFY(x) STRINGIFY2(x)

/*
 * Look up funcName in libhandle via opal_dl_lookup() and store the address in
 * the cuFunc member of the same name.
 *
 * NOTE: on lookup failure this shows the "dlsym failed" help message and makes
 * the ENCLOSING function return 1, so it may only be used inside functions
 * that return int.
 */
#define OPAL_CUDA_DLSYM(libhandle, funcName)                                     \
    do {                                                                         \
        char *dlsym_errmsg_;                                                     \
        void *dlsym_addr_;                                                       \
        if (OPAL_SUCCESS != opal_dl_lookup(libhandle, STRINGIFY(funcName),       \
                                           &dlsym_addr_, &dlsym_errmsg_)) {      \
            opal_show_help("help-mpi-common-cuda.txt", "dlsym failed", true,     \
                           STRINGIFY(funcName), dlsym_errmsg_);                  \
            return 1;                                                            \
        }                                                                        \
        /* Store through void** so any function-pointer member can be set. */    \
        *(void **)(&cuFunc.funcName) = dlsym_addr_;                              \
        opal_output_verbose(15, mca_common_cuda_output,                          \
                            "CUDA: successful dlsym of %s",                      \
                            STRINGIFY(funcName));                                \
    } while (0)
76
77
78 struct cudaFunctionTable {
79 int (*cuPointerGetAttribute)(void *, CUpointer_attribute, CUdeviceptr);
80 int (*cuMemcpyAsync)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
81 int (*cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t);
82 int (*cuMemAlloc)(CUdeviceptr *, unsigned int);
83 int (*cuMemFree)(CUdeviceptr buf);
84 int (*cuCtxGetCurrent)(void *cuContext);
85 int (*cuStreamCreate)(CUstream *, int);
86 int (*cuEventCreate)(CUevent *, int);
87 int (*cuEventRecord)(CUevent, CUstream);
88 int (*cuMemHostRegister)(void *, size_t, unsigned int);
89 int (*cuMemHostUnregister)(void *);
90 int (*cuEventQuery)(CUevent);
91 int (*cuEventDestroy)(CUevent);
92 int (*cuStreamWaitEvent)(CUstream, CUevent, unsigned int);
93 int (*cuMemGetAddressRange)(CUdeviceptr*, size_t*, CUdeviceptr);
94 int (*cuIpcGetEventHandle)(CUipcEventHandle*, CUevent);
95 int (*cuIpcOpenEventHandle)(CUevent*, CUipcEventHandle);
96 int (*cuIpcOpenMemHandle)(CUdeviceptr*, CUipcMemHandle, unsigned int);
97 int (*cuIpcCloseMemHandle)(CUdeviceptr);
98 int (*cuIpcGetMemHandle)(CUipcMemHandle*, CUdeviceptr);
99 int (*cuCtxGetDevice)(CUdevice *);
100 int (*cuDeviceCanAccessPeer)(int *, CUdevice, CUdevice);
101 int (*cuDeviceGet)(CUdevice *, int);
102 #if OPAL_CUDA_GDR_SUPPORT
103 int (*cuPointerSetAttribute)(const void *, CUpointer_attribute, CUdeviceptr);
104 #endif
105 int (*cuCtxSetCurrent)(CUcontext);
106 int (*cuEventSynchronize)(CUevent);
107 int (*cuStreamSynchronize)(CUstream);
108 int (*cuStreamDestroy)(CUstream);
109 #if OPAL_CUDA_GET_ATTRIBUTES
110 int (*cuPointerGetAttributes)(unsigned int, CUpointer_attribute *, void **, CUdeviceptr);
111 #endif
112 };
113 typedef struct cudaFunctionTable cudaFunctionTable_t;
114 static cudaFunctionTable_t cuFunc;
115
116 static int stage_one_init_ref_count = 0;
117 static bool stage_three_init_complete = false;
118 static bool common_cuda_initialized = false;
119 static bool common_cuda_mca_parames_registered = false;
120 static int mca_common_cuda_verbose;
121 static int mca_common_cuda_output = 0;
122 bool mca_common_cuda_enabled = false;
123 static bool mca_common_cuda_register_memory = true;
124 static bool mca_common_cuda_warning = false;
125 static opal_list_t common_cuda_memory_registrations;
126 static CUstream ipcStream = NULL;
127 static CUstream dtohStream = NULL;
128 static CUstream htodStream = NULL;
129 static CUstream memcpyStream = NULL;
130 static int mca_common_cuda_gpu_mem_check_workaround = (CUDA_VERSION > 7000) ? 0 : 1;
131 static opal_mutex_t common_cuda_init_lock;
132 static opal_mutex_t common_cuda_htod_lock;
133 static opal_mutex_t common_cuda_dtoh_lock;
134 static opal_mutex_t common_cuda_ipc_lock;
135
136
137 static int mca_common_cuda_is_gpu_buffer(const void*, opal_convertor_t*);
138 static int mca_common_cuda_memmove(void*, void*, size_t);
139 static int mca_common_cuda_cu_memcpy_async(void*, const void*, size_t, opal_convertor_t*);
140 static int mca_common_cuda_cu_memcpy(void*, const void*, size_t);
141
142
143 static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *);
144
145
146
147 struct common_cuda_mem_regs_t {
148 opal_list_item_t super;
149 void *ptr;
150 size_t amount;
151 char *msg;
152 };
153 typedef struct common_cuda_mem_regs_t common_cuda_mem_regs_t;
154 OBJ_CLASS_DECLARATION(common_cuda_mem_regs_t);
155 OBJ_CLASS_INSTANCE(common_cuda_mem_regs_t,
156 opal_list_item_t,
157 NULL,
158 NULL);
159
160 static int mca_common_cuda_async = 1;
161 static int mca_common_cuda_cumemcpy_async;
162 #if OPAL_ENABLE_DEBUG
163 static int mca_common_cuda_cumemcpy_timing;
164 #endif
165
166
167
168 CUevent *cuda_event_ipc_array = NULL;
169 CUevent *cuda_event_dtoh_array = NULL;
170 CUevent *cuda_event_htod_array = NULL;
171
172
173
174 struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array = NULL;
175 struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array = NULL;
176 struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array = NULL;
177
178
179 static int cuda_event_ipc_first_avail, cuda_event_dtoh_first_avail, cuda_event_htod_first_avail;
180
181
182 static int cuda_event_ipc_first_used, cuda_event_dtoh_first_used, cuda_event_htod_first_used;
183
184
185 static int cuda_event_ipc_num_used, cuda_event_dtoh_num_used, cuda_event_htod_num_used;
186
187
188 int cuda_event_max = 400;
189 static int cuda_event_ipc_most = 0;
190 static int cuda_event_dtoh_most = 0;
191 static int cuda_event_htod_most = 0;
192
193
194 opal_dl_handle_t *libcuda_handle = NULL;
195
196
197
198
199
200 static int checkmem;
201 static int ctx_ok = 1;
202
203 #define CUDA_COMMON_TIMING 0
204 #if OPAL_ENABLE_DEBUG
205
206
207 static opal_timer_t ts_start;
208 static opal_timer_t ts_end;
209 static double accum;
210 #define THOUSAND 1000L
211 #define MILLION 1000000L
212 static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end);
213 #endif
214
215
216 static void cuda_dump_evthandle(int, void *, char *) __opal_attribute_unused__ ;
217 static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
218 #if OPAL_ENABLE_DEBUG
219 #define CUDA_DUMP_MEMHANDLE(a) cuda_dump_memhandle a
220 #define CUDA_DUMP_EVTHANDLE(a) cuda_dump_evthandle a
221 #else
222 #define CUDA_DUMP_MEMHANDLE(a)
223 #define CUDA_DUMP_EVTHANDLE(a)
224 #endif
225
226
227
228 void mca_common_cuda_register_mca_variables(void)
229 {
230
231 if (false == common_cuda_mca_parames_registered) {
232 common_cuda_mca_parames_registered = true;
233 }
234
235 mca_common_cuda_verbose = 0;
236 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "verbose",
237 "Set level of common cuda verbosity",
238 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
239 OPAL_INFO_LVL_9,
240 MCA_BASE_VAR_SCOPE_READONLY,
241 &mca_common_cuda_verbose);
242
243
244
245 mca_common_cuda_register_memory = true;
246 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "register_memory",
247 "Whether to cuMemHostRegister preallocated BTL buffers",
248 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
249 OPAL_INFO_LVL_9,
250 MCA_BASE_VAR_SCOPE_READONLY,
251 &mca_common_cuda_register_memory);
252
253
254
255
256 mca_common_cuda_warning = true;
257 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "warning",
258 "Whether to print warnings when CUDA registration fails",
259 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
260 OPAL_INFO_LVL_9,
261 MCA_BASE_VAR_SCOPE_READONLY,
262 &mca_common_cuda_warning);
263
264
265 mca_common_cuda_async = 1;
266 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "memcpy_async",
267 "Set to 0 to force CUDA sync copy instead of async",
268 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
269 OPAL_INFO_LVL_9,
270 MCA_BASE_VAR_SCOPE_READONLY,
271 &mca_common_cuda_async);
272
273
274 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "event_max",
275 "Set number of oustanding CUDA events",
276 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
277 OPAL_INFO_LVL_9,
278 MCA_BASE_VAR_SCOPE_READONLY,
279 &cuda_event_max);
280
281
282 mca_common_cuda_cumemcpy_async = 1;
283 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_async",
284 "Set to 0 to force CUDA cuMemcpy instead of cuMemcpyAsync/cuStreamSynchronize",
285 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
286 OPAL_INFO_LVL_5,
287 MCA_BASE_VAR_SCOPE_READONLY,
288 &mca_common_cuda_cumemcpy_async);
289
290 #if OPAL_ENABLE_DEBUG
291
292 mca_common_cuda_cumemcpy_timing = 0;
293 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_timing",
294 "Set to 1 to dump timing of eager copies",
295 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
296 OPAL_INFO_LVL_5,
297 MCA_BASE_VAR_SCOPE_READONLY,
298 &mca_common_cuda_cumemcpy_timing);
299 #endif
300
301 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "gpu_mem_check_workaround",
302 "Set to 0 to disable GPU memory check workaround. A user would rarely have to do this.",
303 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
304 OPAL_INFO_LVL_9,
305 MCA_BASE_VAR_SCOPE_READONLY,
306 &mca_common_cuda_gpu_mem_check_workaround);
307 }
308
309
310
311
312
313
314
315
316
317
318
319 int mca_common_cuda_stage_one_init(void)
320 {
321 int retval, i, j;
322 char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL};
323 char *searchpaths[] = {"", "/usr/lib64", NULL};
324 char **errmsgs = NULL;
325 char *errmsg = NULL;
326 int errsize;
327 bool stage_one_init_passed = false;
328
329 stage_one_init_ref_count++;
330 if (stage_one_init_ref_count > 1) {
331 opal_output_verbose(10, mca_common_cuda_output,
332 "CUDA: stage_one_init_ref_count is now %d, no need to init",
333 stage_one_init_ref_count);
334 return OPAL_SUCCESS;
335 }
336
337
338 mca_common_cuda_register_mca_variables();
339
340 OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t);
341 OBJ_CONSTRUCT(&common_cuda_htod_lock, opal_mutex_t);
342 OBJ_CONSTRUCT(&common_cuda_dtoh_lock, opal_mutex_t);
343 OBJ_CONSTRUCT(&common_cuda_ipc_lock, opal_mutex_t);
344
345 mca_common_cuda_output = opal_output_open(NULL);
346 opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose);
347
348 opal_output_verbose(10, mca_common_cuda_output,
349 "CUDA: stage_one_init_ref_count is now %d, initializing",
350 stage_one_init_ref_count);
351
352
353
354
355 if (!opal_cuda_support) {
356 return 1;
357 }
358
359 if (!OPAL_HAVE_DL_SUPPORT) {
360 opal_show_help("help-mpi-common-cuda.txt", "dlopen disabled", true);
361 return 1;
362 }
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383 j = 0;
384 while (searchpaths[j] != NULL) {
385 i = 0;
386 while (cudalibs[i] != NULL) {
387 char *filename = NULL;
388 char *str = NULL;
389
390
391
392 if (strlen(searchpaths[j]) > 0) {
393 opal_asprintf(&filename, "%s/%s", searchpaths[j], cudalibs[i]);
394 } else {
395 filename = strdup(cudalibs[i]);
396 }
397 if (NULL == filename) {
398 opal_show_help("help-mpi-common-cuda.txt", "No memory",
399 true, OPAL_PROC_MY_HOSTNAME);
400 return 1;
401 }
402
403 retval = opal_dl_open(filename, false, false,
404 &libcuda_handle, &str);
405 if (OPAL_SUCCESS != retval || NULL == libcuda_handle) {
406 if (NULL != str) {
407 opal_argv_append(&errsize, &errmsgs, str);
408 } else {
409 opal_argv_append(&errsize, &errmsgs,
410 "opal_dl_open() returned NULL.");
411 }
412 opal_output_verbose(10, mca_common_cuda_output,
413 "CUDA: Library open error: %s",
414 errmsgs[errsize-1]);
415 } else {
416 opal_output_verbose(10, mca_common_cuda_output,
417 "CUDA: Library successfully opened %s",
418 cudalibs[i]);
419 stage_one_init_passed = true;
420 break;
421 }
422 i++;
423
424 free(filename);
425 }
426 if (true == stage_one_init_passed) {
427 break;
428 }
429 j++;
430 }
431
432 if (true != stage_one_init_passed) {
433 errmsg = opal_argv_join(errmsgs, '\n');
434 if (opal_warn_on_missing_libcuda) {
435 opal_show_help("help-mpi-common-cuda.txt", "dlopen failed", true,
436 errmsg);
437 }
438 opal_cuda_support = 0;
439 }
440 opal_argv_free(errmsgs);
441 free(errmsg);
442
443 if (true != stage_one_init_passed) {
444 return 1;
445 }
446 opal_cuda_add_initialization_function(&mca_common_cuda_stage_two_init);
447 OBJ_CONSTRUCT(&common_cuda_memory_registrations, opal_list_t);
448
449
450
451 OPAL_CUDA_DLSYM(libcuda_handle, cuStreamCreate);
452 OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetCurrent);
453 OPAL_CUDA_DLSYM(libcuda_handle, cuEventCreate);
454 OPAL_CUDA_DLSYM(libcuda_handle, cuEventRecord);
455 OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostRegister);
456 OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostUnregister);
457 OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttribute);
458 OPAL_CUDA_DLSYM(libcuda_handle, cuEventQuery);
459 OPAL_CUDA_DLSYM(libcuda_handle, cuEventDestroy);
460 OPAL_CUDA_DLSYM(libcuda_handle, cuStreamWaitEvent);
461 OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpyAsync);
462 OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpy);
463 OPAL_CUDA_DLSYM(libcuda_handle, cuMemFree);
464 OPAL_CUDA_DLSYM(libcuda_handle, cuMemAlloc);
465 OPAL_CUDA_DLSYM(libcuda_handle, cuMemGetAddressRange);
466 OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetEventHandle);
467 OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenEventHandle);
468 OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenMemHandle);
469 OPAL_CUDA_DLSYM(libcuda_handle, cuIpcCloseMemHandle);
470 OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetMemHandle);
471 OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetDevice);
472 OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceCanAccessPeer);
473 OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceGet);
474 #if OPAL_CUDA_GDR_SUPPORT
475 OPAL_CUDA_DLSYM(libcuda_handle, cuPointerSetAttribute);
476 #endif
477 OPAL_CUDA_DLSYM(libcuda_handle, cuCtxSetCurrent);
478 OPAL_CUDA_DLSYM(libcuda_handle, cuEventSynchronize);
479 OPAL_CUDA_DLSYM(libcuda_handle, cuStreamSynchronize);
480 OPAL_CUDA_DLSYM(libcuda_handle, cuStreamDestroy);
481 #if OPAL_CUDA_GET_ATTRIBUTES
482 OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttributes);
483 #endif
484 return 0;
485 }
486
487
488
489
490
491
492
493
494 static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *ftable)
495 {
496 if (OPAL_UNLIKELY(!opal_cuda_support)) {
497 return OPAL_ERROR;
498 }
499
500 ftable->gpu_is_gpu_buffer = &mca_common_cuda_is_gpu_buffer;
501 ftable->gpu_cu_memcpy_async = &mca_common_cuda_cu_memcpy_async;
502 ftable->gpu_cu_memcpy = &mca_common_cuda_cu_memcpy;
503 ftable->gpu_memmove = &mca_common_cuda_memmove;
504
505 opal_output_verbose(30, mca_common_cuda_output,
506 "CUDA: support functions initialized");
507 return OPAL_SUCCESS;
508 }
509
510
511
512
513
514
515
516
517 static int mca_common_cuda_stage_three_init(void)
518 {
519 int i, s, rc;
520 CUresult res;
521 CUcontext cuContext;
522 common_cuda_mem_regs_t *mem_reg;
523
524 OPAL_THREAD_LOCK(&common_cuda_init_lock);
525 opal_output_verbose(20, mca_common_cuda_output,
526 "CUDA: entering stage three init");
527
528
529 if (OPAL_UNLIKELY(!opal_cuda_support)) {
530 opal_output_verbose(20, mca_common_cuda_output,
531 "CUDA: No mpi cuda support, exiting stage three init");
532 stage_three_init_complete = true;
533 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
534 return OPAL_ERROR;
535 }
536
537
538 if (true == stage_three_init_complete) {
539 if (common_cuda_initialized) {
540 opal_output_verbose(20, mca_common_cuda_output,
541 "CUDA: Stage three already complete, exiting stage three init");
542 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
543 return OPAL_SUCCESS;
544 } else {
545 opal_output_verbose(20, mca_common_cuda_output,
546 "CUDA: Stage three already complete, failed during the init");
547 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
548 return OPAL_ERROR;
549 }
550 }
551
552
553
554 res = cuFunc.cuCtxGetCurrent(&cuContext);
555 if (CUDA_SUCCESS != res) {
556 if (mca_common_cuda_warning) {
557
558
559 if (CUDA_ERROR_NOT_INITIALIZED == res) {
560 opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent failed not initialized",
561 true);
562 } else {
563 opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent failed",
564 true, res);
565 }
566 }
567 mca_common_cuda_enabled = false;
568 mca_common_cuda_register_memory = false;
569 } else if ((CUDA_SUCCESS == res) && (NULL == cuContext)) {
570 if (mca_common_cuda_warning) {
571 opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent returned NULL",
572 true);
573 }
574 mca_common_cuda_enabled = false;
575 mca_common_cuda_register_memory = false;
576 } else {
577
578
579
580 mca_common_cuda_enabled = true;
581 opal_output_verbose(20, mca_common_cuda_output,
582 "CUDA: cuCtxGetCurrent succeeded");
583 }
584
585
586
587
588
589 if (false == mca_common_cuda_enabled) {
590 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
591 return OPAL_ERROR;
592 }
593
594 if (true == mca_common_cuda_enabled) {
595
596 cuda_event_ipc_num_used = 0;
597 cuda_event_ipc_first_avail = 0;
598 cuda_event_ipc_first_used = 0;
599
600 cuda_event_ipc_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
601 if (NULL == cuda_event_ipc_array) {
602 opal_show_help("help-mpi-common-cuda.txt", "No memory",
603 true, OPAL_PROC_MY_HOSTNAME);
604 rc = OPAL_ERROR;
605 goto cleanup_and_error;
606 }
607
608
609 for (i = 0; i < cuda_event_max; i++) {
610 res = cuFunc.cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING);
611 if (CUDA_SUCCESS != res) {
612 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
613 true, OPAL_PROC_MY_HOSTNAME, res);
614 rc = OPAL_ERROR;
615 goto cleanup_and_error;
616 }
617 }
618
619
620
621 cuda_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **)
622 malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
623 if (NULL == cuda_event_ipc_frag_array) {
624 opal_show_help("help-mpi-common-cuda.txt", "No memory",
625 true, OPAL_PROC_MY_HOSTNAME);
626 rc = OPAL_ERROR;
627 goto cleanup_and_error;
628 }
629 }
630
631 if (true == mca_common_cuda_enabled) {
632
633
634 cuda_event_dtoh_num_used = 0;
635 cuda_event_dtoh_first_avail = 0;
636 cuda_event_dtoh_first_used = 0;
637
638 cuda_event_dtoh_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
639 if (NULL == cuda_event_dtoh_array) {
640 opal_show_help("help-mpi-common-cuda.txt", "No memory",
641 true, OPAL_PROC_MY_HOSTNAME);
642 rc = OPAL_ERROR;
643 goto cleanup_and_error;
644 }
645
646
647 for (i = 0; i < cuda_event_max; i++) {
648 res = cuFunc.cuEventCreate(&cuda_event_dtoh_array[i], CU_EVENT_DISABLE_TIMING);
649 if (CUDA_SUCCESS != res) {
650 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
651 true, OPAL_PROC_MY_HOSTNAME, res);
652 rc = OPAL_ERROR;
653 goto cleanup_and_error;
654 }
655 }
656
657
658
659 cuda_event_dtoh_frag_array = (struct mca_btl_base_descriptor_t **)
660 malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
661 if (NULL == cuda_event_dtoh_frag_array) {
662 opal_show_help("help-mpi-common-cuda.txt", "No memory",
663 true, OPAL_PROC_MY_HOSTNAME);
664 rc = OPAL_ERROR;
665 goto cleanup_and_error;
666 }
667
668
669
670 cuda_event_htod_num_used = 0;
671 cuda_event_htod_first_avail = 0;
672 cuda_event_htod_first_used = 0;
673
674 cuda_event_htod_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
675 if (NULL == cuda_event_htod_array) {
676 opal_show_help("help-mpi-common-cuda.txt", "No memory",
677 true, OPAL_PROC_MY_HOSTNAME);
678 rc = OPAL_ERROR;
679 goto cleanup_and_error;
680 }
681
682
683 for (i = 0; i < cuda_event_max; i++) {
684 res = cuFunc.cuEventCreate(&cuda_event_htod_array[i], CU_EVENT_DISABLE_TIMING);
685 if (CUDA_SUCCESS != res) {
686 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
687 true, OPAL_PROC_MY_HOSTNAME, res);
688 rc = OPAL_ERROR;
689 goto cleanup_and_error;
690 }
691 }
692
693
694
695 cuda_event_htod_frag_array = (struct mca_btl_base_descriptor_t **)
696 malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
697 if (NULL == cuda_event_htod_frag_array) {
698 opal_show_help("help-mpi-common-cuda.txt", "No memory",
699 true, OPAL_PROC_MY_HOSTNAME);
700 rc = OPAL_ERROR;
701 goto cleanup_and_error;
702 }
703 }
704
705 s = opal_list_get_size(&common_cuda_memory_registrations);
706 for(i = 0; i < s; i++) {
707 mem_reg = (common_cuda_mem_regs_t *)
708 opal_list_remove_first(&common_cuda_memory_registrations);
709 if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
710 res = cuFunc.cuMemHostRegister(mem_reg->ptr, mem_reg->amount, 0);
711 if (res != CUDA_SUCCESS) {
712
713
714 opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister during init failed",
715 true, mem_reg->ptr, mem_reg->amount,
716 OPAL_PROC_MY_HOSTNAME, res, mem_reg->msg);
717 } else {
718 opal_output_verbose(20, mca_common_cuda_output,
719 "CUDA: cuMemHostRegister OK on rcache %s: "
720 "address=%p, bufsize=%d",
721 mem_reg->msg, mem_reg->ptr, (int)mem_reg->amount);
722 }
723 }
724 free(mem_reg->msg);
725 OBJ_RELEASE(mem_reg);
726 }
727
728
729 res = cuFunc.cuStreamCreate(&ipcStream, 0);
730 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
731 opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
732 true, OPAL_PROC_MY_HOSTNAME, res);
733 rc = OPAL_ERROR;
734 goto cleanup_and_error;
735 }
736
737
738 res = cuFunc.cuStreamCreate(&dtohStream, 0);
739 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
740 opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
741 true, OPAL_PROC_MY_HOSTNAME, res);
742 rc = OPAL_ERROR;
743 goto cleanup_and_error;
744 }
745
746
747 res = cuFunc.cuStreamCreate(&htodStream, 0);
748 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
749 opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
750 true, OPAL_PROC_MY_HOSTNAME, res);
751 rc = OPAL_ERROR;
752 goto cleanup_and_error;
753 }
754
755 if (mca_common_cuda_cumemcpy_async) {
756
757 res = cuFunc.cuStreamCreate(&memcpyStream, 0);
758 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
759 opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
760 true, OPAL_PROC_MY_HOSTNAME, res);
761 rc = OPAL_ERROR;
762 goto cleanup_and_error;
763 }
764 }
765
766 res = cuFunc.cuMemHostRegister(&checkmem, sizeof(int), 0);
767 if (res != CUDA_SUCCESS) {
768
769
770 opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister during init failed",
771 true, &checkmem, sizeof(int),
772 OPAL_PROC_MY_HOSTNAME, res, "checkmem");
773
774 } else {
775 opal_output_verbose(20, mca_common_cuda_output,
776 "CUDA: cuMemHostRegister OK on test region");
777 }
778
779 opal_output_verbose(20, mca_common_cuda_output,
780 "CUDA: the extra gpu memory check is %s", (mca_common_cuda_gpu_mem_check_workaround == 1) ? "on":"off");
781
782 opal_output_verbose(30, mca_common_cuda_output,
783 "CUDA: initialized");
784 opal_atomic_mb();
785 common_cuda_initialized = true;
786 stage_three_init_complete = true;
787 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
788 return OPAL_SUCCESS;
789
790
791 cleanup_and_error:
792 opal_atomic_mb();
793 stage_three_init_complete = true;
794 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
795 return rc;
796 }
797
798
799
800
801
802
803
804
805
806 void mca_common_cuda_fini(void)
807 {
808 int i;
809 CUresult res;
810
811 if (false == common_cuda_initialized) {
812 stage_one_init_ref_count--;
813 opal_output_verbose(20, mca_common_cuda_output,
814 "CUDA: mca_common_cuda_fini, never completed initialization so "
815 "skipping fini, ref_count is now %d", stage_one_init_ref_count);
816 return;
817 }
818
819 if (0 == stage_one_init_ref_count) {
820 opal_output_verbose(20, mca_common_cuda_output,
821 "CUDA: mca_common_cuda_fini, ref_count=%d, fini is already complete",
822 stage_one_init_ref_count);
823 return;
824 }
825
826 if (1 == stage_one_init_ref_count) {
827 opal_output_verbose(20, mca_common_cuda_output,
828 "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up started",
829 stage_one_init_ref_count);
830
831
832
833
834
835
836 res = cuFunc.cuMemHostUnregister(&checkmem);
837 if (CUDA_SUCCESS != res) {
838 ctx_ok = 0;
839 }
840 opal_output_verbose(20, mca_common_cuda_output,
841 "CUDA: mca_common_cuda_fini, cuMemHostUnregister returned %d, ctx_ok=%d",
842 res, ctx_ok);
843
844 if (NULL != cuda_event_ipc_array) {
845 if (ctx_ok) {
846 for (i = 0; i < cuda_event_max; i++) {
847 if (NULL != cuda_event_ipc_array[i]) {
848 cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
849 }
850 }
851 }
852 free(cuda_event_ipc_array);
853 }
854 if (NULL != cuda_event_htod_array) {
855 if (ctx_ok) {
856 for (i = 0; i < cuda_event_max; i++) {
857 if (NULL != cuda_event_htod_array[i]) {
858 cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
859 }
860 }
861 }
862 free(cuda_event_htod_array);
863 }
864
865 if (NULL != cuda_event_dtoh_array) {
866 if (ctx_ok) {
867 for (i = 0; i < cuda_event_max; i++) {
868 if (NULL != cuda_event_dtoh_array[i]) {
869 cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
870 }
871 }
872 }
873 free(cuda_event_dtoh_array);
874 }
875
876 if (NULL != cuda_event_ipc_frag_array) {
877 free(cuda_event_ipc_frag_array);
878 }
879 if (NULL != cuda_event_htod_frag_array) {
880 free(cuda_event_htod_frag_array);
881 }
882 if (NULL != cuda_event_dtoh_frag_array) {
883 free(cuda_event_dtoh_frag_array);
884 }
885 if ((NULL != ipcStream) && ctx_ok) {
886 cuFunc.cuStreamDestroy(ipcStream);
887 }
888 if ((NULL != dtohStream) && ctx_ok) {
889 cuFunc.cuStreamDestroy(dtohStream);
890 }
891 if ((NULL != htodStream) && ctx_ok) {
892 cuFunc.cuStreamDestroy(htodStream);
893 }
894 if ((NULL != memcpyStream) && ctx_ok) {
895 cuFunc.cuStreamDestroy(memcpyStream);
896 }
897 OBJ_DESTRUCT(&common_cuda_init_lock);
898 OBJ_DESTRUCT(&common_cuda_htod_lock);
899 OBJ_DESTRUCT(&common_cuda_dtoh_lock);
900 OBJ_DESTRUCT(&common_cuda_ipc_lock);
901 if (NULL != libcuda_handle) {
902 opal_dl_close(libcuda_handle);
903 }
904
905 opal_output_verbose(20, mca_common_cuda_output,
906 "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up all done",
907 stage_one_init_ref_count);
908
909 opal_output_close(mca_common_cuda_output);
910
911 } else {
912 opal_output_verbose(20, mca_common_cuda_output,
913 "CUDA: mca_common_cuda_fini, ref_count=%d, cuda still in use",
914 stage_one_init_ref_count);
915 }
916 stage_one_init_ref_count--;
917 }
918
919
920
921
922
923 void mca_common_cuda_register(void *ptr, size_t amount, char *msg) {
924 int res;
925
926
927 if (!opal_cuda_support)
928 return;
929
930 if (!common_cuda_initialized) {
931 OPAL_THREAD_LOCK(&common_cuda_init_lock);
932 if (!common_cuda_initialized) {
933 common_cuda_mem_regs_t *regptr;
934 regptr = OBJ_NEW(common_cuda_mem_regs_t);
935 regptr->ptr = ptr;
936 regptr->amount = amount;
937 regptr->msg = strdup(msg);
938 opal_list_append(&common_cuda_memory_registrations,
939 (opal_list_item_t*)regptr);
940 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
941 return;
942 }
943 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
944 }
945
946 if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
947 res = cuFunc.cuMemHostRegister(ptr, amount, 0);
948 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
949
950
951 opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister failed",
952 true, ptr, amount,
953 OPAL_PROC_MY_HOSTNAME, res, msg);
954 } else {
955 opal_output_verbose(20, mca_common_cuda_output,
956 "CUDA: cuMemHostRegister OK on rcache %s: "
957 "address=%p, bufsize=%d",
958 msg, ptr, (int)amount);
959 }
960 }
961 }
962
963
964
965
966
967 void mca_common_cuda_unregister(void *ptr, char *msg) {
968 int res, i, s;
969 common_cuda_mem_regs_t *mem_reg;
970
971
972
973
974 if (!common_cuda_initialized) {
975 s = opal_list_get_size(&common_cuda_memory_registrations);
976 for(i = 0; i < s; i++) {
977 mem_reg = (common_cuda_mem_regs_t *)
978 opal_list_remove_first(&common_cuda_memory_registrations);
979 free(mem_reg->msg);
980 OBJ_RELEASE(mem_reg);
981 }
982 return;
983 }
984
985 if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
986 res = cuFunc.cuMemHostUnregister(ptr);
987 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
988
989
990 opal_output_verbose(20, mca_common_cuda_output,
991 "CUDA: cuMemHostUnregister failed: ptr=%p, res=%d, rcache=%s",
992 ptr, res, msg);
993
994 } else {
995 opal_output_verbose(20, mca_common_cuda_output,
996 "CUDA: cuMemHostUnregister OK on rcache %s: "
997 "address=%p",
998 msg, ptr);
999 }
1000 }
1001 }
1002
1003
1004
1005
1006
1007
1008 int cuda_getmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg,
1009 mca_rcache_base_registration_t *hdrreg)
1010
1011 {
1012 CUmemorytype memType;
1013 CUresult result;
1014 CUipcMemHandle *memHandle;
1015 CUdeviceptr pbase;
1016 size_t psize;
1017
1018 mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t*)newreg;
1019 memHandle = (CUipcMemHandle *)cuda_reg->data.memHandle;
1020
1021
1022 result = cuFunc.cuPointerGetAttribute(&memType,
1023 CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)base);
1024 assert(CUDA_SUCCESS == result);
1025 assert(CU_MEMORYTYPE_DEVICE == memType);
1026
1027
1028 result = cuFunc.cuIpcGetMemHandle(memHandle, (CUdeviceptr)base);
1029 CUDA_DUMP_MEMHANDLE((100, memHandle, "GetMemHandle-After"));
1030
1031 if (CUDA_SUCCESS != result) {
1032 opal_show_help("help-mpi-common-cuda.txt", "cuIpcGetMemHandle failed",
1033 true, result, base);
1034 return OPAL_ERROR;
1035 } else {
1036 opal_output_verbose(20, mca_common_cuda_output,
1037 "CUDA: cuIpcGetMemHandle passed: base=%p size=%d",
1038 base, (int)size);
1039 }
1040
1041
1042
1043 result = cuFunc.cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr)base);
1044 if (CUDA_SUCCESS != result) {
1045 opal_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed",
1046 true, result, base);
1047 return OPAL_ERROR;
1048 } else {
1049 opal_output_verbose(10, mca_common_cuda_output,
1050 "CUDA: cuMemGetAddressRange passed: addr=%p, size=%d, pbase=%p, psize=%d ",
1051 base, (int)size, (void *)pbase, (int)psize);
1052 }
1053
1054
1055 cuda_reg->base.base = (void *)pbase;
1056 cuda_reg->base.bound = (unsigned char *)pbase + psize - 1;
1057 cuda_reg->data.memh_seg_addr.pval = (void *) pbase;
1058 cuda_reg->data.memh_seg_len = psize;
1059
1060 #if OPAL_CUDA_SYNC_MEMOPS
1061
1062
1063
1064
1065
1066 memType = 1;
1067 result = cuFunc.cuPointerSetAttribute(&memType, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
1068 (CUdeviceptr)base);
1069 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1070 opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed",
1071 true, OPAL_PROC_MY_HOSTNAME, result, base);
1072 return OPAL_ERROR;
1073 }
1074 #else
1075
1076
1077
1078
1079
1080
1081
1082 result = cuFunc.cuEventRecord((CUevent)cuda_reg->data.event, 0);
1083 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1084 opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1085 true, result, base);
1086 return OPAL_ERROR;
1087 }
1088 #endif
1089
1090 return OPAL_SUCCESS;
1091 }
1092
1093
1094
1095
1096
1097 int cuda_ungetmemhandle(void *reg_data, mca_rcache_base_registration_t *reg)
1098 {
1099 opal_output_verbose(10, mca_common_cuda_output,
1100 "CUDA: cuda_ungetmemhandle (no-op): base=%p", reg->base);
1101 CUDA_DUMP_MEMHANDLE((100, ((mca_rcache_common_cuda_reg_t *)reg)->data.memHandle, "cuda_ungetmemhandle"));
1102
1103 return OPAL_SUCCESS;
1104 }
1105
1106
1107
1108
1109
1110
1111
1112 int cuda_openmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg,
1113 mca_rcache_base_registration_t *hdrreg)
1114 {
1115 CUresult result;
1116 CUipcMemHandle *memHandle;
1117 mca_rcache_common_cuda_reg_t *cuda_newreg = (mca_rcache_common_cuda_reg_t*)newreg;
1118
1119
1120 memHandle = (CUipcMemHandle *)cuda_newreg->data.memHandle;
1121 CUDA_DUMP_MEMHANDLE((100, memHandle, "Before call to cuIpcOpenMemHandle"));
1122
1123
1124 result = cuFunc.cuIpcOpenMemHandle((CUdeviceptr *)&newreg->alloc_base, *memHandle,
1125 CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
1126
1127
1128
1129
1130 if (CUDA_ERROR_ALREADY_MAPPED == result) {
1131 opal_output_verbose(10, mca_common_cuda_output,
1132 "CUDA: cuIpcOpenMemHandle returned CUDA_ERROR_ALREADY_MAPPED for "
1133 "p=%p,size=%d: notify memory pool\n", base, (int)size);
1134 return OPAL_ERR_WOULD_BLOCK;
1135 }
1136 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1137 opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenMemHandle failed",
1138 true, OPAL_PROC_MY_HOSTNAME, result, base);
1139
1140 return OPAL_ERROR;
1141 } else {
1142 opal_output_verbose(10, mca_common_cuda_output,
1143 "CUDA: cuIpcOpenMemHandle passed: base=%p (remote base=%p,size=%d)",
1144 newreg->alloc_base, base, (int)size);
1145 CUDA_DUMP_MEMHANDLE((200, memHandle, "cuIpcOpenMemHandle"));
1146 }
1147
1148 return OPAL_SUCCESS;
1149 }
1150
1151
1152
1153
1154 int cuda_closememhandle(void *reg_data, mca_rcache_base_registration_t *reg)
1155 {
1156 CUresult result;
1157 mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t*)reg;
1158
1159
1160
1161 if (ctx_ok) {
1162 result = cuFunc.cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base);
1163 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1164 if (CUDA_ERROR_DEINITIALIZED != result) {
1165 opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed",
1166 true, result, cuda_reg->base.alloc_base);
1167 }
1168
1169 } else {
1170 opal_output_verbose(10, mca_common_cuda_output,
1171 "CUDA: cuIpcCloseMemHandle passed: base=%p",
1172 cuda_reg->base.alloc_base);
1173 CUDA_DUMP_MEMHANDLE((100, cuda_reg->data.memHandle, "cuIpcCloseMemHandle"));
1174 }
1175 }
1176
1177 return OPAL_SUCCESS;
1178 }
1179
1180 void mca_common_cuda_construct_event_and_handle(uintptr_t *event, void *handle)
1181 {
1182 CUresult result;
1183
1184 result = cuFunc.cuEventCreate((CUevent *)event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
1185 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1186 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
1187 true, OPAL_PROC_MY_HOSTNAME, result);
1188 }
1189
1190 result = cuFunc.cuIpcGetEventHandle((CUipcEventHandle *)handle, (CUevent)*event);
1191 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1192 opal_show_help("help-mpi-common-cuda.txt", "cuIpcGetEventHandle failed",
1193 true, result);
1194 }
1195
1196 CUDA_DUMP_EVTHANDLE((10, handle, "construct_event_and_handle"));
1197
1198 }
1199
1200 void mca_common_cuda_destruct_event(uintptr_t event)
1201 {
1202 CUresult result;
1203
1204
1205
1206 if (ctx_ok) {
1207 result = cuFunc.cuEventDestroy((CUevent)event);
1208 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1209 opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
1210 true, result);
1211 }
1212 }
1213 }
1214
1215
1216
1217
1218
1219
1220 void mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t *rget_reg)
1221 {
1222 #if OPAL_CUDA_SYNC_MEMOPS
1223
1224 return;
1225 #else
1226 CUipcEventHandle evtHandle;
1227 CUevent event;
1228 CUresult result;
1229
1230 memcpy(&evtHandle, rget_reg->data.evtHandle, sizeof(evtHandle));
1231 CUDA_DUMP_EVTHANDLE((100, &evtHandle, "stream_synchronize"));
1232
1233 result = cuFunc.cuIpcOpenEventHandle(&event, evtHandle);
1234 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1235 opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenEventHandle failed",
1236 true, result);
1237 }
1238
1239
1240
1241
1242
1243
1244 result = cuFunc.cuEventRecord(event, 0);
1245 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1246 opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1247 true, OPAL_PROC_MY_HOSTNAME, result);
1248 }
1249
1250
1251 result = cuFunc.cuStreamWaitEvent(0, event, 0);
1252 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1253 opal_show_help("help-mpi-common-cuda.txt", "cuStreamWaitEvent failed",
1254 true, result);
1255 }
1256
1257
1258 result = cuFunc.cuEventDestroy(event);
1259 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1260 opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
1261 true, result);
1262 }
1263 #endif
1264 }
1265
1266
1267
1268
1269
1270 int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
1271 struct mca_btl_base_descriptor_t *frag, int *done)
1272 {
1273 CUresult result;
1274 int iter;
1275
1276 OPAL_THREAD_LOCK(&common_cuda_ipc_lock);
1277
1278
1279
1280 if (cuda_event_ipc_num_used == cuda_event_max) {
1281 opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
1282 true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
1283 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1284 return OPAL_ERR_OUT_OF_RESOURCE;
1285 }
1286
1287 if (cuda_event_ipc_num_used > cuda_event_ipc_most) {
1288 cuda_event_ipc_most = cuda_event_ipc_num_used;
1289
1290 if (0 == (cuda_event_ipc_most % 10)) {
1291 opal_output_verbose(20, mca_common_cuda_output,
1292 "Maximum ipc events used is now %d", cuda_event_ipc_most);
1293 }
1294 }
1295
1296
1297
1298 if (OPAL_LIKELY(mca_common_cuda_async)) {
1299 result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
1300 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1301 opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1302 true, dst, src, amount, result);
1303 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1304 return OPAL_ERROR;
1305 } else {
1306 opal_output_verbose(20, mca_common_cuda_output,
1307 "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
1308 dst, src, (int)amount);
1309 }
1310 result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
1311 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1312 opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1313 true, OPAL_PROC_MY_HOSTNAME, result);
1314 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1315 return OPAL_ERROR;
1316 }
1317 cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
1318
1319
1320 cuda_event_ipc_first_avail++;
1321 if (cuda_event_ipc_first_avail >= cuda_event_max) {
1322 cuda_event_ipc_first_avail = 0;
1323 }
1324 cuda_event_ipc_num_used++;
1325
1326 *done = 0;
1327 } else {
1328
1329 result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
1330 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1331 opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1332 true, dst, src, amount, result);
1333 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1334 return OPAL_ERROR;
1335 } else {
1336 opal_output_verbose(20, mca_common_cuda_output,
1337 "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
1338 dst, src, (int)amount);
1339 }
1340
1341
1342 result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
1343 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1344 opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1345 true, OPAL_PROC_MY_HOSTNAME, result);
1346 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1347 return OPAL_ERROR;
1348 }
1349
1350 cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
1351
1352
1353 cuda_event_ipc_first_avail++;
1354 if (cuda_event_ipc_first_avail >= cuda_event_max) {
1355 cuda_event_ipc_first_avail = 0;
1356 }
1357 cuda_event_ipc_num_used++;
1358
1359 result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
1360 if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
1361 opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1362 true, result);
1363 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1364 return OPAL_ERROR;
1365 }
1366
1367 iter = 0;
1368 while (CUDA_ERROR_NOT_READY == result) {
1369 if (0 == (iter % 10)) {
1370 opal_output(-1, "EVENT NOT DONE (iter=%d)", iter);
1371 }
1372 result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
1373 if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
1374 opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1375 true, result);
1376 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1377 return OPAL_ERROR;
1378 }
1379 iter++;
1380 }
1381
1382 --cuda_event_ipc_num_used;
1383 ++cuda_event_ipc_first_used;
1384 if (cuda_event_ipc_first_used >= cuda_event_max) {
1385 cuda_event_ipc_first_used = 0;
1386 }
1387 *done = 1;
1388 }
1389 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1390 return OPAL_SUCCESS;
1391 }
1392
1393
1394
1395
1396
1397 int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_t *frag)
1398 {
1399 CUresult result;
1400
1401
1402
1403
1404 OPAL_THREAD_LOCK(&common_cuda_dtoh_lock);
1405 if (cuda_event_dtoh_num_used == cuda_event_max) {
1406 opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
1407 true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
1408 return OPAL_ERR_OUT_OF_RESOURCE;
1409 }
1410
1411 if (cuda_event_dtoh_num_used > cuda_event_dtoh_most) {
1412 cuda_event_dtoh_most = cuda_event_dtoh_num_used;
1413
1414 if (0 == (cuda_event_dtoh_most % 10)) {
1415 opal_output_verbose(20, mca_common_cuda_output,
1416 "Maximum DtoH events used is now %d", cuda_event_dtoh_most);
1417 }
1418 }
1419
1420 result = cuFunc.cuEventRecord(cuda_event_dtoh_array[cuda_event_dtoh_first_avail], dtohStream);
1421 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1422 opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1423 true, OPAL_PROC_MY_HOSTNAME, result);
1424 OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1425 return OPAL_ERROR;
1426 }
1427 cuda_event_dtoh_frag_array[cuda_event_dtoh_first_avail] = frag;
1428
1429
1430 cuda_event_dtoh_first_avail++;
1431 if (cuda_event_dtoh_first_avail >= cuda_event_max) {
1432 cuda_event_dtoh_first_avail = 0;
1433 }
1434 cuda_event_dtoh_num_used++;
1435
1436 OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1437 return OPAL_SUCCESS;
1438 }
1439
1440
1441
1442
1443
1444 int mca_common_cuda_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag)
1445 {
1446 CUresult result;
1447
1448 OPAL_THREAD_LOCK(&common_cuda_htod_lock);
1449
1450
1451
1452 if (cuda_event_htod_num_used == cuda_event_max) {
1453 opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
1454 true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
1455 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1456 return OPAL_ERR_OUT_OF_RESOURCE;
1457 }
1458
1459 if (cuda_event_htod_num_used > cuda_event_htod_most) {
1460 cuda_event_htod_most = cuda_event_htod_num_used;
1461
1462 if (0 == (cuda_event_htod_most % 10)) {
1463 opal_output_verbose(20, mca_common_cuda_output,
1464 "Maximum HtoD events used is now %d", cuda_event_htod_most);
1465 }
1466 }
1467
1468 result = cuFunc.cuEventRecord(cuda_event_htod_array[cuda_event_htod_first_avail], htodStream);
1469 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1470 opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1471 true, OPAL_PROC_MY_HOSTNAME, result);
1472 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1473 return OPAL_ERROR;
1474 }
1475 cuda_event_htod_frag_array[cuda_event_htod_first_avail] = frag;
1476
1477
1478 cuda_event_htod_first_avail++;
1479 if (cuda_event_htod_first_avail >= cuda_event_max) {
1480 cuda_event_htod_first_avail = 0;
1481 }
1482 cuda_event_htod_num_used++;
1483
1484 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1485 return OPAL_SUCCESS;
1486 }
1487
1488
1489
1490
1491 void *mca_common_cuda_get_dtoh_stream(void) {
1492 return (void *)dtohStream;
1493 }
1494
1495
1496
1497
1498 void *mca_common_cuda_get_htod_stream(void) {
1499 return (void *)htodStream;
1500 }
1501
1502
1503
1504
1505
1506
1507 int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t **frag) {
1508 CUresult result;
1509
1510 OPAL_THREAD_LOCK(&common_cuda_ipc_lock);
1511 if (cuda_event_ipc_num_used > 0) {
1512 opal_output_verbose(20, mca_common_cuda_output,
1513 "CUDA: progress_one_cuda_ipc_event, outstanding_events=%d",
1514 cuda_event_ipc_num_used);
1515
1516 result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
1517
1518
1519 if (CUDA_ERROR_NOT_READY == result) {
1520 opal_output_verbose(20, mca_common_cuda_output,
1521 "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
1522 *frag = NULL;
1523 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1524 return 0;
1525 } else if (CUDA_SUCCESS != result) {
1526 opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1527 true, result);
1528 *frag = NULL;
1529 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1530 return OPAL_ERROR;
1531 }
1532
1533 *frag = cuda_event_ipc_frag_array[cuda_event_ipc_first_used];
1534 opal_output_verbose(10, mca_common_cuda_output,
1535 "CUDA: cuEventQuery returned %d", result);
1536
1537
1538 --cuda_event_ipc_num_used;
1539 ++cuda_event_ipc_first_used;
1540 if (cuda_event_ipc_first_used >= cuda_event_max) {
1541 cuda_event_ipc_first_used = 0;
1542 }
1543
1544 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1545 return 1;
1546 }
1547 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1548 return 0;
1549 }
1550
1551
1552
1553
1554 int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t **frag) {
1555 CUresult result;
1556
1557 OPAL_THREAD_LOCK(&common_cuda_dtoh_lock);
1558 if (cuda_event_dtoh_num_used > 0) {
1559 opal_output_verbose(30, mca_common_cuda_output,
1560 "CUDA: progress_one_cuda_dtoh_event, outstanding_events=%d",
1561 cuda_event_dtoh_num_used);
1562
1563 result = cuFunc.cuEventQuery(cuda_event_dtoh_array[cuda_event_dtoh_first_used]);
1564
1565
1566 if (CUDA_ERROR_NOT_READY == result) {
1567 opal_output_verbose(30, mca_common_cuda_output,
1568 "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
1569 *frag = NULL;
1570 OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1571 return 0;
1572 } else if (CUDA_SUCCESS != result) {
1573 opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1574 true, result);
1575 *frag = NULL;
1576 OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1577 return OPAL_ERROR;
1578 }
1579
1580 *frag = cuda_event_dtoh_frag_array[cuda_event_dtoh_first_used];
1581 opal_output_verbose(30, mca_common_cuda_output,
1582 "CUDA: cuEventQuery returned %d", result);
1583
1584
1585 --cuda_event_dtoh_num_used;
1586 ++cuda_event_dtoh_first_used;
1587 if (cuda_event_dtoh_first_used >= cuda_event_max) {
1588 cuda_event_dtoh_first_used = 0;
1589 }
1590
1591 OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1592 return 1;
1593 }
1594 OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1595 return 0;
1596 }
1597
1598
1599
1600
1601 int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) {
1602 CUresult result;
1603
1604 OPAL_THREAD_LOCK(&common_cuda_htod_lock);
1605 if (cuda_event_htod_num_used > 0) {
1606 opal_output_verbose(30, mca_common_cuda_output,
1607 "CUDA: progress_one_cuda_htod_event, outstanding_events=%d",
1608 cuda_event_htod_num_used);
1609
1610 result = cuFunc.cuEventQuery(cuda_event_htod_array[cuda_event_htod_first_used]);
1611
1612
1613 if (CUDA_ERROR_NOT_READY == result) {
1614 opal_output_verbose(30, mca_common_cuda_output,
1615 "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
1616 *frag = NULL;
1617 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1618 return 0;
1619 } else if (CUDA_SUCCESS != result) {
1620 opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1621 true, result);
1622 *frag = NULL;
1623 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1624 return OPAL_ERROR;
1625 }
1626
1627 *frag = cuda_event_htod_frag_array[cuda_event_htod_first_used];
1628 opal_output_verbose(30, mca_common_cuda_output,
1629 "CUDA: cuEventQuery returned %d", result);
1630
1631
1632 --cuda_event_htod_num_used;
1633 ++cuda_event_htod_first_used;
1634 if (cuda_event_htod_first_used >= cuda_event_max) {
1635 cuda_event_htod_first_used = 0;
1636 }
1637
1638 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1639 return 1;
1640 }
1641 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1642 return OPAL_ERR_RESOURCE_BUSY;
1643 }
1644
1645
1646
1647
1648
1649
1650 int mca_common_cuda_memhandle_matches(mca_rcache_common_cuda_reg_t *new_reg,
1651 mca_rcache_common_cuda_reg_t *old_reg)
1652 {
1653
1654 if (0 == memcmp(new_reg->data.memHandle, old_reg->data.memHandle, sizeof(new_reg->data.memHandle))) {
1655 return 1;
1656 } else {
1657 return 0;
1658 }
1659
1660 }
1661
1662
1663
1664
1665
1666 static void cuda_dump_memhandle(int verbose, void *memHandle, char *str) {
1667
1668 struct InterprocessMemHandleInternal
1669 {
1670
1671 int64_t ctxId;
1672 int pid;
1673
1674 int64_t size;
1675 int64_t blocksize;
1676 int64_t offset;
1677 int gpuId;
1678 int subDeviceIndex;
1679 int64_t serial;
1680 } memH;
1681
1682 if (NULL == str) {
1683 str = "CUDA";
1684 }
1685 memcpy(&memH, memHandle, sizeof(memH));
1686 opal_output_verbose(verbose, mca_common_cuda_output,
1687 "%s:ctxId=0x%" PRIx64 ", pid=%d, size=%" PRIu64 ", blocksize=%" PRIu64 ", offset=%"
1688 PRIu64 ", gpuId=%d, subDeviceIndex=%d, serial=%" PRIu64,
1689 str, memH.ctxId, memH.pid, memH.size, memH.blocksize, memH.offset,
1690 memH.gpuId, memH.subDeviceIndex, memH.serial);
1691 }
1692
1693
1694
1695
1696
1697 static void cuda_dump_evthandle(int verbose, void *evtHandle, char *str) {
1698
1699 struct InterprocessEventHandleInternal
1700 {
1701 unsigned long pid;
1702 unsigned long serial;
1703 int index;
1704 } evtH;
1705
1706 if (NULL == str) {
1707 str = "CUDA";
1708 }
1709 memcpy(&evtH, evtHandle, sizeof(evtH));
1710 opal_output_verbose(verbose, mca_common_cuda_output,
1711 "CUDA: %s:pid=%lu, serial=%lu, index=%d",
1712 str, evtH.pid, evtH.serial, evtH.index);
1713 }
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729 #if OPAL_ENABLE_DEBUG
1730 static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end) {
1731 return (ts_end - ts_start);
1732 }
1733 #endif
1734
1735
1736 static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t *convertor)
1737 {
1738 int res;
1739 CUmemorytype memType = 0;
1740 CUdeviceptr dbuf = (CUdeviceptr)pUserBuf;
1741 CUcontext ctx = NULL, memCtx = NULL;
1742 #if OPAL_CUDA_GET_ATTRIBUTES
1743 uint32_t isManaged = 0;
1744
1745 CUpointer_attribute attributes[3] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
1746 CU_POINTER_ATTRIBUTE_CONTEXT,
1747 CU_POINTER_ATTRIBUTE_IS_MANAGED};
1748 void *attrdata[] = {(void *)&memType, (void *)&memCtx, (void *)&isManaged};
1749
1750 res = cuFunc.cuPointerGetAttributes(3, attributes, attrdata, dbuf);
1751 OPAL_OUTPUT_VERBOSE((101, mca_common_cuda_output,
1752 "dbuf=%p, memType=%d, memCtx=%p, isManaged=%d, res=%d",
1753 (void *)dbuf, (int)memType, (void *)memCtx, isManaged, res));
1754
1755
1756
1757
1758 if (1 == isManaged) {
1759 if (NULL != convertor) {
1760 convertor->flags |= CONVERTOR_CUDA_UNIFIED;
1761 }
1762 }
1763 if (res != CUDA_SUCCESS) {
1764
1765
1766 return 0;
1767 } else if (memType == CU_MEMORYTYPE_HOST) {
1768
1769 return 0;
1770 } else if (memType == 0) {
1771
1772 return 0;
1773 }
1774
1775 assert(memType == CU_MEMORYTYPE_DEVICE);
1776 #else
1777 res = cuFunc.cuPointerGetAttribute(&memType,
1778 CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf);
1779 if (res != CUDA_SUCCESS) {
1780
1781
1782 return 0;
1783 } else if (memType == CU_MEMORYTYPE_HOST) {
1784
1785 return 0;
1786 }
1787
1788 assert(memType == CU_MEMORYTYPE_DEVICE);
1789 #endif
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799 res = cuFunc.cuCtxGetCurrent(&ctx);
1800 if (OPAL_UNLIKELY(NULL == ctx)) {
1801 if (CUDA_SUCCESS == res) {
1802 #if !OPAL_CUDA_GET_ATTRIBUTES
1803 res = cuFunc.cuPointerGetAttribute(&memCtx,
1804 CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
1805 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
1806 opal_output(0, "CUDA: error calling cuPointerGetAttribute: "
1807 "res=%d, ptr=%p aborting...", res, pUserBuf);
1808 return OPAL_ERROR;
1809 }
1810 #endif
1811 res = cuFunc.cuCtxSetCurrent(memCtx);
1812 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
1813 opal_output(0, "CUDA: error calling cuCtxSetCurrent: "
1814 "res=%d, ptr=%p aborting...", res, pUserBuf);
1815 return OPAL_ERROR;
1816 } else {
1817 OPAL_OUTPUT_VERBOSE((10, mca_common_cuda_output,
1818 "CUDA: cuCtxSetCurrent passed: ptr=%p", pUserBuf));
1819 }
1820 } else {
1821
1822 opal_output(0, "CUDA: error calling cuCtxGetCurrent: "
1823 "res=%d, ptr=%p aborting...", res, pUserBuf);
1824 return OPAL_ERROR;
1825 }
1826 }
1827
1828
1829
1830
1831
1832
1833
1834 if (OPAL_LIKELY(mca_common_cuda_gpu_mem_check_workaround)) {
1835 CUdeviceptr pbase;
1836 size_t psize;
1837 res = cuFunc.cuMemGetAddressRange(&pbase, &psize, dbuf);
1838 if (CUDA_SUCCESS != res) {
1839 opal_output_verbose(5, mca_common_cuda_output,
1840 "CUDA: cuMemGetAddressRange failed on this pointer: res=%d, buf=%p "
1841 "Overriding check and setting to host pointer. ",
1842 res, (void *)dbuf);
1843
1844 return 0;
1845 }
1846 }
1847
1848
1849
1850 if (!stage_three_init_complete) {
1851 if (0 != mca_common_cuda_stage_three_init()) {
1852 opal_cuda_support = 0;
1853 }
1854 }
1855
1856 return 1;
1857 }
1858
1859 static int mca_common_cuda_cu_memcpy_async(void *dest, const void *src, size_t size,
1860 opal_convertor_t* convertor)
1861 {
1862 return cuFunc.cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size,
1863 (CUstream)convertor->stream);
1864 }
1865
1866
1867
1868
1869
1870 static int mca_common_cuda_cu_memcpy(void *dest, const void *src, size_t size)
1871 {
1872 CUresult result;
1873 #if OPAL_ENABLE_DEBUG
1874 CUmemorytype memTypeSrc, memTypeDst;
1875 if (OPAL_UNLIKELY(mca_common_cuda_cumemcpy_timing)) {
1876
1877
1878 result = cuFunc.cuPointerGetAttribute(&memTypeDst,
1879 CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)dest);
1880 result = cuFunc.cuPointerGetAttribute(&memTypeSrc,
1881 CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)src);
1882 ts_start = opal_timer_base_get_usec();
1883 }
1884 #endif
1885 if (mca_common_cuda_cumemcpy_async) {
1886 result = cuFunc.cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size, memcpyStream);
1887 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1888 opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1889 true, dest, src, size, result);
1890 return OPAL_ERROR;
1891 }
1892 result = cuFunc.cuStreamSynchronize(memcpyStream);
1893 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1894 opal_show_help("help-mpi-common-cuda.txt", "cuStreamSynchronize failed",
1895 true, OPAL_PROC_MY_HOSTNAME, result);
1896 return OPAL_ERROR;
1897 }
1898 } else {
1899 result = cuFunc.cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
1900 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1901 opal_show_help("help-mpi-common-cuda.txt", "cuMemcpy failed",
1902 true, OPAL_PROC_MY_HOSTNAME, result);
1903 return OPAL_ERROR;
1904 }
1905 }
1906 #if OPAL_ENABLE_DEBUG
1907 if (OPAL_UNLIKELY(mca_common_cuda_cumemcpy_timing)) {
1908 ts_end = opal_timer_base_get_usec();
1909 accum = mydifftime(ts_start, ts_end);
1910 if (mca_common_cuda_cumemcpy_async) {
1911 opal_output(0, "cuMemcpyAsync took %7.2f usecs, size=%d, (src=%p (%d), dst=%p (%d))\n",
1912 accum, (int)size, src, memTypeSrc, dest, memTypeDst);
1913 } else {
1914 opal_output(0, "cuMemcpy took %7.2f usecs, size=%d, (src=%p (%d), dst=%p (%d))\n",
1915 accum, (int)size, src, memTypeSrc, dest, memTypeDst);
1916 }
1917 }
1918 #endif
1919 return OPAL_SUCCESS;
1920 }
1921
1922 static int mca_common_cuda_memmove(void *dest, void *src, size_t size)
1923 {
1924 CUdeviceptr tmp;
1925 int result;
1926
1927 result = cuFunc.cuMemAlloc(&tmp,size);
1928 if (mca_common_cuda_cumemcpy_async) {
1929 result = cuFunc.cuMemcpyAsync(tmp, (CUdeviceptr)src, size, memcpyStream);
1930 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1931 opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1932 true, tmp, src, size, result);
1933 return OPAL_ERROR;
1934 }
1935 result = cuFunc.cuMemcpyAsync((CUdeviceptr)dest, tmp, size, memcpyStream);
1936 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1937 opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1938 true, dest, tmp, size, result);
1939 return OPAL_ERROR;
1940 }
1941 result = cuFunc.cuStreamSynchronize(memcpyStream);
1942 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1943 opal_show_help("help-mpi-common-cuda.txt", "cuStreamSynchronize failed",
1944 true, OPAL_PROC_MY_HOSTNAME, result);
1945 return OPAL_ERROR;
1946 }
1947 } else {
1948 result = cuFunc.cuMemcpy(tmp, (CUdeviceptr)src, size);
1949 if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
1950 opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
1951 result, (void *)tmp, src, (int)size);
1952 return OPAL_ERROR;
1953 }
1954 result = cuFunc.cuMemcpy((CUdeviceptr)dest, tmp, size);
1955 if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
1956 opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
1957 result, dest, (void *)tmp, (int)size);
1958 return OPAL_ERROR;
1959 }
1960 }
1961 cuFunc.cuMemFree(tmp);
1962 return OPAL_SUCCESS;
1963 }
1964
1965 int mca_common_cuda_get_device(int *devicenum)
1966 {
1967 CUdevice cuDev;
1968 int res;
1969
1970 res = cuFunc.cuCtxGetDevice(&cuDev);
1971 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
1972 opal_output(0, "CUDA: cuCtxGetDevice failed: res=%d",
1973 res);
1974 return res;
1975 }
1976 *devicenum = cuDev;
1977 return 0;
1978 }
1979
1980 int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2)
1981 {
1982 int res;
1983 res = cuFunc.cuDeviceCanAccessPeer(access, (CUdevice)dev1, (CUdevice)dev2);
1984 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
1985 opal_output(0, "CUDA: cuDeviceCanAccessPeer failed: res=%d",
1986 res);
1987 return res;
1988 }
1989 return 0;
1990 }
1991
1992 int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base)
1993 {
1994 CUresult result;
1995 result = cuFunc.cuMemGetAddressRange((CUdeviceptr *)pbase, psize, (CUdeviceptr)base);
1996 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1997 opal_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed 2",
1998 true, OPAL_PROC_MY_HOSTNAME, result, base);
1999 return OPAL_ERROR;
2000 } else {
2001 opal_output_verbose(50, mca_common_cuda_output,
2002 "CUDA: cuMemGetAddressRange passed: addr=%p, pbase=%p, psize=%lu ",
2003 base, *(char **)pbase, *psize);
2004 }
2005 return 0;
2006 }
2007
2008 #if OPAL_CUDA_GDR_SUPPORT
2009
2010
2011
2012
2013
2014
2015 bool mca_common_cuda_previously_freed_memory(mca_rcache_base_registration_t *reg)
2016 {
2017 int res;
2018 unsigned long long bufID;
2019 unsigned char *dbuf = reg->base;
2020
2021 res = cuFunc.cuPointerGetAttribute(&bufID, CU_POINTER_ATTRIBUTE_BUFFER_ID,
2022 (CUdeviceptr)dbuf);
2023
2024
2025 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
2026 opal_show_help("help-mpi-common-cuda.txt", "bufferID failed",
2027 true, OPAL_PROC_MY_HOSTNAME, res);
2028 return true;
2029 }
2030 opal_output_verbose(50, mca_common_cuda_output,
2031 "CUDA: base=%p, bufID=%llu, reg->gpu_bufID=%llu, %s", dbuf, bufID, reg->gpu_bufID,
2032 (reg->gpu_bufID == bufID ? "BUFFER_ID match":"BUFFER_ID do not match"));
2033 if (bufID != reg->gpu_bufID) {
2034 return true;
2035 } else {
2036 return false;
2037 }
2038 }
2039
2040
2041
2042
2043
2044
2045
2046
2047 void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg)
2048 {
2049 int res;
2050 unsigned long long bufID = 0;
2051 unsigned char *dbuf = reg->base;
2052 int enable = 1;
2053
2054 res = cuFunc.cuPointerGetAttribute(&bufID, CU_POINTER_ATTRIBUTE_BUFFER_ID,
2055 (CUdeviceptr)dbuf);
2056 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
2057 opal_show_help("help-mpi-common-cuda.txt", "bufferID failed",
2058 true, OPAL_PROC_MY_HOSTNAME, res);
2059 }
2060 reg->gpu_bufID = bufID;
2061
2062 res = cuFunc.cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
2063 (CUdeviceptr)dbuf);
2064 if (OPAL_UNLIKELY(CUDA_SUCCESS != res)) {
2065 opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed",
2066 true, OPAL_PROC_MY_HOSTNAME, res, dbuf);
2067 }
2068 }
2069 #endif