This source file includes the following definitions:
- mca_btl_smcuda_param_register_int
- mca_btl_smcuda_param_register_uint
- mca_btl_smcuda_component_verify
- smcuda_register
- mca_btl_smcuda_component_open
- mca_btl_smcuda_component_close
- get_num_local_procs
- calc_sm_max_procs
- create_and_attach
- get_mpool_res_size
- set_uniq_paths_for_init_rndv
- create_rndv_file
- backing_store_init
- mca_btl_smcuda_send_cuda_ipc_ack
- btl_smcuda_control
- mca_btl_smcuda_component_init
- mca_btl_smcuda_component_event_thread
- btl_smcuda_process_pending_sends
- mca_btl_smcuda_component_progress
26 #include "opal_config.h"
27 #include <errno.h>
28 #ifdef HAVE_UNISTD_H
29 #include <unistd.h>
30 #endif
31 #include <string.h>
32 #ifdef HAVE_FCNTL_H
33 #include <fcntl.h>
34 #endif
35 #ifdef HAVE_SYS_TYPES_H
36 #include <sys/types.h>
37 #endif
38 #ifdef HAVE_SYS_MMAN_H
39 #include <sys/mman.h>
40 #endif
41 #ifdef HAVE_SYS_STAT_H
42 #include <sys/stat.h>
43 #endif
44
45 #include "opal/mca/shmem/base/base.h"
46 #include "opal/mca/shmem/shmem.h"
47 #include "opal/util/bit_ops.h"
48 #include "opal/util/output.h"
49 #include "opal/util/show_help.h"
50 #include "opal/util/printf.h"
51
52 #include "opal/mca/mpool/base/base.h"
53 #include "opal/mca/common/sm/common_sm.h"
54 #include "opal/mca/btl/base/btl_base_error.h"
55 #include "opal/runtime/opal_params.h"
56
57 #if OPAL_CUDA_SUPPORT
58 #include "opal/mca/common/cuda/common_cuda.h"
59 #endif
60 #if OPAL_ENABLE_FT_CR == 1
61 #include "opal/runtime/opal_cr.h"
62 #endif
63
64 #include "btl_smcuda.h"
65 #include "btl_smcuda_frag.h"
66 #include "btl_smcuda_fifo.h"
67
68 static int mca_btl_smcuda_component_open(void);
69 static int mca_btl_smcuda_component_close(void);
70 static int smcuda_register(void);
71 static mca_btl_base_module_t** mca_btl_smcuda_component_init(
72 int *num_btls,
73 bool enable_progress_threads,
74 bool enable_mpi_threads
75 );
76
77 typedef enum {
78 MCA_BTL_SM_RNDV_MOD_SM = 0,
79 MCA_BTL_SM_RNDV_MOD_MPOOL
80 } mca_btl_sm_rndv_module_type_t;
81
82
83
84
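/* The smcuda component instance published to the MCA framework: open/close/
 * register hooks plus the init and progress entry points used by the BTL base. */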
85 mca_btl_smcuda_component_t mca_btl_smcuda_component = {
86 .super = {
87
88
89 .btl_version = {
90 MCA_BTL_DEFAULT_VERSION("smcuda"),
91 .mca_open_component = mca_btl_smcuda_component_open,
92 .mca_close_component = mca_btl_smcuda_component_close,
93 .mca_register_component_params = smcuda_register,
94 },
95 .btl_data = {
96
97 .param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
98 },
99
100 .btl_init = mca_btl_smcuda_component_init,
101 .btl_progress = mca_btl_smcuda_component_progress,
102 }
103 };
104
105
106
107
108
109
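/* Helpers that register a read-only integer / unsigned integer MCA variable
 * for this component, store the resulting value in *storage and return it. */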
110 static inline int mca_btl_smcuda_param_register_int(
111 const char* param_name,
112 int default_value,
113 int level,
114 int *storage)
115 {
116 *storage = default_value;
117 (void) mca_base_component_var_register (&mca_btl_smcuda_component.super.btl_version,
118 param_name, NULL, MCA_BASE_VAR_TYPE_INT,
119 NULL, 0, 0, level,
120 MCA_BASE_VAR_SCOPE_READONLY, storage);
121 return *storage;
122 }
123
124 static inline unsigned int mca_btl_smcuda_param_register_uint(
125 const char* param_name,
126 unsigned int default_value,
127 int level,
128 unsigned int *storage)
129 {
130 *storage = default_value;
131 (void) mca_base_component_var_register (&mca_btl_smcuda_component.super.btl_version,
132 param_name, NULL, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
133 NULL, 0, 0, level,
134 MCA_BASE_VAR_SCOPE_READONLY, storage);
135 return *storage;
136 }
137
138 static int mca_btl_smcuda_component_verify(void) {
139
140 return mca_btl_base_param_verify(&mca_btl_smcuda.super);
141 }
142
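/* Register all smcuda MCA parameters and set the default BTL limits (eager
 * limit, send sizes, RDMA pipeline sizes, exclusivity, CUDA IPC settings),
 * then verify the resulting configuration. */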
143 static int smcuda_register(void)
144 {
145
146 mca_btl_smcuda_component.mpool_min_size = 134217728;
147 (void) mca_base_component_var_register(&mca_btl_smcuda_component.super.btl_version, "min_size",
148 "Minimum size of the common/sm mpool shared memory file",
149 MCA_BASE_VAR_TYPE_UNSIGNED_LONG, NULL, 0, 0,
150 OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
151 &mca_btl_smcuda_component.mpool_min_size);
152
153 mca_btl_smcuda_param_register_int("free_list_num", 8, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_num);
154 mca_btl_smcuda_param_register_int("free_list_max", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_max);
155 mca_btl_smcuda_param_register_int("free_list_inc", 64, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_inc);
156 mca_btl_smcuda_param_register_int("max_procs", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_max_procs);
157
158
159 mca_btl_smcuda_component.sm_mpool_name = "sm";
160 mca_btl_smcuda_param_register_uint("fifo_size", 4096, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.fifo_size);
161 mca_btl_smcuda_param_register_int("num_fifos", 1, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.nfifos);
162
163 mca_btl_smcuda_param_register_uint("fifo_lazy_free", 120, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.fifo_lazy_free);
164
165
166 mca_btl_smcuda_param_register_int("sm_extra_procs", 0, OPAL_INFO_LVL_9, &mca_btl_smcuda_component.sm_extra_procs);
167
168 mca_btl_smcuda_component.allocator = "bucket";
169 (void) mca_base_component_var_register (&mca_btl_smcuda_component.super.btl_version, "allocator",
170 "Name of allocator component to use for btl/smcuda allocations",
171 MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9,
172 MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_smcuda_component.allocator);
173
174 #if OPAL_CUDA_SUPPORT
175
176 if (opal_cuda_support) {
177 mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH+1;
178 } else {
179 mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW;
180 }
181 mca_btl_smcuda_param_register_int("use_cuda_ipc", 1, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.use_cuda_ipc);
182 mca_btl_smcuda_param_register_int("use_cuda_ipc_same_gpu", 1, OPAL_INFO_LVL_4,&mca_btl_smcuda_component.use_cuda_ipc_same_gpu);
183 mca_btl_smcuda_param_register_int("cuda_ipc_verbose", 0, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ipc_verbose);
184 mca_btl_smcuda_component.cuda_ipc_output = opal_output_open(NULL);
185 opal_output_set_verbosity(mca_btl_smcuda_component.cuda_ipc_output, mca_btl_smcuda_component.cuda_ipc_verbose);
186 #else
187 mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW;
188 #endif
189 mca_btl_smcuda.super.btl_eager_limit = 4*1024;
190 mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024;
191 mca_btl_smcuda.super.btl_max_send_size = 32*1024;
192 mca_btl_smcuda.super.btl_rdma_pipeline_send_length = 64*1024;
193 mca_btl_smcuda.super.btl_rdma_pipeline_frag_size = 64*1024;
194 mca_btl_smcuda.super.btl_min_rdma_pipeline_size = 64*1024;
195 mca_btl_smcuda.super.btl_flags = MCA_BTL_FLAGS_SEND;
196 mca_btl_smcuda.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
197 mca_btl_smcuda.super.btl_bandwidth = 9000;
198 mca_btl_smcuda.super.btl_latency = 1;
199
200
201 mca_btl_base_param_register(&mca_btl_smcuda_component.super.btl_version,
202 &mca_btl_smcuda.super);
203 #if OPAL_CUDA_SUPPORT
204
205 if (0 == mca_btl_smcuda.super.btl_cuda_max_send_size) {
206 mca_btl_smcuda.super.btl_cuda_max_send_size = 128*1024;
207 }
208
209
210 if (0 == mca_btl_smcuda.super.btl_cuda_eager_limit) {
211 mca_btl_smcuda.super.btl_cuda_eager_limit = SIZE_MAX;
212 }
213 mca_common_cuda_register_mca_variables();
214 #endif
215 return mca_btl_smcuda_component_verify();
216 }
217
218
219
220
221
222
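/* Component open: check the parameters, round the number of FIFOs up to a
 * power of two, clamp fifo_lazy_free, and construct the component lock and
 * fragment free lists. */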
223 static int mca_btl_smcuda_component_open(void)
224 {
225 if (OPAL_SUCCESS != mca_btl_smcuda_component_verify()) {
226 return OPAL_ERROR;
227 }
228
229 mca_btl_smcuda_component.sm_max_btls = 1;
230
231
232 mca_btl_smcuda_component.nfifos = opal_next_poweroftwo_inclusive (mca_btl_smcuda_component.nfifos);
233
234
235 if (mca_btl_smcuda_component.fifo_lazy_free >= (mca_btl_smcuda_component.fifo_size >> 1) )
236 mca_btl_smcuda_component.fifo_lazy_free = (mca_btl_smcuda_component.fifo_size >> 1);
237 if (mca_btl_smcuda_component.fifo_lazy_free <= 0)
238 mca_btl_smcuda_component.fifo_lazy_free = 1;
239
240 mca_btl_smcuda_component.max_frag_size = mca_btl_smcuda.super.btl_max_send_size;
241 mca_btl_smcuda_component.eager_limit = mca_btl_smcuda.super.btl_eager_limit;
242
243 #if OPAL_CUDA_SUPPORT
244
245 if (mca_btl_smcuda.super.btl_cuda_max_send_size > mca_btl_smcuda.super.btl_max_send_size) {
246 mca_btl_smcuda_component.max_frag_size = mca_btl_smcuda.super.btl_cuda_max_send_size;
247 }
248 opal_output_verbose(10, opal_btl_base_framework.framework_output,
249 "btl: smcuda: cuda_max_send_size=%d, max_send_size=%d, max_frag_size=%d",
250 (int)mca_btl_smcuda.super.btl_cuda_max_send_size, (int)mca_btl_smcuda.super.btl_max_send_size,
251 (int)mca_btl_smcuda_component.max_frag_size);
252 #endif
253
254
255 OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_lock, opal_mutex_t);
256 OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_eager, opal_free_list_t);
257 OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_max, opal_free_list_t);
258 OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_user, opal_free_list_t);
259 OBJ_CONSTRUCT(&mca_btl_smcuda_component.pending_send_fl, opal_free_list_t);
260 return OPAL_SUCCESS;
261 }
262
263
264
265
266
267
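/* Component close: finalize the shared memory segment (unlinking its backing
 * file unless restarting from a checkpoint), shut down the optional FIFO
 * progress thread, and finalize the common CUDA support. */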
268 static int mca_btl_smcuda_component_close(void)
269 {
270 int return_value = OPAL_SUCCESS;
271
272
273 OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_lock);
274
275
276
277
278
279
280
281
282
283 if(mca_btl_smcuda_component.sm_seg != NULL) {
284 return_value = mca_common_sm_fini( mca_btl_smcuda_component.sm_seg );
285 if( OPAL_SUCCESS != return_value ) {
286 return_value = OPAL_ERROR;
287 opal_output(0," mca_common_sm_fini failed\n");
288 goto CLEANUP;
289 }
290
291
292
293
294
295 #if OPAL_ENABLE_FT_CR == 1
296
297
298
299 if(OPAL_CR_STATUS_RESTART_PRE != opal_cr_checkpointing_state &&
300 OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) {
301 unlink(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
302 }
303 #else
304 unlink(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
305 #endif
306 OBJ_RELEASE(mca_btl_smcuda_component.sm_seg);
307 }
308
309 #if OPAL_ENABLE_PROGRESS_THREADS == 1
310
311 if(mca_btl_smcuda_component.sm_fifo_fd > 0) {
312
313 unsigned char cmd = DONE;
314 if( write(mca_btl_smcuda_component.sm_fifo_fd,&cmd,sizeof(cmd)) !=
315 sizeof(cmd)){
316 opal_output(0, "mca_btl_smcuda_component_close: write fifo failed: errno=%d\n",
317 errno);
318 }
319 opal_thread_join(&mca_btl_smcuda_component.sm_fifo_thread, NULL);
320 close(mca_btl_smcuda_component.sm_fifo_fd);
321 unlink(mca_btl_smcuda_component.sm_fifo_path);
322 }
323 #endif
324
325 CLEANUP:
326
327 #if OPAL_CUDA_SUPPORT
328 mca_common_cuda_fini();
329 #endif
330
331
332 return return_value;
333 }
334
335
336
337
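/* Number of processes on this node, including this one. */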
338 static inline int
339 get_num_local_procs(void)
340 {
341
342
343 return (int)(1 + opal_process_info.num_local_peers);
344 }
345
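/* If btl_smcuda_max_procs was not set by the user, derive it from the local
 * process count: n + sm_extra_procs when that is set, otherwise 2 * n. */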
346 static void
347 calc_sm_max_procs(int n)
348 {
349
350 if (0 > mca_btl_smcuda_component.sm_max_procs) {
351
352 if (0 <= mca_btl_smcuda_component.sm_extra_procs) {
353
354 mca_btl_smcuda_component.sm_max_procs =
355 n + mca_btl_smcuda_component.sm_extra_procs;
356 } else {
357
358 mca_btl_smcuda_component.sm_max_procs = 2 * n;
359 }
360 }
361 }
362
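/* Thin wrapper around mca_common_sm_module_create_and_attach() that logs a
 * message when the shared memory segment cannot be created. */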
363 static int
364 create_and_attach(mca_btl_smcuda_component_t *comp_ptr,
365 size_t size,
366 char *file_name,
367 size_t size_ctl_structure,
368 size_t data_seg_alignment,
369 mca_common_sm_module_t **out_modp)
370
371 {
372 if (NULL == (*out_modp =
373 mca_common_sm_module_create_and_attach(size, file_name,
374 size_ctl_structure,
375 data_seg_alignment))) {
376 opal_output(0, "create_and_attach: unable to create shared memory "
377 "BTL coordinating strucure :: size %lu \n",
378 (unsigned long)size);
379 return OPAL_ERROR;
380 }
381 return OPAL_SUCCESS;
382 }
383
384 static int
385 get_mpool_res_size(int32_t max_procs,
386 size_t *out_res_size)
387 {
388 size_t size = 0;
389
390 *out_res_size = 0;
391
392
393
394
395
396
397
398
399
400
401
402
403
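/* Estimate the space one process needs (all FIFOs plus eager and max-size
 * fragments, each with cache line padding); the total below is this estimate
 * multiplied by the number of processes. */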
404 size = FIFO_MAP_NUM(max_procs) *
405 (sizeof(sm_fifo_t) + sizeof(void *) *
406 mca_btl_smcuda_component.fifo_size + 4 * opal_cache_line_size) +
407 (2 * max_procs + mca_btl_smcuda_component.sm_free_list_inc) *
408 (mca_btl_smcuda_component.eager_limit + 2 * opal_cache_line_size) +
409 mca_btl_smcuda_component.sm_free_list_num *
410 (mca_btl_smcuda_component.max_frag_size + 2 * opal_cache_line_size);
411
412
413 size += sizeof(mca_common_sm_module_t);
414
415
416
417
418
419
420
421 if (((double)size) * max_procs > LONG_MAX - 4096) {
422 return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
423 }
424 size *= (size_t)max_procs;
425 *out_res_size = size;
426 return OPAL_SUCCESS;
427 }
428
429
430
431
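/* Build unique per-node file names, under the job session directory, for the
 * mpool and sm control segments and their rendezvous files. */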
432 static int
433 set_uniq_paths_for_init_rndv(mca_btl_smcuda_component_t *comp_ptr)
434 {
435 int rc = OPAL_ERR_OUT_OF_RESOURCE;
436
437
438 comp_ptr->sm_mpool_ctl_file_name = NULL;
439 comp_ptr->sm_mpool_rndv_file_name = NULL;
440 comp_ptr->sm_ctl_file_name = NULL;
441 comp_ptr->sm_rndv_file_name = NULL;
442
443 if (opal_asprintf(&comp_ptr->sm_mpool_ctl_file_name,
444 "%s"OPAL_PATH_SEP"shared_mem_cuda_pool.%s",
445 opal_process_info.job_session_dir,
446 opal_process_info.nodename) < 0) {
447
448 goto out;
449 }
450 if (opal_asprintf(&comp_ptr->sm_mpool_rndv_file_name,
451 "%s"OPAL_PATH_SEP"shared_mem_cuda_pool_rndv.%s",
452 opal_process_info.job_session_dir,
453 opal_process_info.nodename) < 0) {
454
455 goto out;
456 }
457 if (opal_asprintf(&comp_ptr->sm_ctl_file_name,
458 "%s"OPAL_PATH_SEP"shared_mem_cuda_btl_module.%s",
459 opal_process_info.job_session_dir,
460 opal_process_info.nodename) < 0) {
461
462 goto out;
463 }
464 if (opal_asprintf(&comp_ptr->sm_rndv_file_name,
465 "%s"OPAL_PATH_SEP"shared_mem_cuda_btl_rndv.%s",
466 opal_process_info.job_session_dir,
467 opal_process_info.nodename) < 0) {
468
469 goto out;
470 }
471
472 rc = OPAL_SUCCESS;
473
474 out:
475 if (OPAL_SUCCESS != rc) {
476 if (comp_ptr->sm_mpool_ctl_file_name) {
477 free(comp_ptr->sm_mpool_ctl_file_name);
478 }
479 if (comp_ptr->sm_mpool_rndv_file_name) {
480 free(comp_ptr->sm_mpool_rndv_file_name);
481 }
482 if (comp_ptr->sm_ctl_file_name) {
483 free(comp_ptr->sm_ctl_file_name);
484 }
485 if (comp_ptr->sm_rndv_file_name) {
486 free(comp_ptr->sm_rndv_file_name);
487 }
488 }
489 return rc;
490 }
491
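/* Create the shared memory segment for either the mpool or the sm control
 * structure and write its opal_shmem_ds_t descriptor (plus, for the mpool,
 * its size) into the matching rendezvous file for the other local ranks. */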
492 static int
493 create_rndv_file(mca_btl_smcuda_component_t *comp_ptr,
494 mca_btl_sm_rndv_module_type_t type)
495 {
496 size_t size = 0;
497 int rc = OPAL_SUCCESS;
498 int fd = -1;
499 char *fname = NULL;
500
501 mca_common_sm_module_t *tmp_modp = NULL;
502
503 if (MCA_BTL_SM_RNDV_MOD_MPOOL == type) {
504
505 if (OPAL_SUCCESS != (rc = get_mpool_res_size(comp_ptr->sm_max_procs,
506 &size))) {
507
508 goto out;
509 }
510
511
512 if (size < mca_btl_smcuda_component.mpool_min_size) {
513 size = mca_btl_smcuda_component.mpool_min_size;
514 }
515
516
517
518
519 if (OPAL_SUCCESS != (rc =
520 create_and_attach(comp_ptr, size, comp_ptr->sm_mpool_ctl_file_name,
521 sizeof(mca_common_sm_module_t), 8, &tmp_modp))) {
522
523 goto out;
524 }
525 fname = comp_ptr->sm_mpool_rndv_file_name;
526 }
527 else if (MCA_BTL_SM_RNDV_MOD_SM == type) {
528
529 size = sizeof(mca_common_sm_seg_header_t) +
530 comp_ptr->sm_max_procs *
531 (sizeof(sm_fifo_t *) +
532 sizeof(char *) + sizeof(uint16_t)) +
533 opal_cache_line_size;
534
535 if (OPAL_SUCCESS != (rc =
536 create_and_attach(comp_ptr, size, comp_ptr->sm_ctl_file_name,
537 sizeof(mca_common_sm_seg_header_t),
538 opal_cache_line_size, &comp_ptr->sm_seg))) {
539
540 goto out;
541 }
542 fname = comp_ptr->sm_rndv_file_name;
543 tmp_modp = comp_ptr->sm_seg;
544 }
545 else {
546 return OPAL_ERR_BAD_PARAM;
547 }
548
549
550
551
552
553
554 if (-1 == (fd = open(fname, O_CREAT | O_RDWR, 0600))) {
555 int err = errno;
556 opal_show_help("help-mpi-btl-smcuda.txt", "sys call fail", true,
557 "open(2)", strerror(err), err);
558 rc = OPAL_ERR_IN_ERRNO;
559 goto out;
560 }
561 if ((ssize_t)sizeof(opal_shmem_ds_t) != write(fd, &(tmp_modp->shmem_ds),
562 sizeof(opal_shmem_ds_t))) {
563 int err = errno;
564 opal_show_help("help-mpi-btl-smcuda.txt", "sys call fail", true,
565 "write(2)", strerror(err), err);
566 rc = OPAL_ERR_IN_ERRNO;
567 goto out;
568 }
569 if (MCA_BTL_SM_RNDV_MOD_MPOOL == type) {
570 if ((ssize_t)sizeof(size) != write(fd, &size, sizeof(size))) {
571 int err = errno;
572 opal_show_help("help-mpi-btl-smcuda.txt", "sys call fail", true,
573 "write(2)", strerror(err), err);
574 rc = OPAL_ERR_IN_ERRNO;
575 goto out;
576 }
577
578 OBJ_RELEASE(tmp_modp);
579 }
580
581 out:
582 if (-1 != fd) {
583 (void)close(fd);
584 }
585 return rc;
586 }
587
588
589
590
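/* Set up the backing store paths; the lowest local rank additionally creates
 * the mpool and sm rendezvous files. */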
591 static int
592 backing_store_init(mca_btl_smcuda_component_t *comp_ptr,
593 uint32_t local_rank)
594 {
595 int rc = OPAL_SUCCESS;
596
597 if (OPAL_SUCCESS != (rc = set_uniq_paths_for_init_rndv(comp_ptr))) {
598 goto out;
599 }
600
601 if (0 == local_rank) {
602
603 if (OPAL_SUCCESS != (rc =
604 create_rndv_file(comp_ptr, MCA_BTL_SM_RNDV_MOD_MPOOL))) {
605 goto out;
606 }
607
608 if (OPAL_SUCCESS != (rc =
609 create_rndv_file(comp_ptr, MCA_BTL_SM_RNDV_MOD_SM))) {
610 goto out;
611 }
612 }
613
614 out:
615 return rc;
616 }
617
618 #if OPAL_CUDA_SUPPORT
619
620
621
622
623
624
625
626
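/* Send a CUDA IPC ACK (or NOTREADY) control fragment back to the peer and
 * update the endpoint's IPC state. */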
627 static void mca_btl_smcuda_send_cuda_ipc_ack(struct mca_btl_base_module_t* btl,
628 struct mca_btl_base_endpoint_t* endpoint, int ready)
629 {
630 mca_btl_smcuda_frag_t* frag;
631 ctrlhdr_t ctrlhdr;
632 int rc;
633
634 if ( mca_btl_smcuda_component.num_outstanding_frags * 2 > (int) mca_btl_smcuda_component.fifo_size ) {
635 mca_btl_smcuda_component_progress();
636 }
637
638
639 MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag);
640 if( OPAL_UNLIKELY(NULL == frag) ) {
641 endpoint->ipcstate = IPC_BAD;
642 return;
643 }
644
645 if (ready) {
646 ctrlhdr.ctag = IPC_ACK;
647 } else {
648 ctrlhdr.ctag = IPC_NOTREADY;
649 }
650
651
652 frag->hdr->tag = MCA_BTL_TAG_SMCUDA;
653 frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
654 frag->endpoint = endpoint;
655 memcpy(frag->segment.seg_addr.pval, &ctrlhdr, sizeof(struct ctrlhdr_st));
656
657
658
659
660
661
662
663 OPAL_THREAD_ADD_FETCH32(&mca_btl_smcuda_component.num_outstanding_frags, +1);
664
665 MCA_BTL_SMCUDA_FIFO_WRITE(endpoint, endpoint->my_smp_rank,
666 endpoint->peer_smp_rank, (void *) VIRTUAL2RELATIVE(frag->hdr), false, true, rc);
667
668
669 if (ready) {
670 endpoint->ipcstate = IPC_ACKED;
671 } else {
672 endpoint->ipcstate = IPC_INIT;
673 }
674
675 return;
676
677 }
678
679
680
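/* Active message callback for MCA_BTL_TAG_SMCUDA control fragments: drives
 * the CUDA IPC request / ack / not-ready handshake between local endpoints. */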
681 static void btl_smcuda_control(mca_btl_base_module_t* btl,
682 mca_btl_base_tag_t tag,
683 mca_btl_base_descriptor_t* des, void* cbdata)
684 {
685 int mydevnum, ipcaccess, res;
686 ctrlhdr_t ctrlhdr;
687 opal_proc_t *ep_proc;
688 struct mca_btl_base_endpoint_t *endpoint;
689 mca_btl_smcuda_t *smcuda_btl = (mca_btl_smcuda_t *)btl;
690 mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des;
691 mca_btl_base_segment_t* segments = des->des_segments;
692
693
694
695 endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank];
696 ep_proc = endpoint->proc_opal;
697
698
699 memcpy(&ctrlhdr, segments->seg_addr.pval, sizeof(struct ctrlhdr_st));
700
701
702 switch (ctrlhdr.ctag) {
703 case IPC_REQ:
704
705
706
707
708
709
710
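/* Proceed only if we have not issued our own request, or if both sides
 * requested simultaneously and this rank wins the higher-rank tie-break. */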
711 OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
712 if ((IPC_INIT == endpoint->ipcstate) ||
713 ((IPC_SENT == endpoint->ipcstate) && (endpoint->my_smp_rank > endpoint->peer_smp_rank))) {
714 endpoint->ipcstate = IPC_ACKING;
715 OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
716
717
718 if (!mca_common_cuda_enabled) {
719 opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
720 "Sending CUDA IPC NOTREADY: myrank=%d, peerrank=%d",
721 mca_btl_smcuda_component.my_smp_rank,
722 endpoint->peer_smp_rank);
723 mca_btl_smcuda_send_cuda_ipc_ack(btl, endpoint, 0);
724 return;
725 }
726
727
728
729 res = mca_common_cuda_get_device(&mydevnum);
730 if (0 != res) {
731 endpoint->ipcstate = IPC_BAD;
732 return;
733 }
734
735
736
737
738
739
740
741
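/* Same device: allowed only when use_cuda_ipc_same_gpu is set. Different
 * devices: ask the CUDA layer whether peer access is possible. */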
742 if (mydevnum == ctrlhdr.cudev) {
743 if (mca_btl_smcuda_component.use_cuda_ipc_same_gpu) {
744 ipcaccess = 1;
745 } else {
746 opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
747 "Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
748 "peerdev=%d --> Access is disabled by btl_smcuda_use_cuda_ipc_same_gpu",
749 endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
750 ctrlhdr.cudev);
751 endpoint->ipcstate = IPC_BAD;
752 return;
753 }
754 } else {
755 res = mca_common_cuda_device_can_access_peer(&ipcaccess, mydevnum, ctrlhdr.cudev);
756 if (0 != res) {
757 opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
758 "Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
759 "peerdev=%d --> Access is disabled because peer check failed with err=%d",
760 endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
761 ctrlhdr.cudev, res);
762 endpoint->ipcstate = IPC_BAD;
763 return;
764 }
765 }
766
767 assert(endpoint->peer_smp_rank == frag->hdr->my_smp_rank);
768 opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
769 "Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
770 "peerdev=%d --> ACCESS=%d",
771 endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
772 ctrlhdr.cudev, ipcaccess);
773
774 if (0 == ipcaccess) {
775
776 opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
777 "Not sending CUDA IPC ACK, no P2P support");
778 endpoint->ipcstate = IPC_BAD;
779 } else {
780
781 smcuda_btl->error_cb(&smcuda_btl->super, MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC,
782 ep_proc, (char *)&mca_btl_smcuda_component.cuda_ipc_output);
783 opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
784 "Sending CUDA IPC ACK: myrank=%d, mydev=%d, peerrank=%d, peerdev=%d",
785 endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
786 ctrlhdr.cudev);
787 mca_btl_smcuda_send_cuda_ipc_ack(btl, endpoint, 1);
788 }
789 } else {
790 OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
791 opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
792 "Not sending CUDA IPC ACK because request already initiated");
793 }
794 break;
795
796 case IPC_ACK:
797 opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
798 "Received CUDA IPC ACK, notifying PML: myrank=%d, peerrank=%d",
799 endpoint->my_smp_rank, endpoint->peer_smp_rank);
800
801 smcuda_btl->error_cb(&smcuda_btl->super, MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC,
802 ep_proc, (char *)&mca_btl_smcuda_component.cuda_ipc_output);
803 assert(endpoint->ipcstate == IPC_SENT);
804 endpoint->ipcstate = IPC_ACKED;
805 break;
806
807 case IPC_NOTREADY:
808
809
810 opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
811 "Received CUDA IPC NOTREADY, reset state to allow another attempt: "
812 "myrank=%d, peerrank=%d",
813 endpoint->my_smp_rank, endpoint->peer_smp_rank);
814 OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
815 if (IPC_SENT == endpoint->ipcstate) {
816 endpoint->ipcstate = IPC_INIT;
817 }
818 OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
819 break;
820
821 default:
822 opal_output(0, "Received UNKNOWN CUDA IPC control message. This should not happen.");
823 }
824 }
825
826 #endif
827
828
829
830
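/* Component init: requires a job session directory, a known local rank and at
 * least two local processes; sets up the backing store, publishes the single
 * smcuda BTL module and, with CUDA support, registers the control callback. */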
831 static mca_btl_base_module_t **
832 mca_btl_smcuda_component_init(int *num_btls,
833 bool enable_progress_threads,
834 bool enable_mpi_threads)
835 {
836 int num_local_procs = 0;
837 mca_btl_base_module_t **btls = NULL;
838 uint32_t my_local_rank = UINT32_MAX;
839
840 *num_btls = 0;
841
842 mca_btl_smcuda_component.sm_mpool = NULL;
843 mca_btl_smcuda_component.sm_mpool_base = NULL;
844
845 #if OPAL_CUDA_SUPPORT
846 mca_common_cuda_stage_one_init();
847 #endif
848
849
850 if (NULL == opal_process_info.job_session_dir) {
851
852
853
854 return NULL;
855 }
856
857
858
859
860
861
862 if (UINT32_MAX ==
863 (my_local_rank = opal_process_info.my_local_rank)) {
864 opal_show_help("help-mpi-btl-smcuda.txt", "no locality", true);
865 return NULL;
866 }
867
868 if ((num_local_procs = get_num_local_procs()) < 2) {
869 return NULL;
870 }
871
872
873 calc_sm_max_procs(num_local_procs);
874
875
876
877
878
879 if (OPAL_SUCCESS != backing_store_init(&mca_btl_smcuda_component,
880 my_local_rank)) {
881 return NULL;
882 }
883
884 #if OPAL_ENABLE_PROGRESS_THREADS == 1
885
886 sprintf( mca_btl_smcuda_component.sm_fifo_path,
887 "%s"OPAL_PATH_SEP"sm_fifo.%lu", opal_process_info.job_session_dir,
888 (unsigned long)OPAL_PROC_MY_NAME->vpid );
889 if(mkfifo(mca_btl_smcuda_component.sm_fifo_path, 0660) < 0) {
890 opal_output(0, "mca_btl_smcuda_component_init: mkfifo failed with errno=%d\n",errno);
891 return NULL;
892 }
893 mca_btl_smcuda_component.sm_fifo_fd = open(mca_btl_smcuda_component.sm_fifo_path,
894 O_RDWR);
895 if(mca_btl_smcuda_component.sm_fifo_fd < 0) {
896 opal_output(0, "mca_btl_smcuda_component_init: "
897 "open(%s) failed with errno=%d\n",
898 mca_btl_smcuda_component.sm_fifo_path, errno);
899 return NULL;
900 }
901
902 OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_fifo_thread, opal_thread_t);
903 mca_btl_smcuda_component.sm_fifo_thread.t_run =
904 (opal_thread_fn_t)mca_btl_smcuda_component_event_thread;
905 opal_thread_start(&mca_btl_smcuda_component.sm_fifo_thread);
906 #endif
907
908 mca_btl_smcuda_component.sm_btls =
909 (mca_btl_smcuda_t **)malloc(mca_btl_smcuda_component.sm_max_btls *
910 sizeof(mca_btl_smcuda_t *));
911 if (NULL == mca_btl_smcuda_component.sm_btls) {
912 return NULL;
913 }
914
915
916 *num_btls = 1;
917 btls = (mca_btl_base_module_t**)malloc(sizeof(mca_btl_base_module_t*));
918 if (NULL == btls) {
919 return NULL;
920 }
921
922
923 btls[0] = (mca_btl_base_module_t*)(&(mca_btl_smcuda));
924 mca_btl_smcuda_component.sm_btls[0] = (mca_btl_smcuda_t*)(&(mca_btl_smcuda));
925
926
927
928 mca_btl_smcuda_component.num_smp_procs = 0;
929 mca_btl_smcuda_component.my_smp_rank = -1;
930 mca_btl_smcuda_component.sm_num_btls = 1;
931
932 mca_btl_smcuda.btl_inited = false;
933
934 #if OPAL_CUDA_SUPPORT
935
936 mca_btl_smcuda.super.btl_get = mca_btl_smcuda_get_cuda;
937
938 mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control;
939 mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL;
940 #endif
941
942 return btls;
943
944 }
945
946
947
948
949
950
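/* Progress thread body: block reading the named FIFO and call
 * mca_btl_smcuda_component_progress() until a DONE byte arrives. */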
951 #if OPAL_ENABLE_PROGRESS_THREADS == 1
952 void mca_btl_smcuda_component_event_thread(opal_object_t* thread)
953 {
954 while(1) {
955 unsigned char cmd;
956 if(read(mca_btl_smcuda_component.sm_fifo_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) {
957
958 return;
959 }
960 if( DONE == cmd ){
961
962 return;
963 }
964 mca_btl_smcuda_component_progress();
965 }
966 }
967 #endif
968
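/* Drain an endpoint's list of queued sends, pushing each one back into the
 * FIFO and returning the bookkeeping item to its free list. */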
969 void btl_smcuda_process_pending_sends(struct mca_btl_base_endpoint_t *ep)
970 {
971 btl_smcuda_pending_send_item_t *si;
972 int rc;
973
974 while ( 0 < opal_list_get_size(&ep->pending_sends) ) {
975
976
977
978
979 OPAL_THREAD_LOCK(&ep->endpoint_lock);
980 si = (btl_smcuda_pending_send_item_t*)opal_list_remove_first(&ep->pending_sends);
981 OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
982
983 if(NULL == si) return;
984
985 OPAL_THREAD_ADD_FETCH32(&mca_btl_smcuda_component.num_pending_sends, -1);
986
987 MCA_BTL_SMCUDA_FIFO_WRITE(ep, ep->my_smp_rank, ep->peer_smp_rank, si->data,
988 true, false, rc);
989
990 opal_free_list_return (&mca_btl_smcuda_component.pending_send_fl, (opal_free_list_item_t*)si);
991
992 if ( OPAL_SUCCESS != rc )
993 return;
994 }
995 }
996
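/* Progress: flush pending sends, poll this process's receive FIFOs for send
 * and ack fragments and, with CUDA support, complete finished CUDA IPC
 * events. Returns the number of events handled. */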
997 int mca_btl_smcuda_component_progress(void)
998 {
999
1000 mca_btl_base_segment_t seg;
1001 mca_btl_smcuda_frag_t *frag;
1002 mca_btl_smcuda_frag_t Frag;
1003 sm_fifo_t *fifo = NULL;
1004 mca_btl_smcuda_hdr_t *hdr;
1005 int my_smp_rank = mca_btl_smcuda_component.my_smp_rank;
1006 int peer_smp_rank, j, rc = 0, nevents = 0;
1007
1008
1009
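/* First give endpoints with queued sends a chance to drain them. */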
1010 if ( 0 < mca_btl_smcuda_component.num_pending_sends ) {
1011
1012
1013
1014 for ( peer_smp_rank = 0; peer_smp_rank < mca_btl_smcuda_component.num_smp_procs; peer_smp_rank++) {
1015 struct mca_btl_base_endpoint_t* endpoint;
1016 if ( peer_smp_rank == my_smp_rank )
1017 continue;
1018 endpoint = mca_btl_smcuda_component.sm_peers[peer_smp_rank];
1019 if ( 0 < opal_list_get_size(&endpoint->pending_sends) )
1020 btl_smcuda_process_pending_sends(endpoint);
1021 }
1022 }
1023
1024
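/* Poll each receive FIFO assigned to this process. */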
1025 for(j = 0; j < FIFO_MAP_NUM(mca_btl_smcuda_component.num_smp_procs); j++) {
1026 fifo = &(mca_btl_smcuda_component.fifo[my_smp_rank][j]);
1027 recheck_peer:
1028
1029 if(opal_using_threads()) {
1030 opal_atomic_lock(&(fifo->tail_lock));
1031 }
1032
1033 hdr = (mca_btl_smcuda_hdr_t *)sm_fifo_read(fifo);
1034
1035
1036 if(opal_using_threads()) {
1037 opal_atomic_unlock(&(fifo->tail_lock));
1038 }
1039
1040 if(SM_FIFO_FREE == hdr) {
1041 continue;
1042 }
1043
1044 nevents++;
1045
1046 switch(((uintptr_t)hdr) & MCA_BTL_SMCUDA_FRAG_TYPE_MASK) {
1047 case MCA_BTL_SMCUDA_FRAG_SEND:
1048 {
1049 mca_btl_active_message_callback_t* reg;
1050
1051
1052 hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
1053 peer_smp_rank = hdr->my_smp_rank;
1054 #if OPAL_ENABLE_DEBUG
1055 if ( FIFO_MAP(peer_smp_rank) != j ) {
1056 opal_output(0, "mca_btl_smcuda_component_progress: "
1057 "rank %d got %d on FIFO %d, but this sender should send to FIFO %d\n",
1058 my_smp_rank, peer_smp_rank, j, FIFO_MAP(peer_smp_rank));
1059 }
1060 #endif
1061
1062 reg = mca_btl_base_active_message_trigger + hdr->tag;
1063 seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_smcuda_hdr_t);
1064 seg.seg_len = hdr->len;
1065 Frag.base.des_segment_count = 1;
1066 Frag.base.des_segments = &seg;
1067 #if OPAL_CUDA_SUPPORT
1068 Frag.hdr = hdr;
1069 #endif
1070 reg->cbfunc(&mca_btl_smcuda.super, hdr->tag, &(Frag.base),
1071 reg->cbdata);
1072
1073 MCA_BTL_SMCUDA_FIFO_WRITE(
1074 mca_btl_smcuda_component.sm_peers[peer_smp_rank],
1075 my_smp_rank, peer_smp_rank, hdr->frag, false, true, rc);
1076 break;
1077 }
1078 case MCA_BTL_SMCUDA_FRAG_ACK:
1079 {
1080 int status = (uintptr_t)hdr & MCA_BTL_SMCUDA_FRAG_STATUS_MASK;
1081 int btl_ownership;
1082 struct mca_btl_base_endpoint_t* endpoint;
1083
1084 frag = (mca_btl_smcuda_frag_t *)((char*)((uintptr_t)hdr &
1085 (~(MCA_BTL_SMCUDA_FRAG_TYPE_MASK |
1086 MCA_BTL_SMCUDA_FRAG_STATUS_MASK))));
1087
1088 endpoint = frag->endpoint;
1089 btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
1090 if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
1091
1092 frag->base.des_cbfunc(&mca_btl_smcuda.super, frag->endpoint,
1093 &frag->base, status?OPAL_ERROR:OPAL_SUCCESS);
1094 }
1095 if( btl_ownership ) {
1096 MCA_BTL_SMCUDA_FRAG_RETURN(frag);
1097 }
1098 OPAL_THREAD_ADD_FETCH32(&mca_btl_smcuda_component.num_outstanding_frags, -1);
1099 if ( 0 < opal_list_get_size(&endpoint->pending_sends) ) {
1100 btl_smcuda_process_pending_sends(endpoint);
1101 }
1102 goto recheck_peer;
1103 }
1104 default:
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116 opal_output(0, "mca_btl_smcuda_component_progress read an unknown type of header");
1117 hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
1118 peer_smp_rank = hdr->my_smp_rank;
1119 hdr = (mca_btl_smcuda_hdr_t*)((uintptr_t)hdr->frag |
1120 MCA_BTL_SMCUDA_FRAG_STATUS_MASK);
1121 MCA_BTL_SMCUDA_FIFO_WRITE(
1122 mca_btl_smcuda_component.sm_peers[peer_smp_rank],
1123 my_smp_rank, peer_smp_rank, hdr, false, true, rc);
1124 break;
1125 }
1126 }
1127 (void)rc;
1128
1129 #if OPAL_CUDA_SUPPORT
1130
1131
1132
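/* Complete finished CUDA IPC copies: call the cached RDMA completion callback
 * and, when a registration is attached, deregister it and return the frag. */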
1133 while (1 == progress_one_cuda_ipc_event((mca_btl_base_descriptor_t **)&frag)) {
1134 mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t) frag->base.des_cbfunc;
1135
1136 cbfunc (&mca_btl_smcuda.super, frag->endpoint, frag->segment.seg_addr.pval,
1137 frag->local_handle, frag->base.des_context, frag->base.des_cbdata,
1138 OPAL_SUCCESS);
1139
1140 if(frag->registration != NULL) {
1141 frag->endpoint->rcache->rcache_deregister (frag->endpoint->rcache,
1142 (mca_rcache_base_registration_t*)frag->registration);
1143 frag->registration = NULL;
1144 MCA_BTL_SMCUDA_FRAG_RETURN(frag);
1145 }
1146 nevents++;
1147 }
1148 #endif
1149 return nevents;
1150 }