This source file includes the following definitions.
- mca_btl_smcuda_param_register_int
- mca_btl_smcuda_param_register_uint
- mca_btl_smcuda_component_verify
- smcuda_register
- mca_btl_smcuda_component_open
- mca_btl_smcuda_component_close
- get_num_local_procs
- calc_sm_max_procs
- create_and_attach
- get_mpool_res_size
- set_uniq_paths_for_init_rndv
- create_rndv_file
- backing_store_init
- mca_btl_smcuda_send_cuda_ipc_ack
- btl_smcuda_control
- mca_btl_smcuda_component_init
- mca_btl_smcuda_component_event_thread
- btl_smcuda_process_pending_sends
- mca_btl_smcuda_component_progress
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 #include "opal_config.h"
  27 #include <errno.h>
  28 #ifdef HAVE_UNISTD_H
  29 #include <unistd.h>
  30 #endif  
  31 #include <string.h>
  32 #ifdef HAVE_FCNTL_H
  33 #include <fcntl.h>
  34 #endif  
  35 #ifdef HAVE_SYS_TYPES_H
  36 #include <sys/types.h>
  37 #endif  
  38 #ifdef HAVE_SYS_MMAN_H
  39 #include <sys/mman.h>
  40 #endif  
  41 #ifdef HAVE_SYS_STAT_H
  42 #include <sys/stat.h>  
  43 #endif  
  44 
  45 #include "opal/mca/shmem/base/base.h"
  46 #include "opal/mca/shmem/shmem.h"
  47 #include "opal/util/bit_ops.h"
  48 #include "opal/util/output.h"
  49 #include "opal/util/show_help.h"
  50 #include "opal/util/printf.h"
  51 
  52 #include "opal/mca/mpool/base/base.h"
  53 #include "opal/mca/common/sm/common_sm.h"
  54 #include "opal/mca/btl/base/btl_base_error.h"
  55 #include "opal/runtime/opal_params.h"
  56 
  57 #if OPAL_CUDA_SUPPORT
  58 #include "opal/mca/common/cuda/common_cuda.h"
  59 #endif 
  60 #if OPAL_ENABLE_FT_CR    == 1
  61 #include "opal/runtime/opal_cr.h"
  62 #endif
  63 
  64 #include "btl_smcuda.h"
  65 #include "btl_smcuda_frag.h"
  66 #include "btl_smcuda_fifo.h"
  67 
  68 static int mca_btl_smcuda_component_open(void);
  69 static int mca_btl_smcuda_component_close(void);
  70 static int smcuda_register(void);
  71 static mca_btl_base_module_t** mca_btl_smcuda_component_init(
  72     int *num_btls,
  73     bool enable_progress_threads,
  74     bool enable_mpi_threads
  75 );
  76 
  77 typedef enum {
  78     MCA_BTL_SM_RNDV_MOD_SM = 0,
  79     MCA_BTL_SM_RNDV_MOD_MPOOL
  80 } mca_btl_sm_rndv_module_type_t;
  81 
  82 
  83 
  84 
  85 mca_btl_smcuda_component_t mca_btl_smcuda_component = {
  86     .super = {
  87         
  88 
  89         .btl_version = {
  90             MCA_BTL_DEFAULT_VERSION("smcuda"),
  91             .mca_open_component = mca_btl_smcuda_component_open,
  92             .mca_close_component = mca_btl_smcuda_component_close,
  93             .mca_register_component_params = smcuda_register,
  94         },
  95         .btl_data = {
  96             
  97             .param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
  98         },
  99 
 100         .btl_init = mca_btl_smcuda_component_init,
 101         .btl_progress = mca_btl_smcuda_component_progress,
 102     }  
 103 };
 104 
 105 
 106 
 107 
 108 
 109 
 110 static inline int mca_btl_smcuda_param_register_int(
 111     const char* param_name,
 112     int default_value,
 113     int level,
 114     int *storage)
 115 {
 116     *storage = default_value;
 117     (void) mca_base_component_var_register (&mca_btl_smcuda_component.super.btl_version,
 118                                             param_name, NULL, MCA_BASE_VAR_TYPE_INT,
 119                                             NULL, 0, 0, level,
 120                                             MCA_BASE_VAR_SCOPE_READONLY, storage);
 121     return *storage;
 122 }
 123 
 124 static inline unsigned int mca_btl_smcuda_param_register_uint(
 125     const char* param_name,
 126     unsigned int default_value,
 127     int level,
 128     unsigned int *storage)
 129 {
 130     *storage = default_value;
 131     (void) mca_base_component_var_register (&mca_btl_smcuda_component.super.btl_version,
 132                                             param_name, NULL, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
 133                                             NULL, 0, 0, level,
 134                                             MCA_BASE_VAR_SCOPE_READONLY, storage);
 135     return *storage;
 136 }
 137 
 138 static int mca_btl_smcuda_component_verify(void) {
 139 
 140     return mca_btl_base_param_verify(&mca_btl_smcuda.super);
 141 }
 142 
 143 static int smcuda_register(void)
 144 {
 145     
 146     mca_btl_smcuda_component.mpool_min_size = 134217728;
 147     (void) mca_base_component_var_register(&mca_btl_smcuda_component.super.btl_version, "min_size",
 148                                            "Minimum size of the common/sm mpool shared memory file",
 149                                            MCA_BASE_VAR_TYPE_UNSIGNED_LONG, NULL, 0, 0,
 150                                            OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
 151                                            &mca_btl_smcuda_component.mpool_min_size);
 152 
 153     mca_btl_smcuda_param_register_int("free_list_num", 8, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_num);
 154     mca_btl_smcuda_param_register_int("free_list_max", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_max);
 155     mca_btl_smcuda_param_register_int("free_list_inc", 64, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_inc);
 156     mca_btl_smcuda_param_register_int("max_procs", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_max_procs);
 157     
 158 
 159     mca_btl_smcuda_component.sm_mpool_name = "sm";
 160     mca_btl_smcuda_param_register_uint("fifo_size", 4096, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.fifo_size);
 161     mca_btl_smcuda_param_register_int("num_fifos", 1, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.nfifos);
 162 
 163     mca_btl_smcuda_param_register_uint("fifo_lazy_free", 120, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.fifo_lazy_free);
 164 
 165     
 166     mca_btl_smcuda_param_register_int("sm_extra_procs", 0, OPAL_INFO_LVL_9, &mca_btl_smcuda_component.sm_extra_procs);
 167 
 168     mca_btl_smcuda_component.allocator = "bucket";
 169     (void) mca_base_component_var_register (&mca_btl_smcuda_component.super.btl_version, "allocator",
 170                                             "Name of allocator component to use for btl/smcuda allocations",
 171                                             MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9,
 172                                             MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_smcuda_component.allocator);
 173 
 174 #if OPAL_CUDA_SUPPORT
 175     
 176     if (opal_cuda_support) {
 177         mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH+1;
 178     } else {
 179         mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW;
 180     }
 181     mca_btl_smcuda_param_register_int("use_cuda_ipc", 1, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.use_cuda_ipc);
 182     mca_btl_smcuda_param_register_int("use_cuda_ipc_same_gpu", 1, OPAL_INFO_LVL_4,&mca_btl_smcuda_component.use_cuda_ipc_same_gpu);
 183     mca_btl_smcuda_param_register_int("cuda_ipc_verbose", 0, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ipc_verbose);
 184     mca_btl_smcuda_component.cuda_ipc_output = opal_output_open(NULL);
 185     opal_output_set_verbosity(mca_btl_smcuda_component.cuda_ipc_output, mca_btl_smcuda_component.cuda_ipc_verbose);
 186 #else 
 187     mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW;
 188 #endif 
 189     mca_btl_smcuda.super.btl_eager_limit = 4*1024;
 190     mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024;
 191     mca_btl_smcuda.super.btl_max_send_size = 32*1024;
 192     mca_btl_smcuda.super.btl_rdma_pipeline_send_length = 64*1024;
 193     mca_btl_smcuda.super.btl_rdma_pipeline_frag_size = 64*1024;
 194     mca_btl_smcuda.super.btl_min_rdma_pipeline_size = 64*1024;
 195     mca_btl_smcuda.super.btl_flags = MCA_BTL_FLAGS_SEND;
 196     mca_btl_smcuda.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
 197     mca_btl_smcuda.super.btl_bandwidth = 9000;  
 198     mca_btl_smcuda.super.btl_latency   = 1;     
 199 
 200     
 201     mca_btl_base_param_register(&mca_btl_smcuda_component.super.btl_version,
 202                                 &mca_btl_smcuda.super);
 203 #if OPAL_CUDA_SUPPORT
 204     
 205     if (0 == mca_btl_smcuda.super.btl_cuda_max_send_size) {
 206         mca_btl_smcuda.super.btl_cuda_max_send_size = 128*1024;
 207     }
 208     
 209 
 210     if (0 == mca_btl_smcuda.super.btl_cuda_eager_limit) {
 211         mca_btl_smcuda.super.btl_cuda_eager_limit = SIZE_MAX; 
 212     }
 213     mca_common_cuda_register_mca_variables();
 214 #endif 
 215     return mca_btl_smcuda_component_verify();
 216 }
 217 
 218 
 219 
 220 
 221 
 222 
 223 static int mca_btl_smcuda_component_open(void)
 224 {
 225     if (OPAL_SUCCESS != mca_btl_smcuda_component_verify()) {
 226         return OPAL_ERROR;
 227     }
 228 
 229     mca_btl_smcuda_component.sm_max_btls = 1;
 230 
 231     
 232     mca_btl_smcuda_component.nfifos = opal_next_poweroftwo_inclusive (mca_btl_smcuda_component.nfifos);
 233 
 234     
 235     if (mca_btl_smcuda_component.fifo_lazy_free >= (mca_btl_smcuda_component.fifo_size >> 1) )
 236         mca_btl_smcuda_component.fifo_lazy_free  = (mca_btl_smcuda_component.fifo_size >> 1);
 237     if (mca_btl_smcuda_component.fifo_lazy_free <= 0)
 238         mca_btl_smcuda_component.fifo_lazy_free  = 1;
 239 
 240     mca_btl_smcuda_component.max_frag_size = mca_btl_smcuda.super.btl_max_send_size;
 241     mca_btl_smcuda_component.eager_limit = mca_btl_smcuda.super.btl_eager_limit;
 242 
 243 #if OPAL_CUDA_SUPPORT
 244     
 245     if (mca_btl_smcuda.super.btl_cuda_max_send_size > mca_btl_smcuda.super.btl_max_send_size) {
 246         mca_btl_smcuda_component.max_frag_size = mca_btl_smcuda.super.btl_cuda_max_send_size;
 247     }
 248     opal_output_verbose(10, opal_btl_base_framework.framework_output,
 249                         "btl: smcuda: cuda_max_send_size=%d, max_send_size=%d, max_frag_size=%d",
 250                         (int)mca_btl_smcuda.super.btl_cuda_max_send_size, (int)mca_btl_smcuda.super.btl_max_send_size,
 251                         (int)mca_btl_smcuda_component.max_frag_size);
 252 #endif 
 253 
 254     
 255     OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_lock, opal_mutex_t);
 256     OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_eager, opal_free_list_t);
 257     OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_max, opal_free_list_t);
 258     OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_user, opal_free_list_t);
 259     OBJ_CONSTRUCT(&mca_btl_smcuda_component.pending_send_fl, opal_free_list_t);
 260     return OPAL_SUCCESS;
 261 }
 262 
 263 
 264 
 265 
 266 
 267 
 268 static int mca_btl_smcuda_component_close(void)
 269 {
 270     int return_value = OPAL_SUCCESS;
 271 
 272 
 273     OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_lock);
 274     
 275 
 276 
 277 
 278 
 279     
 280     
 281 
 282     
 283     if(mca_btl_smcuda_component.sm_seg != NULL) {
 284         return_value = mca_common_sm_fini( mca_btl_smcuda_component.sm_seg );
 285         if( OPAL_SUCCESS != return_value ) {
 286             return_value = OPAL_ERROR;
 287             opal_output(0," mca_common_sm_fini failed\n");
 288             goto CLEANUP;
 289         }
 290 
 291         
 292 
 293 
 294 
 295 #if OPAL_ENABLE_FT_CR    == 1
 296         
 297 
 298 
 299         if(OPAL_CR_STATUS_RESTART_PRE  != opal_cr_checkpointing_state &&
 300            OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) {
 301             unlink(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
 302         }
 303 #else
 304         unlink(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
 305 #endif
 306         OBJ_RELEASE(mca_btl_smcuda_component.sm_seg);
 307     }
 308 
 309 #if OPAL_ENABLE_PROGRESS_THREADS == 1
 310     
 311     if(mca_btl_smcuda_component.sm_fifo_fd > 0) {
 312         
 313         unsigned char cmd = DONE;
 314         if( write(mca_btl_smcuda_component.sm_fifo_fd,&cmd,sizeof(cmd)) !=
 315                 sizeof(cmd)){
 316             opal_output(0, "mca_btl_smcuda_component_close: write fifo failed: errno=%d\n",
 317                     errno);
 318         }
 319         opal_thread_join(&mca_btl_smcuda_component.sm_fifo_thread, NULL);
 320         close(mca_btl_smcuda_component.sm_fifo_fd);
 321         unlink(mca_btl_smcuda_component.sm_fifo_path);
 322     }
 323 #endif
 324 
 325 CLEANUP:
 326 
 327 #if OPAL_CUDA_SUPPORT
 328     mca_common_cuda_fini();
 329 #endif 
 330 
 331     
 332     return return_value;
 333 }
 334 
 335 
 336 
 337 
 338 static inline int
 339 get_num_local_procs(void)
 340 {
 341     
 342 
 343     return (int)(1 + opal_process_info.num_local_peers);
 344 }
 345 
 346 static void
 347 calc_sm_max_procs(int n)
 348 {
 349     
 350     if (0 > mca_btl_smcuda_component.sm_max_procs) {
 351         
 352         if (0 <= mca_btl_smcuda_component.sm_extra_procs) {
 353             
 354             mca_btl_smcuda_component.sm_max_procs =
 355                 n + mca_btl_smcuda_component.sm_extra_procs;
 356         } else {
 357             
 358             mca_btl_smcuda_component.sm_max_procs = 2 * n;
 359         }
 360     }
 361 }
 362 
 363 static int
 364 create_and_attach(mca_btl_smcuda_component_t *comp_ptr,
 365                   size_t size,
 366                   char *file_name,
 367                   size_t size_ctl_structure,
 368                   size_t data_seg_alignment,
 369                   mca_common_sm_module_t **out_modp)
 370 
 371 {
 372     if (NULL == (*out_modp =
 373         mca_common_sm_module_create_and_attach(size, file_name,
 374                                                size_ctl_structure,
 375                                                data_seg_alignment))) {
 376         opal_output(0, "create_and_attach: unable to create shared memory "
 377                     "BTL coordinating strucure :: size %lu \n",
 378                     (unsigned long)size);
 379         return OPAL_ERROR;
 380     }
 381     return OPAL_SUCCESS;
 382 }
 383 
 384 static int
 385 get_mpool_res_size(int32_t max_procs,
 386                    size_t *out_res_size)
 387 {
 388     size_t size = 0;
 389 
 390     *out_res_size = 0;
 391     
 392     
 393 
 394 
 395 
 396 
 397 
 398 
 399 
 400 
 401 
 402 
 403 
 404     size = FIFO_MAP_NUM(max_procs) *
 405            (sizeof(sm_fifo_t) + sizeof(void *) *
 406             mca_btl_smcuda_component.fifo_size + 4 * opal_cache_line_size) +
 407            (2 * max_procs + mca_btl_smcuda_component.sm_free_list_inc) *
 408            (mca_btl_smcuda_component.eager_limit + 2 * opal_cache_line_size) +
 409            mca_btl_smcuda_component.sm_free_list_num *
 410            (mca_btl_smcuda_component.max_frag_size + 2 * opal_cache_line_size);
 411 
 412     
 413     size += sizeof(mca_common_sm_module_t);
 414 
 415     
 416     
 417 
 418 
 419 
 420 
 421     if (((double)size) * max_procs > LONG_MAX - 4096) {
 422         return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
 423     }
 424     size *= (size_t)max_procs;
 425     *out_res_size = size;
 426     return OPAL_SUCCESS;
 427 }
 428 
 429 
 430 
 431 
 432 static int
 433 set_uniq_paths_for_init_rndv(mca_btl_smcuda_component_t *comp_ptr)
 434 {
 435     int rc = OPAL_ERR_OUT_OF_RESOURCE;
 436 
 437     
 438     comp_ptr->sm_mpool_ctl_file_name = NULL;
 439     comp_ptr->sm_mpool_rndv_file_name = NULL;
 440     comp_ptr->sm_ctl_file_name = NULL;
 441     comp_ptr->sm_rndv_file_name = NULL;
 442 
 443     if (opal_asprintf(&comp_ptr->sm_mpool_ctl_file_name,
 444                  "%s"OPAL_PATH_SEP"shared_mem_cuda_pool.%s",
 445                  opal_process_info.job_session_dir,
 446                  opal_process_info.nodename) < 0) {
 447         
 448         goto out;
 449     }
 450     if (opal_asprintf(&comp_ptr->sm_mpool_rndv_file_name,
 451                  "%s"OPAL_PATH_SEP"shared_mem_cuda_pool_rndv.%s",
 452                  opal_process_info.job_session_dir,
 453                  opal_process_info.nodename) < 0) {
 454         
 455         goto out;
 456     }
 457     if (opal_asprintf(&comp_ptr->sm_ctl_file_name,
 458                  "%s"OPAL_PATH_SEP"shared_mem_cuda_btl_module.%s",
 459                  opal_process_info.job_session_dir,
 460                  opal_process_info.nodename) < 0) {
 461         
 462         goto out;
 463     }
 464     if (opal_asprintf(&comp_ptr->sm_rndv_file_name,
 465                  "%s"OPAL_PATH_SEP"shared_mem_cuda_btl_rndv.%s",
 466                  opal_process_info.job_session_dir,
 467                  opal_process_info.nodename) < 0) {
 468         
 469         goto out;
 470     }
 471     
 472     rc = OPAL_SUCCESS;
 473 
 474 out:
 475     if (OPAL_SUCCESS != rc) {
 476         if (comp_ptr->sm_mpool_ctl_file_name) {
 477             free(comp_ptr->sm_mpool_ctl_file_name);
 478         }
 479         if (comp_ptr->sm_mpool_rndv_file_name) {
 480             free(comp_ptr->sm_mpool_rndv_file_name);
 481         }
 482         if (comp_ptr->sm_ctl_file_name) {
 483             free(comp_ptr->sm_ctl_file_name);
 484         }
 485         if (comp_ptr->sm_rndv_file_name) {
 486             free(comp_ptr->sm_rndv_file_name);
 487         }
 488     }
 489     return rc;
 490 }
 491 
 492 static int
 493 create_rndv_file(mca_btl_smcuda_component_t *comp_ptr,
 494                   mca_btl_sm_rndv_module_type_t type)
 495 {
 496     size_t size = 0;
 497     int rc = OPAL_SUCCESS;
 498     int fd = -1;
 499     char *fname = NULL;
 500     
 501     mca_common_sm_module_t *tmp_modp = NULL;
 502 
 503     if (MCA_BTL_SM_RNDV_MOD_MPOOL == type) {
 504         
 505         if (OPAL_SUCCESS != (rc = get_mpool_res_size(comp_ptr->sm_max_procs,
 506                                                      &size))) {
 507             
 508             goto out;
 509         }
 510 
 511         
 512         if (size < mca_btl_smcuda_component.mpool_min_size) {
 513             size = mca_btl_smcuda_component.mpool_min_size;
 514         }
 515 
 516         
 517 
 518 
 519         if (OPAL_SUCCESS != (rc =
 520             create_and_attach(comp_ptr, size, comp_ptr->sm_mpool_ctl_file_name,
 521                               sizeof(mca_common_sm_module_t), 8, &tmp_modp))) {
 522             
 523             goto out;
 524         }
 525         fname = comp_ptr->sm_mpool_rndv_file_name;
 526     }
 527     else if (MCA_BTL_SM_RNDV_MOD_SM == type) {
 528         
 529         size = sizeof(mca_common_sm_seg_header_t) +
 530                comp_ptr->sm_max_procs *
 531                (sizeof(sm_fifo_t *) +
 532                 sizeof(char *) + sizeof(uint16_t)) +
 533                opal_cache_line_size;
 534 
 535         if (OPAL_SUCCESS != (rc =
 536             create_and_attach(comp_ptr, size, comp_ptr->sm_ctl_file_name,
 537                               sizeof(mca_common_sm_seg_header_t),
 538                               opal_cache_line_size, &comp_ptr->sm_seg))) {
 539             
 540             goto out;
 541         }
 542         fname = comp_ptr->sm_rndv_file_name;
 543         tmp_modp = comp_ptr->sm_seg;
 544     }
 545     else {
 546         return OPAL_ERR_BAD_PARAM;
 547     }
 548 
 549     
 550 
 551 
 552     
 553 
 554     if (-1 == (fd = open(fname, O_CREAT | O_RDWR, 0600))) {
 555         int err = errno;
 556         opal_show_help("help-mpi-btl-smcuda.txt", "sys call fail", true,
 557                        "open(2)", strerror(err), err);
 558         rc = OPAL_ERR_IN_ERRNO;
 559         goto out;
 560     }
 561     if ((ssize_t)sizeof(opal_shmem_ds_t) != write(fd, &(tmp_modp->shmem_ds),
 562                                                   sizeof(opal_shmem_ds_t))) {
 563         int err = errno;
 564         opal_show_help("help-mpi-btl-smcuda.txt", "sys call fail", true,
 565                        "write(2)", strerror(err), err);
 566         rc = OPAL_ERR_IN_ERRNO;
 567         goto out;
 568     }
 569     if (MCA_BTL_SM_RNDV_MOD_MPOOL == type) {
 570         if ((ssize_t)sizeof(size) != write(fd, &size, sizeof(size))) {
 571             int err = errno;
 572             opal_show_help("help-mpi-btl-smcuda.txt", "sys call fail", true,
 573                            "write(2)", strerror(err), err);
 574             rc = OPAL_ERR_IN_ERRNO;
 575             goto out;
 576         }
 577         
 578         OBJ_RELEASE(tmp_modp);
 579     }
 580 
 581 out:
 582     if (-1 != fd) {
 583         (void)close(fd);
 584     }
 585     return rc;
 586 }
 587 
 588 
 589 
 590 
 591 static int
 592 backing_store_init(mca_btl_smcuda_component_t *comp_ptr,
 593                    uint32_t local_rank)
 594 {
 595     int rc = OPAL_SUCCESS;
 596 
 597     if (OPAL_SUCCESS != (rc = set_uniq_paths_for_init_rndv(comp_ptr))) {
 598         goto out;
 599     }
 600     
 601     if (0 == local_rank) {
 602         
 603         if (OPAL_SUCCESS != (rc =
 604             create_rndv_file(comp_ptr, MCA_BTL_SM_RNDV_MOD_MPOOL))) {
 605             goto out;
 606         }
 607         
 608         if (OPAL_SUCCESS != (rc =
 609             create_rndv_file(comp_ptr, MCA_BTL_SM_RNDV_MOD_SM))) {
 610             goto out;
 611         }
 612     }
 613 
 614 out:
 615     return rc;
 616 }
 617 
 618 #if OPAL_CUDA_SUPPORT
 619 
 620 
 621 
 622 
 623 
 624 
 625 
 626 
 627 static void mca_btl_smcuda_send_cuda_ipc_ack(struct mca_btl_base_module_t* btl,
 628                                              struct mca_btl_base_endpoint_t* endpoint, int ready)
 629 {
 630     mca_btl_smcuda_frag_t* frag;
 631     ctrlhdr_t ctrlhdr;
 632     int rc;
 633 
 634     if ( mca_btl_smcuda_component.num_outstanding_frags * 2 > (int) mca_btl_smcuda_component.fifo_size ) {
 635         mca_btl_smcuda_component_progress();
 636     }
 637 
 638     
 639     MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag);
 640     if( OPAL_UNLIKELY(NULL == frag) ) {
 641         endpoint->ipcstate = IPC_BAD;
 642         return;
 643     }
 644 
 645     if (ready) {
 646         ctrlhdr.ctag = IPC_ACK;
 647     } else {
 648         ctrlhdr.ctag = IPC_NOTREADY;
 649     }
 650 
 651     
 652     frag->hdr->tag = MCA_BTL_TAG_SMCUDA;
 653     frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
 654     frag->endpoint = endpoint;
 655     memcpy(frag->segment.seg_addr.pval, &ctrlhdr, sizeof(struct ctrlhdr_st));
 656 
 657     
 658     
 659 
 660 
 661 
 662 
 663     OPAL_THREAD_ADD_FETCH32(&mca_btl_smcuda_component.num_outstanding_frags, +1);
 664 
 665     MCA_BTL_SMCUDA_FIFO_WRITE(endpoint, endpoint->my_smp_rank,
 666                               endpoint->peer_smp_rank, (void *) VIRTUAL2RELATIVE(frag->hdr), false, true, rc);
 667 
 668     
 669     if (ready) {
 670         endpoint->ipcstate = IPC_ACKED;
 671     } else {
 672         endpoint->ipcstate = IPC_INIT;
 673     }
 674 
 675     return;
 676 
 677 }
 678 
 679 
 680 
 681 static void btl_smcuda_control(mca_btl_base_module_t* btl,
 682                                mca_btl_base_tag_t tag,
 683                                mca_btl_base_descriptor_t* des, void* cbdata)
 684 {
 685     int mydevnum, ipcaccess, res;
 686     ctrlhdr_t ctrlhdr;
 687     opal_proc_t *ep_proc;
 688     struct mca_btl_base_endpoint_t *endpoint;
 689     mca_btl_smcuda_t *smcuda_btl = (mca_btl_smcuda_t *)btl;
 690     mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des;
 691     mca_btl_base_segment_t* segments = des->des_segments;
 692 
 693     
 694 
 695     endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank];
 696     ep_proc = endpoint->proc_opal;
 697 
 698     
 699     memcpy(&ctrlhdr, segments->seg_addr.pval, sizeof(struct ctrlhdr_st));
 700 
 701     
 702     switch (ctrlhdr.ctag) {
 703     case IPC_REQ:
 704         
 705 
 706 
 707 
 708 
 709 
 710 
 711         OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
 712         if ((IPC_INIT == endpoint->ipcstate) ||
 713             ((IPC_SENT == endpoint->ipcstate) && (endpoint->my_smp_rank > endpoint->peer_smp_rank))) {
 714             endpoint->ipcstate = IPC_ACKING; 
 715             OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
 716 
 717             
 718             if (!mca_common_cuda_enabled) {
 719                 opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
 720                                     "Sending CUDA IPC NOTREADY: myrank=%d, peerrank=%d",
 721                                     mca_btl_smcuda_component.my_smp_rank,
 722                                     endpoint->peer_smp_rank);
 723                 mca_btl_smcuda_send_cuda_ipc_ack(btl, endpoint, 0);
 724                 return;
 725             }
 726 
 727             
 728 
 729             res = mca_common_cuda_get_device(&mydevnum);
 730             if (0 != res) {
 731                 endpoint->ipcstate = IPC_BAD;
 732                 return;
 733             }
 734 
 735             
 736 
 737 
 738 
 739 
 740 
 741 
 742             if (mydevnum == ctrlhdr.cudev) {
 743                 if (mca_btl_smcuda_component.use_cuda_ipc_same_gpu) {
 744                     ipcaccess = 1;
 745                 } else {
 746                     opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
 747                                         "Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
 748                                         "peerdev=%d --> Access is disabled by btl_smcuda_use_cuda_ipc_same_gpu",
 749                                         endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
 750                                         ctrlhdr.cudev);
 751                     endpoint->ipcstate = IPC_BAD;
 752                     return;
 753                 }
 754             } else {
 755                 res = mca_common_cuda_device_can_access_peer(&ipcaccess, mydevnum, ctrlhdr.cudev);
 756                 if (0 != res) {
 757                     opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
 758                                         "Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
 759                                         "peerdev=%d --> Access is disabled because peer check failed with err=%d",
 760                                         endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
 761                                         ctrlhdr.cudev, res);
 762                     endpoint->ipcstate = IPC_BAD;
 763                     return;
 764                 }
 765             }
 766 
 767             assert(endpoint->peer_smp_rank == frag->hdr->my_smp_rank);
 768             opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
 769                                 "Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
 770                                 "peerdev=%d --> ACCESS=%d",
 771                                 endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
 772                                 ctrlhdr.cudev, ipcaccess);
 773 
 774             if (0 == ipcaccess) {
 775                 
 776                 opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
 777                                     "Not sending CUDA IPC ACK, no P2P support");
 778                 endpoint->ipcstate = IPC_BAD;
 779             } else {
 780                 
 781                 smcuda_btl->error_cb(&smcuda_btl->super, MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC,
 782                                      ep_proc, (char *)&mca_btl_smcuda_component.cuda_ipc_output);
 783                 opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
 784                                     "Sending CUDA IPC ACK:  myrank=%d, mydev=%d, peerrank=%d, peerdev=%d",
 785                                     endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
 786                                     ctrlhdr.cudev);
 787                 mca_btl_smcuda_send_cuda_ipc_ack(btl, endpoint, 1);
 788             }
 789         } else {
 790             OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
 791             opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
 792                                 "Not sending CUDA IPC ACK because request already initiated");
 793         }
 794         break;
 795 
 796     case IPC_ACK:
 797         opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
 798                             "Received CUDA IPC ACK, notifying PML: myrank=%d, peerrank=%d",
 799                             endpoint->my_smp_rank, endpoint->peer_smp_rank);
 800 
 801         smcuda_btl->error_cb(&smcuda_btl->super, MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC,
 802                              ep_proc, (char *)&mca_btl_smcuda_component.cuda_ipc_output);
 803         assert(endpoint->ipcstate == IPC_SENT);
 804         endpoint->ipcstate = IPC_ACKED;
 805         break;
 806 
 807     case IPC_NOTREADY:
 808         
 809 
 810         opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
 811                             "Received CUDA IPC NOTREADY, reset state to allow another attempt: "
 812                             "myrank=%d, peerrank=%d",
 813                             endpoint->my_smp_rank, endpoint->peer_smp_rank);
 814         OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
 815         if (IPC_SENT == endpoint->ipcstate) {
 816             endpoint->ipcstate = IPC_INIT;
 817         }
 818         OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
 819         break;
 820 
 821     default:
 822         opal_output(0, "Received UNKNOWN CUDA IPC control message. This should not happen.");
 823     }
 824 }
 825 
 826 #endif 
 827 
 828 
 829 
 830 
 831 static mca_btl_base_module_t **
 832 mca_btl_smcuda_component_init(int *num_btls,
 833                           bool enable_progress_threads,
 834                           bool enable_mpi_threads)
 835 {
 836     int num_local_procs = 0;
 837     mca_btl_base_module_t **btls = NULL;
 838     uint32_t my_local_rank = UINT32_MAX;
 839 
 840     *num_btls = 0;
 841     
 842     mca_btl_smcuda_component.sm_mpool = NULL;
 843     mca_btl_smcuda_component.sm_mpool_base = NULL;
 844 
 845 #if OPAL_CUDA_SUPPORT
 846     mca_common_cuda_stage_one_init();
 847 #endif 
 848 
 849     
 850     if (NULL == opal_process_info.job_session_dir) {
 851     
 852 
 853 
 854         return NULL;
 855     }
 856     
 857 
 858 
 859 
 860 
 861 
 862     if (UINT32_MAX ==
 863         (my_local_rank = opal_process_info.my_local_rank)) {
 864         opal_show_help("help-mpi-btl-smcuda.txt", "no locality", true);
 865         return NULL;
 866     }
 867     
 868     if ((num_local_procs = get_num_local_procs()) < 2) {
 869         return NULL;
 870     }
 871     
 872 
 873     calc_sm_max_procs(num_local_procs);
 874 
 875     
 876 
 877 
 878 
 879     if (OPAL_SUCCESS != backing_store_init(&mca_btl_smcuda_component,
 880                                            my_local_rank)) {
 881         return NULL;
 882     }
 883 
 884 #if OPAL_ENABLE_PROGRESS_THREADS == 1
 885     
 886     sprintf( mca_btl_smcuda_component.sm_fifo_path,
 887              "%s"OPAL_PATH_SEP"sm_fifo.%lu", opal_process_info.job_session_dir,
 888              (unsigned long)OPAL_PROC_MY_NAME->vpid );
 889     if(mkfifo(mca_btl_smcuda_component.sm_fifo_path, 0660) < 0) {
 890         opal_output(0, "mca_btl_smcuda_component_init: mkfifo failed with errno=%d\n",errno);
 891         return NULL;
 892     }
 893     mca_btl_smcuda_component.sm_fifo_fd = open(mca_btl_smcuda_component.sm_fifo_path,
 894                                            O_RDWR);
 895     if(mca_btl_smcuda_component.sm_fifo_fd < 0) {
 896         opal_output(0, "mca_btl_smcuda_component_init: "
 897                    "open(%s) failed with errno=%d\n",
 898                     mca_btl_smcuda_component.sm_fifo_path, errno);
 899         return NULL;
 900     }
 901 
 902     OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_fifo_thread, opal_thread_t);
 903     mca_btl_smcuda_component.sm_fifo_thread.t_run =
 904         (opal_thread_fn_t)mca_btl_smcuda_component_event_thread;
 905     opal_thread_start(&mca_btl_smcuda_component.sm_fifo_thread);
 906 #endif
 907 
 908     mca_btl_smcuda_component.sm_btls =
 909         (mca_btl_smcuda_t **)malloc(mca_btl_smcuda_component.sm_max_btls *
 910                                 sizeof(mca_btl_smcuda_t *));
 911     if (NULL == mca_btl_smcuda_component.sm_btls) {
 912         return NULL;
 913     }
 914 
 915     
 916     *num_btls = 1;
 917     btls = (mca_btl_base_module_t**)malloc(sizeof(mca_btl_base_module_t*));
 918     if (NULL == btls) {
 919         return NULL;
 920     }
 921 
 922     
 923     btls[0] = (mca_btl_base_module_t*)(&(mca_btl_smcuda));
 924     mca_btl_smcuda_component.sm_btls[0] = (mca_btl_smcuda_t*)(&(mca_btl_smcuda));
 925 
 926     
 927     
 928     mca_btl_smcuda_component.num_smp_procs = 0;
 929     mca_btl_smcuda_component.my_smp_rank   = -1;  
 930     mca_btl_smcuda_component.sm_num_btls   = 1;
 931     
 932     mca_btl_smcuda.btl_inited = false;
 933 
 934 #if OPAL_CUDA_SUPPORT
 935     
 936     mca_btl_smcuda.super.btl_get = mca_btl_smcuda_get_cuda;
 937     
 938     mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control;
 939     mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL;
 940 #endif 
 941 
 942     return btls;
 943 
 944 }
 945 
 946 
 947 
 948 
 949 
 950 /*
      * Thread body of the component's FIFO event thread; compiled only when
      * OPAL_ENABLE_PROGRESS_THREADS == 1.  It blocks in read() on
      * mca_btl_smcuda_component.sm_fifo_fd waiting for one-byte commands and
      * calls mca_btl_smcuda_component_progress() once for every byte read
      * that is not DONE.
      *
      * thread: not referenced by this function.
      *
      * Returns when read() does not deliver exactly one byte, or when the
      * DONE command is read.
      *
      * NOTE(review): the writers of the command bytes are not visible in this
      * part of the file -- presumably peers signalling new traffic; confirm.
      */
 951 #if OPAL_ENABLE_PROGRESS_THREADS == 1
 952 void mca_btl_smcuda_component_event_thread(opal_object_t* thread)
 953 {
 954     while(1) {
 955         unsigned char cmd;
 956         if(read(mca_btl_smcuda_component.sm_fifo_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) {
 957             /* read() failed or delivered no byte: leave the thread */
 958             return;
 959         }
 960         if( DONE == cmd ){
 961             /* DONE command received: leave the thread */
 962             return;
 963         }
 964         mca_btl_smcuda_component_progress();
 965     }
 966 }
 967 #endif
 968 /*
      * Push out the sends that were queued on an endpoint.
      *
      * ep: endpoint whose pending_sends list is drained; its endpoint_lock,
      *     my_smp_rank and peer_smp_rank fields are used.
      *
      * Each iteration removes the first queued item while holding
      * ep->endpoint_lock, decrements the component-wide num_pending_sends
      * counter, passes the item's data to MCA_BTL_SMCUDA_FIFO_WRITE and then
      * returns the list item to pending_send_fl.  The loop ends when the list
      * is empty, when the removal yields NULL, or as soon as the FIFO write
      * leaves rc different from OPAL_SUCCESS.
      *
      * NOTE(review): the trailing "true, false" macro arguments are presumably
      * the resend / retry-pending-sends flags, and a failed write presumably
      * re-queues the data inside the macro -- confirm in btl_smcuda_fifo.h.
      */
 969 void btl_smcuda_process_pending_sends(struct mca_btl_base_endpoint_t *ep)
 970 {
 971     btl_smcuda_pending_send_item_t *si;
 972     int rc;
 973 
 974     while ( 0 < opal_list_get_size(&ep->pending_sends) ) {
 975         /* The size test above runs without the lock, so it is only a hint; */
 976         /* the NULL check after the removal covers a list emptied meanwhile. */
 977 
 978 
 979         OPAL_THREAD_LOCK(&ep->endpoint_lock);
 980         si = (btl_smcuda_pending_send_item_t*)opal_list_remove_first(&ep->pending_sends);
 981         OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
 982 
 983         if(NULL == si) return; /* nothing left to remove once the lock was held */
 984 
 985         OPAL_THREAD_ADD_FETCH32(&mca_btl_smcuda_component.num_pending_sends, -1);
 986 
 987         MCA_BTL_SMCUDA_FIFO_WRITE(ep, ep->my_smp_rank, ep->peer_smp_rank, si->data,
 988                           true, false, rc);
 989 
 990         opal_free_list_return (&mca_btl_smcuda_component.pending_send_fl, (opal_free_list_item_t*)si);
 991 
 992         if ( OPAL_SUCCESS != rc )
 993             return;
 994     }
 995 }
 996 /*
      * Component progress function.
      *
      * Work is done in up to three phases:
      *   1. If num_pending_sends is positive, every peer other than this
      *      process that has a non-empty pending_sends list is handed to
      *      btl_smcuda_process_pending_sends().
      *   2. Each of the FIFO_MAP_NUM(num_smp_procs) FIFOs in row my_smp_rank of
      *      the component's fifo table is read.  The value read is dispatched
      *      on the bits selected by MCA_BTL_SMCUDA_FRAG_TYPE_MASK:
      *        - SEND: deliver the payload to the callback registered for
      *          hdr->tag, then write hdr->frag back to the sender's FIFO.
      *        - ACK: run the fragment's completion callback if requested,
      *          return the fragment if the BTL owns it, decrement
      *          num_outstanding_frags, retry queued sends, and read the same
      *          FIFO again.
      *        - anything else: log a message and write hdr->frag back to the
      *          sender with the status bits set.
      *   3. With OPAL_CUDA_SUPPORT, completed CUDA IPC events are drained via
      *      progress_one_cuda_ipc_event().
      *
      * Returns the number of events handled: one per FIFO entry read (of any
      * type) plus one per CUDA IPC event completed.
      */
 997 int mca_btl_smcuda_component_progress(void)
 998 {
 999     /* local variables */
1000     mca_btl_base_segment_t seg;
1001     mca_btl_smcuda_frag_t *frag;
1002     mca_btl_smcuda_frag_t Frag; /* stack descriptor passed to the receive callback */
1003     sm_fifo_t *fifo = NULL;
1004     mca_btl_smcuda_hdr_t *hdr;
1005     int my_smp_rank = mca_btl_smcuda_component.my_smp_rank;
1006     int peer_smp_rank, j, rc = 0, nevents = 0;
1007 
1008     /* Phase 1: retry queued sends. */
1009     /* A single counter is tested first so the common case stays cheap. */
1010     if ( 0 < mca_btl_smcuda_component.num_pending_sends ) {
1011 
1012         /* Visit every peer except ourselves and flush those that still */
1013         /* have entries on their pending_sends list. */
1014         for ( peer_smp_rank = 0; peer_smp_rank < mca_btl_smcuda_component.num_smp_procs; peer_smp_rank++) {
1015             struct mca_btl_base_endpoint_t* endpoint;
1016             if ( peer_smp_rank == my_smp_rank )
1017                 continue;
1018             endpoint = mca_btl_smcuda_component.sm_peers[peer_smp_rank];
1019             if ( 0 < opal_list_get_size(&endpoint->pending_sends) )
1020                 btl_smcuda_process_pending_sends(endpoint);
1021         }
1022     }
1023 
1024     /* Phase 2: poll each FIFO in row my_smp_rank of the fifo table. */
1025     for(j = 0; j < FIFO_MAP_NUM(mca_btl_smcuda_component.num_smp_procs); j++) {
1026         fifo = &(mca_btl_smcuda_component.fifo[my_smp_rank][j]);
1027       recheck_peer:
1028         /* hold the FIFO's tail lock around the read when opal_using_threads() is true */
1029         if(opal_using_threads()) {
1030             opal_atomic_lock(&(fifo->tail_lock));
1031         }
1032 
1033         hdr = (mca_btl_smcuda_hdr_t *)sm_fifo_read(fifo);
1034 
1035         /* release the tail lock taken above */
1036         if(opal_using_threads()) {
1037             opal_atomic_unlock(&(fifo->tail_lock));
1038         }
1039 
1040         if(SM_FIFO_FREE == hdr) {
1041             continue;
1042         }
1043 
1044         nevents++;
1045         /* dispatch on the fragment-type bits carried in the value read */
1046         switch(((uintptr_t)hdr) & MCA_BTL_SMCUDA_FRAG_TYPE_MASK) {
1047             case MCA_BTL_SMCUDA_FRAG_SEND:
1048             {
1049                 mca_btl_active_message_callback_t* reg;
1050                 /* incoming send: translate the FIFO value with RELATIVE2VIRTUAL before dereferencing */
1051 
1052                 hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
1053                 peer_smp_rank = hdr->my_smp_rank;
1054 #if OPAL_ENABLE_DEBUG
1055                 if ( FIFO_MAP(peer_smp_rank) != j ) {
1056                     opal_output(0, "mca_btl_smcuda_component_progress: "
1057                                 "rank %d got %d on FIFO %d, but this sender should send to FIFO %d\n",
1058                                 my_smp_rank, peer_smp_rank, j, FIFO_MAP(peer_smp_rank));
1059                 }
1060 #endif
1061                 /* look up the callback for this tag; the payload starts right after the header */
1062                 reg = mca_btl_base_active_message_trigger + hdr->tag;
1063                 seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_smcuda_hdr_t);
1064                 seg.seg_len = hdr->len;
1065                 Frag.base.des_segment_count = 1;
1066                 Frag.base.des_segments = &seg;
1067 #if OPAL_CUDA_SUPPORT
1068                 Frag.hdr = hdr;  /* make the header reachable through the descriptor */
1069 #endif 
1070                 reg->cbfunc(&mca_btl_smcuda.super, hdr->tag, &(Frag.base),
1071                             reg->cbdata);
1072                 /* write hdr->frag to the sender's FIFO (presumably consumed there as the ACK -- confirm) */
1073                 MCA_BTL_SMCUDA_FIFO_WRITE(
1074                         mca_btl_smcuda_component.sm_peers[peer_smp_rank],
1075                         my_smp_rank, peer_smp_rank, hdr->frag, false, true, rc);
1076                 break; /* continue with the next FIFO */
1077             }
1078         case MCA_BTL_SMCUDA_FRAG_ACK:
1079             {
1080                 int status = (uintptr_t)hdr & MCA_BTL_SMCUDA_FRAG_STATUS_MASK;
1081                 int btl_ownership;
1082                 struct mca_btl_base_endpoint_t* endpoint;
1083 
1084                 frag = (mca_btl_smcuda_frag_t *)((char*)((uintptr_t)hdr &
1085                                                      (~(MCA_BTL_SMCUDA_FRAG_TYPE_MASK |
1086                                                         MCA_BTL_SMCUDA_FRAG_STATUS_MASK))));
1087 
1088                 endpoint = frag->endpoint;
1089                 btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
1090                 if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
1091                     /* completion callback; nonzero status bits are reported as OPAL_ERROR */
1092                     frag->base.des_cbfunc(&mca_btl_smcuda.super, frag->endpoint,
1093                                           &frag->base, status?OPAL_ERROR:OPAL_SUCCESS);
1094                 }
1095                 if( btl_ownership ) {
1096                     MCA_BTL_SMCUDA_FRAG_RETURN(frag);
1097                 }
1098                 OPAL_THREAD_ADD_FETCH32(&mca_btl_smcuda_component.num_outstanding_frags, -1);
1099                 if ( 0 < opal_list_get_size(&endpoint->pending_sends) ) {
1100                     btl_smcuda_process_pending_sends(endpoint);
1101                 }
1102                 goto recheck_peer; /* read the same FIFO again before advancing j */
1103             }
1104             default:
1105                 /* Unrecognized type bits: log the event, then treat the value */
1106                 /* like a send header and write hdr->frag back to the sender */
1107                 /* with the MCA_BTL_SMCUDA_FRAG_STATUS_MASK bits set. */
1108                 /* NOTE(review): presumably the sender's ACK case then maps */
1109                 /* the nonzero status to OPAL_ERROR -- confirm. */
1110 
1111 
1112 
1113 
1114 
1115 
1116                 opal_output(0, "mca_btl_smcuda_component_progress read an unknown type of header");
1117                 hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
1118                 peer_smp_rank = hdr->my_smp_rank;
1119                 hdr = (mca_btl_smcuda_hdr_t*)((uintptr_t)hdr->frag |
1120                         MCA_BTL_SMCUDA_FRAG_STATUS_MASK);
1121                 MCA_BTL_SMCUDA_FIFO_WRITE(
1122                         mca_btl_smcuda_component.sm_peers[peer_smp_rank],
1123                         my_smp_rank, peer_smp_rank, hdr, false, true, rc);
1124                 break;
1125         }
1126     }
1127     (void)rc; /* rc is set by MCA_BTL_SMCUDA_FIFO_WRITE and is not checked here */
1128 
1129 #if OPAL_CUDA_SUPPORT
1130     /* Phase 3: drain CUDA IPC events.  For as long as */
1131     /* progress_one_cuda_ipc_event() returns 1, invoke the returned */
1132     /* fragment's RDMA completion callback with OPAL_SUCCESS. */
1133     while (1 == progress_one_cuda_ipc_event((mca_btl_base_descriptor_t **)&frag)) {
1134         mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t) frag->base.des_cbfunc;
1135 
1136         cbfunc (&mca_btl_smcuda.super, frag->endpoint, frag->segment.seg_addr.pval,
1137                 frag->local_handle, frag->base.des_context, frag->base.des_cbdata,
1138                 OPAL_SUCCESS);
1139 
1140         if(frag->registration != NULL) { /* deregister, then return the fragment */
1141             frag->endpoint->rcache->rcache_deregister (frag->endpoint->rcache,
1142                                                        (mca_rcache_base_registration_t*)frag->registration);
1143             frag->registration = NULL;
1144             MCA_BTL_SMCUDA_FRAG_RETURN(frag);
1145         }
1146         nevents++;
1147     }
1148 #endif 
1149     return nevents;
1150 }