root/opal/mca/btl/smcuda/btl_smcuda_component.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions:
  1. mca_btl_smcuda_param_register_int
  2. mca_btl_smcuda_param_register_uint
  3. mca_btl_smcuda_component_verify
  4. smcuda_register
  5. mca_btl_smcuda_component_open
  6. mca_btl_smcuda_component_close
  7. get_num_local_procs
  8. calc_sm_max_procs
  9. create_and_attach
  10. get_mpool_res_size
  11. set_uniq_paths_for_init_rndv
  12. create_rndv_file
  13. backing_store_init
  14. mca_btl_smcuda_send_cuda_ipc_ack
  15. btl_smcuda_control
  16. mca_btl_smcuda_component_init
  17. mca_btl_smcuda_component_event_thread
  18. btl_smcuda_process_pending_sends
  19. mca_btl_smcuda_component_progress

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2011 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2009 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
  14  * Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
  15  * Copyright (c) 2010-2016 Los Alamos National Security, LLC. All rights
  16  *                         reserved.
  17  * Copyright (c) 2011-2015 NVIDIA Corporation.  All rights reserved.
  18  * Copyright (c) 2014      Intel, Inc. All rights reserved.
  19  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
  20  * $COPYRIGHT$
  21  *
  22  * Additional copyrights may follow
  23  *
  24  * $HEADER$
  25  */
  26 #include "opal_config.h"
  27 #include <errno.h>
  28 #ifdef HAVE_UNISTD_H
  29 #include <unistd.h>
  30 #endif  /* HAVE_UNISTD_H */
  31 #include <string.h>
  32 #ifdef HAVE_FCNTL_H
  33 #include <fcntl.h>
  34 #endif  /* HAVE_FCNTL_H */
  35 #ifdef HAVE_SYS_TYPES_H
  36 #include <sys/types.h>
  37 #endif  /* HAVE_SYS_TYPES_H */
  38 #ifdef HAVE_SYS_MMAN_H
  39 #include <sys/mman.h>
  40 #endif  /* HAVE_SYS_MMAN_H */
  41 #ifdef HAVE_SYS_STAT_H
  42 #include <sys/stat.h>  /* for mkfifo */
  43 #endif  /* HAVE_SYS_STAT_H */
  44 
  45 #include "opal/mca/shmem/base/base.h"
  46 #include "opal/mca/shmem/shmem.h"
  47 #include "opal/util/bit_ops.h"
  48 #include "opal/util/output.h"
  49 #include "opal/util/show_help.h"
  50 #include "opal/util/printf.h"
  51 
  52 #include "opal/mca/mpool/base/base.h"
  53 #include "opal/mca/common/sm/common_sm.h"
  54 #include "opal/mca/btl/base/btl_base_error.h"
  55 #include "opal/runtime/opal_params.h"
  56 
  57 #if OPAL_CUDA_SUPPORT
  58 #include "opal/mca/common/cuda/common_cuda.h"
  59 #endif /* OPAL_CUDA_SUPPORT */
  60 #if OPAL_ENABLE_FT_CR    == 1
  61 #include "opal/runtime/opal_cr.h"
  62 #endif
  63 
  64 #include "btl_smcuda.h"
  65 #include "btl_smcuda_frag.h"
  66 #include "btl_smcuda_fifo.h"
  67 
/* Forward declarations of the component lifecycle callbacks wired into
 * mca_btl_smcuda_component below. */
static int mca_btl_smcuda_component_open(void);
static int mca_btl_smcuda_component_close(void);
static int smcuda_register(void);
static mca_btl_base_module_t** mca_btl_smcuda_component_init(
    int *num_btls,
    bool enable_progress_threads,
    bool enable_mpi_threads
);
  76 
/* Selects which rendezvous file create_rndv_file() produces: one
 * describing the common/sm mpool segment, or one describing the BTL's
 * own shared-memory control segment. */
typedef enum {
    MCA_BTL_SM_RNDV_MOD_SM = 0,
    MCA_BTL_SM_RNDV_MOD_MPOOL
} mca_btl_sm_rndv_module_type_t;
  81 
/*
 * Shared Memory (SM) component instance.  Static initialization of the
 * component's MCA registration, open/close and init/progress entry points.
 */
mca_btl_smcuda_component_t mca_btl_smcuda_component = {
    .super = {
        /* First, the mca_base_component_t struct containing meta information
          about the component itself */
        .btl_version = {
            MCA_BTL_DEFAULT_VERSION("smcuda"),
            .mca_open_component = mca_btl_smcuda_component_open,
            .mca_close_component = mca_btl_smcuda_component_close,
            .mca_register_component_params = smcuda_register,
        },
        .btl_data = {
            /* The component is checkpoint ready */
            .param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
        },

        /* module creation and progress entry points driven by the BTL framework */
        .btl_init = mca_btl_smcuda_component_init,
        .btl_progress = mca_btl_smcuda_component_progress,
    }  /* end super */
};
 104 
 105 
 106 /*
 107  * utility routines for parameter registration
 108  */
 109 
 110 static inline int mca_btl_smcuda_param_register_int(
 111     const char* param_name,
 112     int default_value,
 113     int level,
 114     int *storage)
 115 {
 116     *storage = default_value;
 117     (void) mca_base_component_var_register (&mca_btl_smcuda_component.super.btl_version,
 118                                             param_name, NULL, MCA_BASE_VAR_TYPE_INT,
 119                                             NULL, 0, 0, level,
 120                                             MCA_BASE_VAR_SCOPE_READONLY, storage);
 121     return *storage;
 122 }
 123 
 124 static inline unsigned int mca_btl_smcuda_param_register_uint(
 125     const char* param_name,
 126     unsigned int default_value,
 127     int level,
 128     unsigned int *storage)
 129 {
 130     *storage = default_value;
 131     (void) mca_base_component_var_register (&mca_btl_smcuda_component.super.btl_version,
 132                                             param_name, NULL, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
 133                                             NULL, 0, 0, level,
 134                                             MCA_BASE_VAR_SCOPE_READONLY, storage);
 135     return *storage;
 136 }
 137 
 138 static int mca_btl_smcuda_component_verify(void) {
 139 
 140     return mca_btl_base_param_verify(&mca_btl_smcuda.super);
 141 }
 142 
/*
 * Register all smcuda component MCA parameters and set the BTL module
 * defaults (exclusivity, eager/max message sizes, RDMA pipeline limits,
 * flags, bandwidth/latency hints).  Called by the MCA framework during
 * component registration; returns the result of the parameter sanity
 * check.
 */
static int smcuda_register(void)
{
    /* register SM component parameters */
    mca_btl_smcuda_component.mpool_min_size = 134217728;  /* 128 MiB default */
    (void) mca_base_component_var_register(&mca_btl_smcuda_component.super.btl_version, "min_size",
                                           "Minimum size of the common/sm mpool shared memory file",
                                           MCA_BASE_VAR_TYPE_UNSIGNED_LONG, NULL, 0, 0,
                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
                                           &mca_btl_smcuda_component.mpool_min_size);

    /* fragment free-list sizing knobs (-1 == unbounded maximum) */
    mca_btl_smcuda_param_register_int("free_list_num", 8, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_num);
    mca_btl_smcuda_param_register_int("free_list_max", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_max);
    mca_btl_smcuda_param_register_int("free_list_inc", 64, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_inc);
    mca_btl_smcuda_param_register_int("max_procs", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_max_procs);
    /* there is no practical use for the mpool name parameter since mpool resources differ
       between components */
    mca_btl_smcuda_component.sm_mpool_name = "sm";
    mca_btl_smcuda_param_register_uint("fifo_size", 4096, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.fifo_size);
    mca_btl_smcuda_param_register_int("num_fifos", 1, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.nfifos);

    mca_btl_smcuda_param_register_uint("fifo_lazy_free", 120, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.fifo_lazy_free);

    /* default number of extra procs to allow for future growth */
    mca_btl_smcuda_param_register_int("sm_extra_procs", 0, OPAL_INFO_LVL_9, &mca_btl_smcuda_component.sm_extra_procs);

    mca_btl_smcuda_component.allocator = "bucket";
    (void) mca_base_component_var_register (&mca_btl_smcuda_component.super.btl_version, "allocator",
                                            "Name of allocator component to use for btl/smcuda allocations",
                                            MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_smcuda_component.allocator);

#if OPAL_CUDA_SUPPORT
    /* Lower priority when CUDA support is not requested */
    if (opal_cuda_support) {
        mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH+1;
    } else {
        mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW;
    }
    mca_btl_smcuda_param_register_int("use_cuda_ipc", 1, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.use_cuda_ipc);
    mca_btl_smcuda_param_register_int("use_cuda_ipc_same_gpu", 1, OPAL_INFO_LVL_4,&mca_btl_smcuda_component.use_cuda_ipc_same_gpu);
    mca_btl_smcuda_param_register_int("cuda_ipc_verbose", 0, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ipc_verbose);
    mca_btl_smcuda_component.cuda_ipc_output = opal_output_open(NULL);
    opal_output_set_verbosity(mca_btl_smcuda_component.cuda_ipc_output, mca_btl_smcuda_component.cuda_ipc_verbose);
#else /* OPAL_CUDA_SUPPORT */
    mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW;
#endif /* OPAL_CUDA_SUPPORT */
    /* default BTL module limits and characteristics */
    mca_btl_smcuda.super.btl_eager_limit = 4*1024;
    mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024;
    mca_btl_smcuda.super.btl_max_send_size = 32*1024;
    mca_btl_smcuda.super.btl_rdma_pipeline_send_length = 64*1024;
    mca_btl_smcuda.super.btl_rdma_pipeline_frag_size = 64*1024;
    mca_btl_smcuda.super.btl_min_rdma_pipeline_size = 64*1024;
    mca_btl_smcuda.super.btl_flags = MCA_BTL_FLAGS_SEND;
    mca_btl_smcuda.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
    mca_btl_smcuda.super.btl_bandwidth = 9000;  /* Mbs */
    mca_btl_smcuda.super.btl_latency   = 1;     /* Microsecs */

    /* Call the BTL base to register its MCA params */
    mca_btl_base_param_register(&mca_btl_smcuda_component.super.btl_version,
                                &mca_btl_smcuda.super);
#if OPAL_CUDA_SUPPORT
    /* If user has not set the value, then set to the default */
    if (0 == mca_btl_smcuda.super.btl_cuda_max_send_size) {
        mca_btl_smcuda.super.btl_cuda_max_send_size = 128*1024;
    }
    /* If user has not set the value, then set to magic number which will be converted to the minimum
     * size needed to fit the PML header (see pml_ob1.c) */
    if (0 == mca_btl_smcuda.super.btl_cuda_eager_limit) {
        mca_btl_smcuda.super.btl_cuda_eager_limit = SIZE_MAX; /* magic number */
    }
    mca_common_cuda_register_mca_variables();
#endif /* OPAL_CUDA_SUPPORT */
    return mca_btl_smcuda_component_verify();
}
 217 
 218 /*
 219  *  Called by MCA framework to open the component, registers
 220  *  component parameters.
 221  */
 222 
 223 static int mca_btl_smcuda_component_open(void)
 224 {
 225     if (OPAL_SUCCESS != mca_btl_smcuda_component_verify()) {
 226         return OPAL_ERROR;
 227     }
 228 
 229     mca_btl_smcuda_component.sm_max_btls = 1;
 230 
 231     /* make sure the number of fifos is a power of 2 */
 232     mca_btl_smcuda_component.nfifos = opal_next_poweroftwo_inclusive (mca_btl_smcuda_component.nfifos);
 233 
 234     /* make sure that queue size and lazy free parameter are compatible */
 235     if (mca_btl_smcuda_component.fifo_lazy_free >= (mca_btl_smcuda_component.fifo_size >> 1) )
 236         mca_btl_smcuda_component.fifo_lazy_free  = (mca_btl_smcuda_component.fifo_size >> 1);
 237     if (mca_btl_smcuda_component.fifo_lazy_free <= 0)
 238         mca_btl_smcuda_component.fifo_lazy_free  = 1;
 239 
 240     mca_btl_smcuda_component.max_frag_size = mca_btl_smcuda.super.btl_max_send_size;
 241     mca_btl_smcuda_component.eager_limit = mca_btl_smcuda.super.btl_eager_limit;
 242 
 243 #if OPAL_CUDA_SUPPORT
 244     /* Possibly adjust max_frag_size if the cuda size is bigger */
 245     if (mca_btl_smcuda.super.btl_cuda_max_send_size > mca_btl_smcuda.super.btl_max_send_size) {
 246         mca_btl_smcuda_component.max_frag_size = mca_btl_smcuda.super.btl_cuda_max_send_size;
 247     }
 248     opal_output_verbose(10, opal_btl_base_framework.framework_output,
 249                         "btl: smcuda: cuda_max_send_size=%d, max_send_size=%d, max_frag_size=%d",
 250                         (int)mca_btl_smcuda.super.btl_cuda_max_send_size, (int)mca_btl_smcuda.super.btl_max_send_size,
 251                         (int)mca_btl_smcuda_component.max_frag_size);
 252 #endif /* OPAL_CUDA_SUPPORT */
 253 
 254     /* initialize objects */
 255     OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_lock, opal_mutex_t);
 256     OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_eager, opal_free_list_t);
 257     OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_max, opal_free_list_t);
 258     OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_user, opal_free_list_t);
 259     OBJ_CONSTRUCT(&mca_btl_smcuda_component.pending_send_fl, opal_free_list_t);
 260     return OPAL_SUCCESS;
 261 }
 262 
 263 
/*
 * component cleanup - unmaps/unlinks the shared memory control segment,
 * shuts down the progress-thread notification fifo (when enabled), and
 * finalizes the common CUDA support.  Returns OPAL_SUCCESS or OPAL_ERROR
 * if mca_common_sm_fini() failed.
 */

static int mca_btl_smcuda_component_close(void)
{
    int return_value = OPAL_SUCCESS;


    OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_lock);
    /**
     * We don't have to destroy the fragment lists. They are allocated
     * directly into the mmapped file, they will auto-magically disappear
     * when the file get unmapped.
     */
    /*OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_frags_eager);*/
    /*OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_frags_max);*/

    /* unmap the shared memory control structure */
    if(mca_btl_smcuda_component.sm_seg != NULL) {
        return_value = mca_common_sm_fini( mca_btl_smcuda_component.sm_seg );
        if( OPAL_SUCCESS != return_value ) {
            return_value = OPAL_ERROR;
            opal_output(0," mca_common_sm_fini failed\n");
            /* NOTE: jumping to CLEANUP deliberately skips the fifo
             * teardown below */
            goto CLEANUP;
        }

        /* unlink file, so that it will be deleted when all references
         * to it are gone - no error checking, since we want all procs
         * to call this, so that in an abnormal termination scenario,
         * this file will still get cleaned up */
#if OPAL_ENABLE_FT_CR    == 1
        /* Only unlink the file if we are *not* restarting
         * If we are restarting the file will be unlinked at a later time.
         */
        if(OPAL_CR_STATUS_RESTART_PRE  != opal_cr_checkpointing_state &&
           OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) {
            unlink(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
        }
#else
        unlink(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
#endif
        OBJ_RELEASE(mca_btl_smcuda_component.sm_seg);
    }

#if OPAL_ENABLE_PROGRESS_THREADS == 1
    /* close/cleanup fifo create for event notification */
    if(mca_btl_smcuda_component.sm_fifo_fd > 0) {
        /* write a done message down the pipe so the event thread exits */
        unsigned char cmd = DONE;
        if( write(mca_btl_smcuda_component.sm_fifo_fd,&cmd,sizeof(cmd)) !=
                sizeof(cmd)){
            opal_output(0, "mca_btl_smcuda_component_close: write fifo failed: errno=%d\n",
                    errno);
        }
        opal_thread_join(&mca_btl_smcuda_component.sm_fifo_thread, NULL);
        close(mca_btl_smcuda_component.sm_fifo_fd);
        unlink(mca_btl_smcuda_component.sm_fifo_path);
    }
#endif

CLEANUP:

#if OPAL_CUDA_SUPPORT
    /* always finalize the common CUDA support, even on the error path */
    mca_common_cuda_fini();
#endif /* OPAL_CUDA_SUPPORT */

    /* return */
    return return_value;
}
 334 
 335 /*
 336  * Returns the number of processes on the node.
 337  */
 338 static inline int
 339 get_num_local_procs(void)
 340 {
 341     /* num_local_peers does not include us in
 342      * its calculation, so adjust for that */
 343     return (int)(1 + opal_process_info.num_local_peers);
 344 }
 345 
 346 static void
 347 calc_sm_max_procs(int n)
 348 {
 349     /* see if need to allocate space for extra procs */
 350     if (0 > mca_btl_smcuda_component.sm_max_procs) {
 351         /* no limit */
 352         if (0 <= mca_btl_smcuda_component.sm_extra_procs) {
 353             /* limit */
 354             mca_btl_smcuda_component.sm_max_procs =
 355                 n + mca_btl_smcuda_component.sm_extra_procs;
 356         } else {
 357             /* no limit */
 358             mca_btl_smcuda_component.sm_max_procs = 2 * n;
 359         }
 360     }
 361 }
 362 
 363 static int
 364 create_and_attach(mca_btl_smcuda_component_t *comp_ptr,
 365                   size_t size,
 366                   char *file_name,
 367                   size_t size_ctl_structure,
 368                   size_t data_seg_alignment,
 369                   mca_common_sm_module_t **out_modp)
 370 
 371 {
 372     if (NULL == (*out_modp =
 373         mca_common_sm_module_create_and_attach(size, file_name,
 374                                                size_ctl_structure,
 375                                                data_seg_alignment))) {
 376         opal_output(0, "create_and_attach: unable to create shared memory "
 377                     "BTL coordinating strucure :: size %lu \n",
 378                     (unsigned long)size);
 379         return OPAL_ERROR;
 380     }
 381     return OPAL_SUCCESS;
 382 }
 383 
/*
 * Compute (into *out_res_size) the size of the shared-memory mpool
 * backing store needed for max_procs local processes.  Returns
 * OPAL_ERR_VALUE_OUT_OF_BOUNDS if the per-proc size times max_procs
 * would overflow, otherwise OPAL_SUCCESS.
 */
static int
get_mpool_res_size(int32_t max_procs,
                   size_t *out_res_size)
{
    size_t size = 0;

    *out_res_size = 0;
    /* determine how much memory to create */
    /*
     * This heuristic formula mostly says that we request memory for:
     * - nfifos FIFOs, each comprising:
     *   . a sm_fifo_t structure
     *   . many pointers (fifo_size of them per FIFO)
     * - eager fragments (2*n of them, allocated in sm_free_list_inc chunks)
     * - max fragments (sm_free_list_num of them)
     *
     * On top of all that, we sprinkle in some number of
     * "opal_cache_line_size" additions to account for some
     * padding and edge effects that may lie in the allocator.
     */
    size = FIFO_MAP_NUM(max_procs) *
           (sizeof(sm_fifo_t) + sizeof(void *) *
            mca_btl_smcuda_component.fifo_size + 4 * opal_cache_line_size) +
           (2 * max_procs + mca_btl_smcuda_component.sm_free_list_inc) *
           (mca_btl_smcuda_component.eager_limit + 2 * opal_cache_line_size) +
           mca_btl_smcuda_component.sm_free_list_num *
           (mca_btl_smcuda_component.max_frag_size + 2 * opal_cache_line_size);

    /* add something for the control structure */
    size += sizeof(mca_common_sm_module_t);

    /* before we multiply by max_procs, make sure the result won't overflow */
    /* The comparison is done in double so the check itself cannot
     * overflow.  Stick that little pad in, particularly since we'll
     * eventually need a little extra space.  E.g., in mca_mpool_sm_init()
     * in mpool_sm_component.c when sizeof(mca_common_sm_module_t) is
     * added.
     */
    if (((double)size) * max_procs > LONG_MAX - 4096) {
        return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
    }
    size *= (size_t)max_procs;
    *out_res_size = size;
    return OPAL_SUCCESS;
}
 428 
 429 
 430 /* Generates all the unique paths for the shared-memory segments that this BTL
 431  * needs along with other file paths used to share "connection information". */
 432 static int
 433 set_uniq_paths_for_init_rndv(mca_btl_smcuda_component_t *comp_ptr)
 434 {
 435     int rc = OPAL_ERR_OUT_OF_RESOURCE;
 436 
 437     /* NOTE: don't forget to free these after init */
 438     comp_ptr->sm_mpool_ctl_file_name = NULL;
 439     comp_ptr->sm_mpool_rndv_file_name = NULL;
 440     comp_ptr->sm_ctl_file_name = NULL;
 441     comp_ptr->sm_rndv_file_name = NULL;
 442 
 443     if (opal_asprintf(&comp_ptr->sm_mpool_ctl_file_name,
 444                  "%s"OPAL_PATH_SEP"shared_mem_cuda_pool.%s",
 445                  opal_process_info.job_session_dir,
 446                  opal_process_info.nodename) < 0) {
 447         /* rc set */
 448         goto out;
 449     }
 450     if (opal_asprintf(&comp_ptr->sm_mpool_rndv_file_name,
 451                  "%s"OPAL_PATH_SEP"shared_mem_cuda_pool_rndv.%s",
 452                  opal_process_info.job_session_dir,
 453                  opal_process_info.nodename) < 0) {
 454         /* rc set */
 455         goto out;
 456     }
 457     if (opal_asprintf(&comp_ptr->sm_ctl_file_name,
 458                  "%s"OPAL_PATH_SEP"shared_mem_cuda_btl_module.%s",
 459                  opal_process_info.job_session_dir,
 460                  opal_process_info.nodename) < 0) {
 461         /* rc set */
 462         goto out;
 463     }
 464     if (opal_asprintf(&comp_ptr->sm_rndv_file_name,
 465                  "%s"OPAL_PATH_SEP"shared_mem_cuda_btl_rndv.%s",
 466                  opal_process_info.job_session_dir,
 467                  opal_process_info.nodename) < 0) {
 468         /* rc set */
 469         goto out;
 470     }
 471     /* all is well */
 472     rc = OPAL_SUCCESS;
 473 
 474 out:
 475     if (OPAL_SUCCESS != rc) {
 476         if (comp_ptr->sm_mpool_ctl_file_name) {
 477             free(comp_ptr->sm_mpool_ctl_file_name);
 478         }
 479         if (comp_ptr->sm_mpool_rndv_file_name) {
 480             free(comp_ptr->sm_mpool_rndv_file_name);
 481         }
 482         if (comp_ptr->sm_ctl_file_name) {
 483             free(comp_ptr->sm_ctl_file_name);
 484         }
 485         if (comp_ptr->sm_rndv_file_name) {
 486             free(comp_ptr->sm_rndv_file_name);
 487         }
 488     }
 489     return rc;
 490 }
 491 
 492 static int
 493 create_rndv_file(mca_btl_smcuda_component_t *comp_ptr,
 494                   mca_btl_sm_rndv_module_type_t type)
 495 {
 496     size_t size = 0;
 497     int rc = OPAL_SUCCESS;
 498     int fd = -1;
 499     char *fname = NULL;
 500     /* used as a temporary store so we can extract shmem_ds info */
 501     mca_common_sm_module_t *tmp_modp = NULL;
 502 
 503     if (MCA_BTL_SM_RNDV_MOD_MPOOL == type) {
 504         /* get the segment size for the sm mpool. */
 505         if (OPAL_SUCCESS != (rc = get_mpool_res_size(comp_ptr->sm_max_procs,
 506                                                      &size))) {
 507             /* rc is already set */
 508             goto out;
 509         }
 510 
 511         /* update size if less than required minimum */
 512         if (size < mca_btl_smcuda_component.mpool_min_size) {
 513             size = mca_btl_smcuda_component.mpool_min_size;
 514         }
 515 
 516         /* we only need the shmem_ds info at this point. initilization will be
 517          * completed in the mpool module code. the idea is that we just need this
 518          * info so we can populate the rndv file (or modex when we have it). */
 519         if (OPAL_SUCCESS != (rc =
 520             create_and_attach(comp_ptr, size, comp_ptr->sm_mpool_ctl_file_name,
 521                               sizeof(mca_common_sm_module_t), 8, &tmp_modp))) {
 522             /* rc is set */
 523             goto out;
 524         }
 525         fname = comp_ptr->sm_mpool_rndv_file_name;
 526     }
 527     else if (MCA_BTL_SM_RNDV_MOD_SM == type) {
 528         /* calculate the segment size. */
 529         size = sizeof(mca_common_sm_seg_header_t) +
 530                comp_ptr->sm_max_procs *
 531                (sizeof(sm_fifo_t *) +
 532                 sizeof(char *) + sizeof(uint16_t)) +
 533                opal_cache_line_size;
 534 
 535         if (OPAL_SUCCESS != (rc =
 536             create_and_attach(comp_ptr, size, comp_ptr->sm_ctl_file_name,
 537                               sizeof(mca_common_sm_seg_header_t),
 538                               opal_cache_line_size, &comp_ptr->sm_seg))) {
 539             /* rc is set */
 540             goto out;
 541         }
 542         fname = comp_ptr->sm_rndv_file_name;
 543         tmp_modp = comp_ptr->sm_seg;
 544     }
 545     else {
 546         return OPAL_ERR_BAD_PARAM;
 547     }
 548 
 549     /* at this point, we have all the info we need to populate the rendezvous
 550      * file containing all the meta info required for attach. */
 551 
 552     /* now just write the contents of tmp_modp->shmem_ds to the full
 553      * sizeof(opal_shmem_ds_t), so we know where the mpool_res_size starts. */
 554     if (-1 == (fd = open(fname, O_CREAT | O_RDWR, 0600))) {
 555         int err = errno;
 556         opal_show_help("help-mpi-btl-smcuda.txt", "sys call fail", true,
 557                        "open(2)", strerror(err), err);
 558         rc = OPAL_ERR_IN_ERRNO;
 559         goto out;
 560     }
 561     if ((ssize_t)sizeof(opal_shmem_ds_t) != write(fd, &(tmp_modp->shmem_ds),
 562                                                   sizeof(opal_shmem_ds_t))) {
 563         int err = errno;
 564         opal_show_help("help-mpi-btl-smcuda.txt", "sys call fail", true,
 565                        "write(2)", strerror(err), err);
 566         rc = OPAL_ERR_IN_ERRNO;
 567         goto out;
 568     }
 569     if (MCA_BTL_SM_RNDV_MOD_MPOOL == type) {
 570         if ((ssize_t)sizeof(size) != write(fd, &size, sizeof(size))) {
 571             int err = errno;
 572             opal_show_help("help-mpi-btl-smcuda.txt", "sys call fail", true,
 573                            "write(2)", strerror(err), err);
 574             rc = OPAL_ERR_IN_ERRNO;
 575             goto out;
 576         }
 577         /* only do this for the mpool case */
 578         OBJ_RELEASE(tmp_modp);
 579     }
 580 
 581 out:
 582     if (-1 != fd) {
 583         (void)close(fd);
 584     }
 585     return rc;
 586 }
 587 
 588 /*
 589  * Creates information required for the sm modex and modex sends it.
 590  */
 591 static int
 592 backing_store_init(mca_btl_smcuda_component_t *comp_ptr,
 593                    uint32_t local_rank)
 594 {
 595     int rc = OPAL_SUCCESS;
 596 
 597     if (OPAL_SUCCESS != (rc = set_uniq_paths_for_init_rndv(comp_ptr))) {
 598         goto out;
 599     }
 600     /* only let the lowest rank setup the metadata */
 601     if (0 == local_rank) {
 602         /* === sm mpool === */
 603         if (OPAL_SUCCESS != (rc =
 604             create_rndv_file(comp_ptr, MCA_BTL_SM_RNDV_MOD_MPOOL))) {
 605             goto out;
 606         }
 607         /* === sm === */
 608         if (OPAL_SUCCESS != (rc =
 609             create_rndv_file(comp_ptr, MCA_BTL_SM_RNDV_MOD_SM))) {
 610             goto out;
 611         }
 612     }
 613 
 614 out:
 615     return rc;
 616 }
 617 
#if OPAL_CUDA_SUPPORT

/**
 * Send a CUDA IPC ACK or NOTREADY message back to the peer.
 *
 * @param btl (IN)      BTL module
 * @param endpoint (IN) BTL peer addressing
 * @param ready (IN)    If non-zero send ACK, otherwise send NOTREADY
 */
static void mca_btl_smcuda_send_cuda_ipc_ack(struct mca_btl_base_module_t* btl,
                                             struct mca_btl_base_endpoint_t* endpoint, int ready)
{
    mca_btl_smcuda_frag_t* frag;
    ctrlhdr_t ctrlhdr;
    int rc;

    /* if many fragments are outstanding, make progress to drain some first */
    if ( mca_btl_smcuda_component.num_outstanding_frags * 2 > (int) mca_btl_smcuda_component.fifo_size ) {
        mca_btl_smcuda_component_progress();
    }

    /* allocate a fragment, giving up if we can't get one */
    MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag);
    if( OPAL_UNLIKELY(NULL == frag) ) {
        endpoint->ipcstate = IPC_BAD;
        return;
    }

    if (ready) {
        ctrlhdr.ctag = IPC_ACK;
    } else {
        ctrlhdr.ctag = IPC_NOTREADY;
    }

    /* Fill in fragment fields. */
    frag->hdr->tag = MCA_BTL_TAG_SMCUDA;
    frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
    frag->endpoint = endpoint;
    memcpy(frag->segment.seg_addr.pval, &ctrlhdr, sizeof(struct ctrlhdr_st));

    /* write the fragment pointer to the FIFO */
    /*
     * Note that we don't care what the FIFO-write return code is.  Even if
     * the return code indicates failure, the write has still "completed" from
     * our point of view:  it has been posted to a "pending send" queue.
     */
    OPAL_THREAD_ADD_FETCH32(&mca_btl_smcuda_component.num_outstanding_frags, +1);

    /* rc is written by the macro but deliberately ignored (see note above) */
    MCA_BTL_SMCUDA_FIFO_WRITE(endpoint, endpoint->my_smp_rank,
                              endpoint->peer_smp_rank, (void *) VIRTUAL2RELATIVE(frag->hdr), false, true, rc);

    /* Set state now that we have sent message */
    if (ready) {
        endpoint->ipcstate = IPC_ACKED;
    } else {
        endpoint->ipcstate = IPC_INIT;
    }

    return;

}
 678 /* This function is utilized to set up CUDA IPC support within the smcuda
 679  * BTL.  It handles smcuda specific control messages that are triggered
 680  * when GPU memory transfers are initiated. */
/* Active-message callback registered on MCA_BTL_TAG_SMCUDA.  Drives the
 * CUDA IPC handshake between two on-node peers.  Three message types are
 * handled:
 *   IPC_REQ      - peer asks to establish CUDA IPC with us
 *   IPC_ACK      - peer confirmed CUDA IPC works; notify the PML
 *   IPC_NOTREADY - peer was not CUDA-ready; reset so we can retry later
 * Invoked from the progress path; des/frag carry the sender's smp rank. */
static void btl_smcuda_control(mca_btl_base_module_t* btl,
                               mca_btl_base_tag_t tag,
                               mca_btl_base_descriptor_t* des, void* cbdata)
{
    int mydevnum, ipcaccess, res;
    ctrlhdr_t ctrlhdr;
    opal_proc_t *ep_proc;
    struct mca_btl_base_endpoint_t *endpoint;
    mca_btl_smcuda_t *smcuda_btl = (mca_btl_smcuda_t *)btl;
    mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des;
    mca_btl_base_segment_t* segments = des->des_segments;

    /* Use the rank of the peer that sent the data to get to the endpoint
     * structure.  This is needed for PML callback. */
    endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank];
    ep_proc = endpoint->proc_opal;

    /* Copy out control message payload to examine it */
    memcpy(&ctrlhdr, segments->seg_addr.pval, sizeof(struct ctrlhdr_st));

    /* Handle an incoming CUDA IPC control message. */
    switch (ctrlhdr.ctag) {
    case IPC_REQ:
        /* Initial request to set up IPC.  If the state of IPC
         * initialization is IPC_INIT, then check on the peer to peer
         * access and act accordingly.  If we are in the IPC_SENT
         * state, then this means both sides are trying to set up the
         * connection.  If my smp rank is higher than the peer's, check
         * and act accordingly.  Otherwise, drop the request and let the
         * other side continue the handshake. */
        OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
        if ((IPC_INIT == endpoint->ipcstate) ||
            ((IPC_SENT == endpoint->ipcstate) && (endpoint->my_smp_rank > endpoint->peer_smp_rank))) {
            endpoint->ipcstate = IPC_ACKING; /* Move into new state to prevent any new connection attempts */
            OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);

            /* If not yet CUDA ready, send a NOTREADY message back. */
            if (!mca_common_cuda_enabled) {
                opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
                                    "Sending CUDA IPC NOTREADY: myrank=%d, peerrank=%d",
                                    mca_btl_smcuda_component.my_smp_rank,
                                    endpoint->peer_smp_rank);
                mca_btl_smcuda_send_cuda_ipc_ack(btl, endpoint, 0);
                return;
            }

            /* Get my current device.  If this fails, move this endpoint state into
             * bad state.  No need to send a reply.  */
            res = mca_common_cuda_get_device(&mydevnum);
            if (0 != res) {
                /* NOTE(review): ipcstate is written here without endpoint_lock
                 * held; presumably the IPC_ACKING state set above keeps other
                 * threads out of this endpoint -- confirm. */
                endpoint->ipcstate = IPC_BAD;
                return;
            }

            /* Check for IPC support between devices. If they are the
             * same device and use_cuda_ipc_same_gpu is 1 (default),
             * then assume CUDA IPC is possible.  This could be a
             * device running in DEFAULT mode or running under MPS.
             * Otherwise, check peer access to determine CUDA IPC
             * support.  If the CUDA API call fails, then just move
             * endpoint into bad state.  No need to send a reply. */
            if (mydevnum == ctrlhdr.cudev) {
                if (mca_btl_smcuda_component.use_cuda_ipc_same_gpu) {
                    ipcaccess = 1;
                } else {
                    /* Same GPU but same-GPU IPC explicitly disabled via MCA
                     * parameter: mark the endpoint bad and stay silent. */
                    opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
                                        "Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
                                        "peerdev=%d --> Access is disabled by btl_smcuda_use_cuda_ipc_same_gpu",
                                        endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
                                        ctrlhdr.cudev);
                    endpoint->ipcstate = IPC_BAD;
                    return;
                }
            } else {
                /* Different devices: ask the CUDA runtime whether my device
                 * can access the peer's device directly. */
                res = mca_common_cuda_device_can_access_peer(&ipcaccess, mydevnum, ctrlhdr.cudev);
                if (0 != res) {
                    opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
                                        "Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
                                        "peerdev=%d --> Access is disabled because peer check failed with err=%d",
                                        endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
                                        ctrlhdr.cudev, res);
                    endpoint->ipcstate = IPC_BAD;
                    return;
                }
            }

            /* Sanity: the sender rank embedded in the fragment header must be
             * the peer this endpoint represents. */
            assert(endpoint->peer_smp_rank == frag->hdr->my_smp_rank);
            opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
                                "Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
                                "peerdev=%d --> ACCESS=%d",
                                endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
                                ctrlhdr.cudev, ipcaccess);

            if (0 == ipcaccess) {
                /* No CUDA IPC support */
                opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
                                    "Not sending CUDA IPC ACK, no P2P support");
                endpoint->ipcstate = IPC_BAD;
            } else {
                /* CUDA IPC works: tell the upper layer (via the error
                 * callback with the ADD_CUDA_IPC flag) and ACK the peer. */
                smcuda_btl->error_cb(&smcuda_btl->super, MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC,
                                     ep_proc, (char *)&mca_btl_smcuda_component.cuda_ipc_output);
                opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
                                    "Sending CUDA IPC ACK:  myrank=%d, mydev=%d, peerrank=%d, peerdev=%d",
                                    endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
                                    ctrlhdr.cudev);
                mca_btl_smcuda_send_cuda_ipc_ack(btl, endpoint, 1);
            }
        } else {
            /* Handshake already in flight (or settled) -- drop this request. */
            OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
            opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
                                "Not sending CUDA IPC ACK because request already initiated");
        }
        break;

    case IPC_ACK:
        /* Peer confirmed IPC works: notify the PML and mark connection done. */
        opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
                            "Received CUDA IPC ACK, notifying PML: myrank=%d, peerrank=%d",
                            endpoint->my_smp_rank, endpoint->peer_smp_rank);

        smcuda_btl->error_cb(&smcuda_btl->super, MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC,
                             ep_proc, (char *)&mca_btl_smcuda_component.cuda_ipc_output);
        assert(endpoint->ipcstate == IPC_SENT);
        endpoint->ipcstate = IPC_ACKED;
        break;

    case IPC_NOTREADY:
        /* The remote side is not ready.  Reset state to initialized so next
         * send call will try again to set up connection. */
        opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
                            "Received CUDA IPC NOTREADY, reset state to allow another attempt: "
                            "myrank=%d, peerrank=%d",
                            endpoint->my_smp_rank, endpoint->peer_smp_rank);
        OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
        if (IPC_SENT == endpoint->ipcstate) {
            endpoint->ipcstate = IPC_INIT;
        }
        OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
        break;

    default:
        opal_output(0, "Received UNKNOWN CUDA IPC control message. This should not happen.");
    }
}
 825 
 826 #endif /* OPAL_CUDA_SUPPORT */
 827 
 828 /*
 829  *  SM component initialization
 830  */
 831 static mca_btl_base_module_t **
 832 mca_btl_smcuda_component_init(int *num_btls,
 833                           bool enable_progress_threads,
 834                           bool enable_mpi_threads)
 835 {
 836     int num_local_procs = 0;
 837     mca_btl_base_module_t **btls = NULL;
 838     uint32_t my_local_rank = UINT32_MAX;
 839 
 840     *num_btls = 0;
 841     /* lookup/create shared memory pool only when used */
 842     mca_btl_smcuda_component.sm_mpool = NULL;
 843     mca_btl_smcuda_component.sm_mpool_base = NULL;
 844 
 845 #if OPAL_CUDA_SUPPORT
 846     mca_common_cuda_stage_one_init();
 847 #endif /* OPAL_CUDA_SUPPORT */
 848 
 849     /* if no session directory was created, then we cannot be used */
 850     if (NULL == opal_process_info.job_session_dir) {
 851     /* SKG - this isn't true anymore. Some backing facilities don't require a
 852      * file-backed store. Extend shmem to provide this info one day. Especially
 853      * when we use a proper modex for init. */
 854         return NULL;
 855     }
 856     /* if we don't have locality information, then we cannot be used because we
 857      * need to know who the respective node ranks for initialization. note the
 858      * use of my_local_rank here. we use this instead of my_node_rank because in
 859      * the spawn case we need to designate a metadata creator rank within the
 860      * set of processes that are initializing the btl, and my_local_rank seems
 861      * to provide that for us. */
 862     if (UINT32_MAX ==
 863         (my_local_rank = opal_process_info.my_local_rank)) {
 864         opal_show_help("help-mpi-btl-smcuda.txt", "no locality", true);
 865         return NULL;
 866     }
 867     /* no use trying to use sm with less than two procs, so just bail. */
 868     if ((num_local_procs = get_num_local_procs()) < 2) {
 869         return NULL;
 870     }
 871     /* calculate max procs so we can figure out how large to make the
 872      * shared-memory segment. this routine sets component sm_max_procs. */
 873     calc_sm_max_procs(num_local_procs);
 874 
 875     /* This is where the modex will live some day. For now, just have local rank
 876      * 0 create a rendezvous file containing the backing store info, so the
 877      * other local procs can read from it during add_procs. The rest will just
 878      * stash the known paths for use later in init. */
 879     if (OPAL_SUCCESS != backing_store_init(&mca_btl_smcuda_component,
 880                                            my_local_rank)) {
 881         return NULL;
 882     }
 883 
 884 #if OPAL_ENABLE_PROGRESS_THREADS == 1
 885     /* create a named pipe to receive events  */
 886     sprintf( mca_btl_smcuda_component.sm_fifo_path,
 887              "%s"OPAL_PATH_SEP"sm_fifo.%lu", opal_process_info.job_session_dir,
 888              (unsigned long)OPAL_PROC_MY_NAME->vpid );
 889     if(mkfifo(mca_btl_smcuda_component.sm_fifo_path, 0660) < 0) {
 890         opal_output(0, "mca_btl_smcuda_component_init: mkfifo failed with errno=%d\n",errno);
 891         return NULL;
 892     }
 893     mca_btl_smcuda_component.sm_fifo_fd = open(mca_btl_smcuda_component.sm_fifo_path,
 894                                            O_RDWR);
 895     if(mca_btl_smcuda_component.sm_fifo_fd < 0) {
 896         opal_output(0, "mca_btl_smcuda_component_init: "
 897                    "open(%s) failed with errno=%d\n",
 898                     mca_btl_smcuda_component.sm_fifo_path, errno);
 899         return NULL;
 900     }
 901 
 902     OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_fifo_thread, opal_thread_t);
 903     mca_btl_smcuda_component.sm_fifo_thread.t_run =
 904         (opal_thread_fn_t)mca_btl_smcuda_component_event_thread;
 905     opal_thread_start(&mca_btl_smcuda_component.sm_fifo_thread);
 906 #endif
 907 
 908     mca_btl_smcuda_component.sm_btls =
 909         (mca_btl_smcuda_t **)malloc(mca_btl_smcuda_component.sm_max_btls *
 910                                 sizeof(mca_btl_smcuda_t *));
 911     if (NULL == mca_btl_smcuda_component.sm_btls) {
 912         return NULL;
 913     }
 914 
 915     /* allocate the Shared Memory BTL */
 916     *num_btls = 1;
 917     btls = (mca_btl_base_module_t**)malloc(sizeof(mca_btl_base_module_t*));
 918     if (NULL == btls) {
 919         return NULL;
 920     }
 921 
 922     /* get pointer to the btls */
 923     btls[0] = (mca_btl_base_module_t*)(&(mca_btl_smcuda));
 924     mca_btl_smcuda_component.sm_btls[0] = (mca_btl_smcuda_t*)(&(mca_btl_smcuda));
 925 
 926     /* initialize some BTL data */
 927     /* start with no SM procs */
 928     mca_btl_smcuda_component.num_smp_procs = 0;
 929     mca_btl_smcuda_component.my_smp_rank   = -1;  /* not defined */
 930     mca_btl_smcuda_component.sm_num_btls   = 1;
 931     /* set flag indicating btl not inited */
 932     mca_btl_smcuda.btl_inited = false;
 933 
 934 #if OPAL_CUDA_SUPPORT
 935     /* Assume CUDA GET works. */
 936     mca_btl_smcuda.super.btl_get = mca_btl_smcuda_get_cuda;
 937     /* Register a smcuda control function to help setup IPC support */
 938     mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control;
 939     mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL;
 940 #endif /* OPAL_CUDA_SUPPORT */
 941 
 942     return btls;
 943 
 944 }
 945 
 946 
 947 /*
 948  *  SM component progress.
 949  */
 950 
 951 #if OPAL_ENABLE_PROGRESS_THREADS == 1
 952 void mca_btl_smcuda_component_event_thread(opal_object_t* thread)
 953 {
 954     while(1) {
 955         unsigned char cmd;
 956         if(read(mca_btl_smcuda_component.sm_fifo_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) {
 957             /* error condition */
 958             return;
 959         }
 960         if( DONE == cmd ){
 961             /* return when done message received */
 962             return;
 963         }
 964         mca_btl_smcuda_component_progress();
 965     }
 966 }
 967 #endif
 968 
 969 void btl_smcuda_process_pending_sends(struct mca_btl_base_endpoint_t *ep)
 970 {
 971     btl_smcuda_pending_send_item_t *si;
 972     int rc;
 973 
 974     while ( 0 < opal_list_get_size(&ep->pending_sends) ) {
 975         /* Note that we access the size of ep->pending_sends unlocked
 976            as it doesn't really matter if the result is wrong as
 977            opal_list_remove_first is called with a lock and we handle it
 978            not finding an item to process */
 979         OPAL_THREAD_LOCK(&ep->endpoint_lock);
 980         si = (btl_smcuda_pending_send_item_t*)opal_list_remove_first(&ep->pending_sends);
 981         OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
 982 
 983         if(NULL == si) return; /* Another thread got in before us. Thats ok. */
 984 
 985         OPAL_THREAD_ADD_FETCH32(&mca_btl_smcuda_component.num_pending_sends, -1);
 986 
 987         MCA_BTL_SMCUDA_FIFO_WRITE(ep, ep->my_smp_rank, ep->peer_smp_rank, si->data,
 988                           true, false, rc);
 989 
 990         opal_free_list_return (&mca_btl_smcuda_component.pending_send_fl, (opal_free_list_item_t*)si);
 991 
 992         if ( OPAL_SUCCESS != rc )
 993             return;
 994     }
 995 }
 996 
/* Component progress: (1) retry any pending sends, (2) poll this rank's
 * receive FIFOs and dispatch each fragment by the type bits encoded in
 * the low bits of the header pointer, and (3) under CUDA support, reap
 * completed CUDA IPC events and fire their completion callbacks.
 * Returns the number of events processed. */
int mca_btl_smcuda_component_progress(void)
{
    /* local variables */
    mca_btl_base_segment_t seg;
    mca_btl_smcuda_frag_t *frag;
    mca_btl_smcuda_frag_t Frag;     /* stack fragment used for recv upcalls */
    sm_fifo_t *fifo = NULL;
    mca_btl_smcuda_hdr_t *hdr;
    int my_smp_rank = mca_btl_smcuda_component.my_smp_rank;
    int peer_smp_rank, j, rc = 0, nevents = 0;

    /* first, deal with any pending sends */
    /* This check should be fast since we only need to check one variable. */
    if ( 0 < mca_btl_smcuda_component.num_pending_sends ) {

        /* perform a loop to find the endpoints that have pending sends */
        /* This can take a while longer if there are many endpoints to check. */
        for ( peer_smp_rank = 0; peer_smp_rank < mca_btl_smcuda_component.num_smp_procs; peer_smp_rank++) {
            struct mca_btl_base_endpoint_t* endpoint;
            if ( peer_smp_rank == my_smp_rank )
                continue;
            endpoint = mca_btl_smcuda_component.sm_peers[peer_smp_rank];
            if ( 0 < opal_list_get_size(&endpoint->pending_sends) )
                btl_smcuda_process_pending_sends(endpoint);
        }
    }

    /* poll each fifo */
    for(j = 0; j < FIFO_MAP_NUM(mca_btl_smcuda_component.num_smp_procs); j++) {
        fifo = &(mca_btl_smcuda_component.fifo[my_smp_rank][j]);
      recheck_peer:
        /* acquire thread lock */
        if(opal_using_threads()) {
            opal_atomic_lock(&(fifo->tail_lock));
        }

        hdr = (mca_btl_smcuda_hdr_t *)sm_fifo_read(fifo);

        /* release thread lock */
        if(opal_using_threads()) {
            opal_atomic_unlock(&(fifo->tail_lock));
        }

        if(SM_FIFO_FREE == hdr) {
            /* this fifo is empty; move to the next one */
            continue;
        }

        nevents++;
        /* dispatch fragment by type (encoded in the pointer's low bits) */
        switch(((uintptr_t)hdr) & MCA_BTL_SMCUDA_FRAG_TYPE_MASK) {
            case MCA_BTL_SMCUDA_FRAG_SEND:
            {
                mca_btl_active_message_callback_t* reg;
                /* change the address from address relative to the shared
                 * memory address, to a true virtual address */
                hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
                peer_smp_rank = hdr->my_smp_rank;
#if OPAL_ENABLE_DEBUG
                if ( FIFO_MAP(peer_smp_rank) != j ) {
                    opal_output(0, "mca_btl_smcuda_component_progress: "
                                "rank %d got %d on FIFO %d, but this sender should send to FIFO %d\n",
                                my_smp_rank, peer_smp_rank, j, FIFO_MAP(peer_smp_rank));
                }
#endif
                /* recv upcall: build a throwaway descriptor around the
                 * in-place payload and hand it to the registered callback */
                reg = mca_btl_base_active_message_trigger + hdr->tag;
                seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_smcuda_hdr_t);
                seg.seg_len = hdr->len;
                Frag.base.des_segment_count = 1;
                Frag.base.des_segments = &seg;
#if OPAL_CUDA_SUPPORT
                Frag.hdr = hdr;  /* needed for peer rank in control messages */
#endif /* OPAL_CUDA_SUPPORT */
                reg->cbfunc(&mca_btl_smcuda.super, hdr->tag, &(Frag.base),
                            reg->cbdata);
                /* return the fragment to the sender (ACK via its fifo) */
                MCA_BTL_SMCUDA_FIFO_WRITE(
                        mca_btl_smcuda_component.sm_peers[peer_smp_rank],
                        my_smp_rank, peer_smp_rank, hdr->frag, false, true, rc);
                break;
            }
        case MCA_BTL_SMCUDA_FRAG_ACK:
            {
                /* sender-side completion of one of our earlier sends */
                int status = (uintptr_t)hdr & MCA_BTL_SMCUDA_FRAG_STATUS_MASK;
                int btl_ownership;
                struct mca_btl_base_endpoint_t* endpoint;

                /* strip the type/status bits to recover the frag pointer */
                frag = (mca_btl_smcuda_frag_t *)((char*)((uintptr_t)hdr &
                                                     (~(MCA_BTL_SMCUDA_FRAG_TYPE_MASK |
                                                        MCA_BTL_SMCUDA_FRAG_STATUS_MASK))));

                endpoint = frag->endpoint;
                btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
                    /* completion callback */
                    frag->base.des_cbfunc(&mca_btl_smcuda.super, frag->endpoint,
                                          &frag->base, status?OPAL_ERROR:OPAL_SUCCESS);
                }
                if( btl_ownership ) {
                    MCA_BTL_SMCUDA_FRAG_RETURN(frag);
                }
                OPAL_THREAD_ADD_FETCH32(&mca_btl_smcuda_component.num_outstanding_frags, -1);
                if ( 0 < opal_list_get_size(&endpoint->pending_sends) ) {
                    btl_smcuda_process_pending_sends(endpoint);
                }
                /* an ACK freed a slot, so re-poll the same fifo right away */
                goto recheck_peer;
            }
            default:
                /* unknown */
                /*
                 * This code path should presumably never be called.
                 * It's unclear if it should exist or, if so, how it should be written.
                 * If we want to return it to the sending process,
                 * we have to figure out who the sender is.
                 * It seems we need to subtract the mask bits.
                 * Then, hopefully this is an sm header that has an smp_rank field.
                 * Presumably that means the received header was relative.
                 * Or, maybe this code should just be removed.
                 */
                opal_output(0, "mca_btl_smcuda_component_progress read an unknown type of header");
                hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
                peer_smp_rank = hdr->my_smp_rank;
                hdr = (mca_btl_smcuda_hdr_t*)((uintptr_t)hdr->frag |
                        MCA_BTL_SMCUDA_FRAG_STATUS_MASK);
                MCA_BTL_SMCUDA_FIFO_WRITE(
                        mca_btl_smcuda_component.sm_peers[peer_smp_rank],
                        my_smp_rank, peer_smp_rank, hdr, false, true, rc);
                break;
        }
    }
    (void)rc; /* this is safe to ignore as the message is requeued till success */

#if OPAL_CUDA_SUPPORT
    /* Check to see if there are any outstanding CUDA events that have
     * completed.  If so, issue the PML callbacks on the fragments.
     */
    while (1 == progress_one_cuda_ipc_event((mca_btl_base_descriptor_t **)&frag)) {
        mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t) frag->base.des_cbfunc;

        cbfunc (&mca_btl_smcuda.super, frag->endpoint, frag->segment.seg_addr.pval,
                frag->local_handle, frag->base.des_context, frag->base.des_cbdata,
                OPAL_SUCCESS);

        /* NOTE(review): the frag is only returned to the free list when a
         * registration is attached; if registration is NULL the frag is
         * presumably returned elsewhere -- confirm no leak on that path. */
        if(frag->registration != NULL) {
            frag->endpoint->rcache->rcache_deregister (frag->endpoint->rcache,
                                                       (mca_rcache_base_registration_t*)frag->registration);
            frag->registration = NULL;
            MCA_BTL_SMCUDA_FRAG_RETURN(frag);
        }
        nevents++;
    }
#endif /* OPAL_CUDA_SUPPORT */
    return nevents;
}

/* [<][>][^][v][top][bottom][index][help] */