root/opal/mca/btl/smcuda/btl_smcuda.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes following definitions.
  1. sm_fifo_init
  2. sm_fifo_write
  3. sm_fifo_read

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2009 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
  14  * Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
  15  * Copyright (c) 2010-2015 Los Alamos National Security, LLC.
  16  *                         All rights reserved.
  17  * Copyright (c) 2012-2013 NVIDIA Corporation.  All rights reserved.
  18  * $COPYRIGHT$
  19  *
  20  * Additional copyrights may follow
  21  *
  22  * $HEADER$
  23  */
  24 /**
  25  * @file
  26  */
  27 #ifndef MCA_BTL_SMCUDA_H
  28 #define MCA_BTL_SMCUDA_H
  29 
  30 #include "opal_config.h"
  31 #include <stddef.h>
  32 #include <stdlib.h>
  33 #include <string.h>
  34 #include <stdint.h>
  35 #ifdef HAVE_SCHED_H
  36 #include <sched.h>
  37 #endif  /* HAVE_SCHED_H */
  38 
  39 #include "opal/util/bit_ops.h"
  40 #include "opal/class/opal_free_list.h"
  41 #include "opal/mca/btl/btl.h"
  42 #include "opal/mca/common/sm/common_sm.h"
  43 
  44 BEGIN_C_DECLS
  45 
  46 /*
  47  * Shared Memory FIFOs
  48  *
  49  * The FIFO is implemented as a circular queue with head and tail pointers
  50  * (integer indices).  For efficient wraparound indexing, the size of the
  51  * queue is constrained to be a power of two and we "&" indices with a "mask".
  52  *
  53  * More than one process can write to the FIFO head.  Therefore, there is a head
  54  * lock.  One cannot write until the head slot is empty, indicated by the special
  55  * queue entry SM_FIFO_FREE.
  56  *
  57  * Only the receiver can read the FIFO tail.  Therefore, the tail lock is
  58  * required only in multithreaded applications.  If a tail read returns the
  59  * SM_FIFO_FREE value, that means the FIFO is empty.  Once a non-FREE value
  60  * has been read, the queue slot is *not* automatically reset to SM_FIFO_FREE.
  61  * Rather, read tail slots are reset "lazily" (see "lazy_free" and "num_to_clear")
  62  * to reduce the number of memory barriers and improve performance.
  63  *
  64  * Since the FIFO lives in shared memory that is mapped differently into
  65  * each address space, the "queue" pointer is relative (each process must
  66  * add its own offset) and the queue_recv pointer is meaningful only in the
  67  * receiver's address space.
  68  *
  69  * Since multiple processes access different parts of the FIFO structure in
  70  * different ways, we introduce padding to keep different parts on different
  71  * cachelines.
  72  */
  73 
  74 #define SM_FIFO_FREE  (void *) (-2)
  75 /* We can't use opal_cache_line_size here because we need a
  76    compile-time constant for padding the struct.  We can't really have
  77    a compile-time constant that is portable, either (e.g., compile on
  78    one machine and run on another).  So just use a big enough cache
  79    line that should hopefully be good in most places. */
  80 #define SM_CACHE_LINE_PAD 128
  81 
  82 struct sm_fifo_t {
  83     /* This queue pointer is used only by the heads. */
  84     volatile void **queue;
  85     char pad0[SM_CACHE_LINE_PAD - sizeof(void **)];
  86     /* This lock is used by the heads. */
  87     opal_atomic_lock_t head_lock;
  88     char pad1[SM_CACHE_LINE_PAD - sizeof(opal_atomic_lock_t)];
  89     /* This index is used by the head holding the head lock. */
  90     volatile int head;
  91     char pad2[SM_CACHE_LINE_PAD - sizeof(int)];
  92     /* This mask is used "read only" by all processes. */
  93     unsigned int mask;
  94     char pad3[SM_CACHE_LINE_PAD - sizeof(int)];
  95     /* The following are used only by the tail. */
  96     volatile void **queue_recv;
  97     opal_atomic_lock_t tail_lock;
  98     volatile int tail;
  99     int num_to_clear;
 100     int lazy_free;
 101     char pad4[SM_CACHE_LINE_PAD - sizeof(void **) -
 102               sizeof(opal_atomic_lock_t) -
 103               sizeof(int) * 3];
 104 };
 105 typedef struct sm_fifo_t sm_fifo_t;
 106 
 107 /*
  108  * Shared Memory resource management
 109  */
 110 
 111 #if OPAL_ENABLE_PROGRESS_THREADS == 1
 112 #define DATA (char)0
 113 #define DONE (char)1
 114 #endif
 115 
 116 typedef struct mca_btl_smcuda_mem_node_t {
 117     mca_mpool_base_module_t* sm_mpool; /**< shared memory pool */
 118 } mca_btl_smcuda_mem_node_t;
 119 
 120 /**
 121  * Shared Memory (SM) BTL module.
 122  */
 123 struct mca_btl_smcuda_component_t {
 124     mca_btl_base_component_2_0_0_t super;  /**< base BTL component */
 125     int sm_free_list_num;              /**< initial size of free lists */
 126     int sm_free_list_max;              /**< maximum size of free lists */
 127     int sm_free_list_inc;              /**< number of elements to alloc when growing free lists */
 128     int sm_max_procs;                  /**< upper limit on the number of processes using the shared memory pool */
 129     int sm_extra_procs;                /**< number of extra procs to allow */
 130     char* sm_mpool_name;               /**< name of shared memory pool module */
 131     mca_mpool_base_module_t **sm_mpools; /**< shared memory pools (one for each memory node) */
 132     mca_mpool_base_module_t *sm_mpool; /**< mpool on local node */
 133     void* sm_mpool_base;               /**< base address of shared memory pool */
 134     size_t eager_limit;                /**< first fragment size */
 135     size_t max_frag_size;              /**< maximum (second and beyone) fragment size */
 136     opal_mutex_t sm_lock;
 137     mca_common_sm_module_t *sm_seg;   /**< description of shared memory segment */
 138     volatile sm_fifo_t **shm_fifo;     /**< pointer to fifo 2D array in shared memory */
 139     char **shm_bases;                  /**< pointer to base pointers in shared memory */
 140     uint16_t *shm_mem_nodes;           /**< pointer to mem noded in shared memory */
 141     sm_fifo_t **fifo;                  /**< cached copy of the pointer to the 2D
 142                                           fifo array.  The address in the shared
 143                                           memory segment sm_ctl_header is a relative,
 144                                           but this one, in process private memory, is
 145                                           a real virtual address */
 146     uint16_t *mem_nodes;               /**< cached copy of mem nodes of each local rank */
 147     unsigned int fifo_size;            /**< number of FIFO queue entries */
 148     unsigned int fifo_lazy_free;       /**< number of reads before lazy fifo free is triggered */
 149     int nfifos;                        /**< number of FIFOs per receiver */
 150     int32_t num_smp_procs;             /**< current number of smp procs on this host */
 151     int32_t my_smp_rank;               /**< My SMP process rank.  Used for accessing
 152                                         *   SMP specfic data structures. */
 153     opal_free_list_t sm_frags_eager;   /**< free list of sm first */
 154     opal_free_list_t sm_frags_max;     /**< free list of sm second */
 155     opal_free_list_t sm_frags_user;
 156     opal_free_list_t sm_first_frags_to_progress;  /**< list of first
 157                                                     fragments that are
 158                                                     awaiting resources */
 159     struct mca_btl_base_endpoint_t **sm_peers;
 160 
 161     opal_free_list_t pending_send_fl;
 162     opal_atomic_int32_t num_outstanding_frags;         /**< number of fragments sent but not yet returned to free list */
 163     opal_atomic_int32_t num_pending_sends;             /**< total number on all of my pending-send queues */
 164     int mem_node;
 165     int num_mem_nodes;
 166 
 167 #if OPAL_ENABLE_PROGRESS_THREADS == 1
 168     char sm_fifo_path[PATH_MAX];   /**< path to fifo used to signal this process */
 169     int  sm_fifo_fd;               /**< file descriptor corresponding to opened fifo */
 170     opal_thread_t sm_fifo_thread;
 171 #endif
 172     struct mca_btl_smcuda_t      **sm_btls;
 173     struct mca_btl_smcuda_frag_t **table;
 174     size_t sm_num_btls;
 175     size_t sm_max_btls;
 176 
 177 
 178     /** MCA: should we be using knem or not?  neg=try but continue if
 179         not available, 0=don't try, 1=try and fail if not available */
 180     int use_knem;
 181 
 182     /** MCA: minimal message size (bytes) to offload on DMA engine
 183         when using knem */
 184     unsigned int knem_dma_min;
 185 
 186     /** MCA: how many simultaneous ongoing knem operations to
 187         support */
 188     int knem_max_simultaneous;
 189 
 190     /** If we want DMA and DMA is supported, this will be loaded with
 191         KNEM_FLAG_DMA.  Otherwise, it'll be 0. */
 192     int knem_dma_flag;
 193 
 194     /** MCA: should we be using CMA or not?
 195         0 = no, 1 = yes */
 196     int use_cma;
 197 
 198     /* /// well-known file names for sm and sm mpool init /// */
 199     char *sm_mpool_ctl_file_name;
 200     char *sm_mpool_rndv_file_name;
 201     char *sm_ctl_file_name;
 202     char *sm_rndv_file_name;
 203 #if OPAL_CUDA_SUPPORT
 204     int cuda_ipc_verbose;
 205     int cuda_ipc_output;
 206     int use_cuda_ipc;
 207     int use_cuda_ipc_same_gpu;
 208 #endif /* OPAL_CUDA_SUPPORT */
 209     unsigned long mpool_min_size;
 210     char *allocator;
 211 };
 212 typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t;
 213 OPAL_MODULE_DECLSPEC extern mca_btl_smcuda_component_t mca_btl_smcuda_component;
 214 
 215 /**
 216  * SM BTL Interface
 217  */
 218 struct mca_btl_smcuda_t {
 219     mca_btl_base_module_t  super;       /**< base BTL interface */
 220     bool btl_inited;  /**< flag indicating if btl has been inited */
 221     mca_btl_base_module_error_cb_fn_t error_cb;
 222     mca_rcache_base_module_t *rcache;
 223 };
 224 typedef struct mca_btl_smcuda_t mca_btl_smcuda_t;
 225 OPAL_MODULE_DECLSPEC extern mca_btl_smcuda_t mca_btl_smcuda;
 226 
 227 struct btl_smcuda_pending_send_item_t
 228 {
 229     opal_free_list_item_t super;
 230     void *data;
 231 };
 232 typedef struct btl_smcuda_pending_send_item_t btl_smcuda_pending_send_item_t;
 233 
 234 /***
 235  * FIFO support for sm BTL.
 236  */
 237 
 238 /***
 239  * One or more FIFO components may be a pointer that must be
 240  * accessed by multiple processes.  Since the shared region may
 241  * be mmapped differently into each process's address space,
 242  * these pointers will be relative to some base address.  Here,
 243  * we define macros to translate between relative addresses and
 244  * virtual addresses.
 245  */
 246 #define VIRTUAL2RELATIVE(VADDR ) ((long)(VADDR)  - (long)mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank])
 247 #define RELATIVE2VIRTUAL(OFFSET) ((long)(OFFSET) + (long)mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank])
 248 
 249 static inline int sm_fifo_init(int fifo_size, mca_mpool_base_module_t *mpool,
 250                                sm_fifo_t *fifo, int lazy_free)
 251 {
 252     int i, qsize;
 253 
 254     /* figure out the queue size (a power of two that is at least 1) */
 255     qsize = opal_next_poweroftwo_inclusive (fifo_size);
 256 
 257     /* allocate the queue in the receiver's address space */
 258     fifo->queue_recv = (volatile void **)mpool->mpool_alloc(
 259             mpool, sizeof(void *) * qsize, opal_cache_line_size, 0);
 260     if(NULL == fifo->queue_recv) {
 261         return OPAL_ERR_OUT_OF_RESOURCE;
 262     }
 263 
 264     /* initialize the queue */
 265     for ( i = 0; i < qsize; i++ )
 266         fifo->queue_recv[i] = SM_FIFO_FREE;
 267 
 268     /* shift queue address to be relative */
 269     fifo->queue = (volatile void **) VIRTUAL2RELATIVE(fifo->queue_recv);
 270 
 271     /* initialize the locks */
 272     opal_atomic_lock_init(&(fifo->head_lock), OPAL_ATOMIC_LOCK_UNLOCKED);
 273     opal_atomic_lock_init(&(fifo->tail_lock), OPAL_ATOMIC_LOCK_UNLOCKED);
 274     opal_atomic_unlock(&(fifo->head_lock));  /* should be unnecessary */
 275     opal_atomic_unlock(&(fifo->tail_lock));  /* should be unnecessary */
 276 
 277     /* other initializations */
 278     fifo->head = 0;
 279     fifo->mask = qsize - 1;
 280     fifo->tail = 0;
 281     fifo->num_to_clear = 0;
 282     fifo->lazy_free = lazy_free;
 283 
 284     return OPAL_SUCCESS;
 285 }
 286 
 287 
 288 static inline int sm_fifo_write(void *value, sm_fifo_t *fifo)
 289 {
 290     volatile void **q = (volatile void **) RELATIVE2VIRTUAL(fifo->queue);
 291 
 292     /* if there is no free slot to write, report exhausted resource */
 293     opal_atomic_rmb();
 294     if ( SM_FIFO_FREE != q[fifo->head] )
 295         return OPAL_ERR_OUT_OF_RESOURCE;
 296 
 297     /* otherwise, write to the slot and advance the head index */
 298     q[fifo->head] = value;
 299     opal_atomic_wmb();
 300     fifo->head = (fifo->head + 1) & fifo->mask;
 301     return OPAL_SUCCESS;
 302 }
 303 
 304 
 305 static inline void *sm_fifo_read(sm_fifo_t *fifo)
 306 {
 307     void *value;
 308 
 309     /* read the next queue entry */
 310     value = (void *) fifo->queue_recv[fifo->tail];
 311 
 312     opal_atomic_rmb();
 313 
 314     /* if you read a non-empty slot, advance the tail pointer */
 315     if ( SM_FIFO_FREE != value ) {
 316 
 317         fifo->tail = ( fifo->tail + 1 ) & fifo->mask;
 318         fifo->num_to_clear += 1;
 319 
 320         /* check if it's time to free slots, which we do lazily */
 321         if ( fifo->num_to_clear >= fifo->lazy_free ) {
 322             int i = (fifo->tail - fifo->num_to_clear ) & fifo->mask;
 323 
 324             while ( fifo->num_to_clear > 0 ) {
 325                 fifo->queue_recv[i] = SM_FIFO_FREE;
 326                 i = (i+1) & fifo->mask;
 327                 fifo->num_to_clear -= 1;
 328             }
 329             opal_atomic_wmb();
 330         }
 331     }
 332 
 333     return value;
 334 }
 335 
 336 /**
 337  * shared memory component progress.
 338  */
 339 extern int mca_btl_smcuda_component_progress(void);
 340 
 341 
 342 
 343 /**
 344  * Register a callback function that is called on error..
 345  *
 346  * @param btl (IN)     BTL module
 347  * @return             Status indicating if cleanup was successful
 348  */
 349 
 350 int mca_btl_smcuda_register_error_cb(
 351     struct mca_btl_base_module_t* btl,
 352     mca_btl_base_module_error_cb_fn_t cbfunc
 353 );
 354 
 355 /**
 356  * Cleanup any resources held by the BTL.
 357  *
 358  * @param btl  BTL instance.
 359  * @return     OPAL_SUCCESS or error status on failure.
 360  */
 361 
 362 extern int mca_btl_smcuda_finalize(
 363     struct mca_btl_base_module_t* btl
 364 );
 365 
 366 
 367 /**
 368  * PML->BTL notification of change in the process list.
 369  * PML->BTL Notification that a receive fragment has been matched.
 370  * Called for message that is send from process with the virtual
 371  * address of the shared memory segment being different than that of
 372  * the receiver.
 373  *
 374  * @param btl (IN)
 375  * @param proc (IN)
 376  * @param peer (OUT)
 377  * @return     OPAL_SUCCESS or error status on failure.
 378  *
 379  */
 380 
 381 extern int mca_btl_smcuda_add_procs(
 382     struct mca_btl_base_module_t* btl,
 383     size_t nprocs,
 384     struct opal_proc_t **procs,
 385     struct mca_btl_base_endpoint_t** peers,
 386     struct opal_bitmap_t* reachability
 387 );
 388 
 389 
 390 /**
 391  * PML->BTL notification of change in the process list.
 392  *
 393  * @param btl (IN)     BTL instance
 394  * @param proc (IN)    Peer process
 395  * @param peer (IN)    Peer addressing information.
 396  * @return             Status indicating if cleanup was successful
 397  *
 398  */
 399 extern int mca_btl_smcuda_del_procs(
 400     struct mca_btl_base_module_t* btl,
 401     size_t nprocs,
 402     struct opal_proc_t **procs,
 403     struct mca_btl_base_endpoint_t **peers
 404 );
 405 
 406 
 407 /**
 408  * Allocate a segment.
 409  *
 410  * @param btl (IN)      BTL module
 411  * @param size (IN)     Request segment size.
 412  */
 413 extern mca_btl_base_descriptor_t* mca_btl_smcuda_alloc(
 414     struct mca_btl_base_module_t* btl,
 415     struct mca_btl_base_endpoint_t* endpoint,
 416     uint8_t order,
 417     size_t size,
 418     uint32_t flags
 419 );
 420 
 421 /**
 422  * Return a segment allocated by this BTL.
 423  *
 424  * @param btl (IN)      BTL module
 425  * @param segment (IN)  Allocated segment.
 426  */
 427 extern int mca_btl_smcuda_free(
 428     struct mca_btl_base_module_t* btl,
 429     mca_btl_base_descriptor_t* segment
 430 );
 431 
 432 
 433 /**
 434  * Pack data
 435  *
 436  * @param btl (IN)      BTL module
 437  * @param peer (IN)     BTL peer addressing
 438  */
 439 struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
 440     struct mca_btl_base_module_t* btl,
 441     struct mca_btl_base_endpoint_t* endpoint,
 442     struct opal_convertor_t* convertor,
 443     uint8_t order,
 444     size_t reserve,
 445     size_t* size,
 446     uint32_t flags
 447 );
 448 
 449 
 450 /**
 451  * Initiate an inlined send to the peer or return a descriptor.
 452  *
 453  * @param btl (IN)      BTL module
 454  * @param peer (IN)     BTL peer addressing
 455  */
 456 extern int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl,
 457                              struct mca_btl_base_endpoint_t* endpoint,
 458                              struct opal_convertor_t* convertor,
 459                              void* header,
 460                              size_t header_size,
 461                              size_t payload_size,
 462                              uint8_t order,
 463                              uint32_t flags,
 464                              mca_btl_base_tag_t tag,
 465                              mca_btl_base_descriptor_t** descriptor );
 466 
 467 /**
 468  * Initiate a send to the peer.
 469  *
 470  * @param btl (IN)      BTL module
 471  * @param peer (IN)     BTL peer addressing
 472  */
 473 extern int mca_btl_smcuda_send(
 474     struct mca_btl_base_module_t* btl,
 475     struct mca_btl_base_endpoint_t* endpoint,
 476     struct mca_btl_base_descriptor_t* descriptor,
 477     mca_btl_base_tag_t tag
 478 );
 479 
 480 #if OPAL_CUDA_SUPPORT
 481 /**
 482  * Remote get using device memory.
 483  */
 484 int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl,
 485     struct mca_btl_base_endpoint_t *ep, void *local_address,
 486     uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
 487     struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
 488     int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
 489 
 490 /* CUDA IPC control message tags */
 491 enum ipcCtrlMsg {
 492     IPC_REQ = 10,
 493     IPC_ACK,
 494     IPC_NOTREADY,
 495 };
 496 
 497 /* CUDA IPC control message */
 498 typedef struct ctrlhdr_st {
 499         enum ipcCtrlMsg ctag;
 500         int cudev;
 501 } ctrlhdr_t;
 502 
 503 /* State of setting up CUDA IPC on an endpoint */
 504 enum ipcState {
 505     IPC_INIT = 1,
 506     IPC_SENT,
 507     IPC_ACKING,
 508     IPC_ACKED,
 509     IPC_OK,
 510     IPC_BAD
 511 };
 512 
 513 #endif /* OPAL_CUDA_SUPPORT */
 514 
 515 
 516 extern void mca_btl_smcuda_dump(struct mca_btl_base_module_t* btl,
 517                             struct mca_btl_base_endpoint_t* endpoint,
 518                             int verbose);
 519 
 520 /**
 521  * Fault Tolerance Event Notification Function
 522  * @param state Checkpoint Stae
 523  * @return OPAL_SUCCESS or failure status
 524  */
 525 int mca_btl_smcuda_ft_event(int state);
 526 
 527 #if OPAL_ENABLE_PROGRESS_THREADS == 1
 528 void mca_btl_smcuda_component_event_thread(opal_object_t*);
 529 #endif
 530 
 531 #if OPAL_ENABLE_PROGRESS_THREADS == 1
 532 #define MCA_BTL_SMCUDA_SIGNAL_PEER(peer) \
 533 { \
 534     unsigned char cmd = DATA; \
 535     if(write(peer->fifo_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) { \
 536         opal_output(0, "mca_btl_smcuda_send: write fifo failed: errno=%d\n", errno); \
 537     } \
 538 }
 539 #else
 540 #define MCA_BTL_SMCUDA_SIGNAL_PEER(peer)
 541 #endif
 542 
 543 END_C_DECLS
 544 
 545 #endif
 546 

/* [<][>][^][v][top][bottom][index][help] */