root/opal/mca/btl/btl.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2016 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2006-2018 Los Alamos National Security, LLC.  All rights
  14  *                         reserved.
  15  * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
  16  * Copyright (c) 2012-2013 NVIDIA Corporation.  All rights reserved.
  17  * Copyright (c) 2015      Cisco Systems, Inc.  All rights reserved.
  18  * Copyright (c) 2015      Research Organization for Information Science
  19  *                         and Technology (RIST). All rights reserved.
  20  * $COPYRIGHT$
  21  *
  22  * Additional copyrights may follow
  23  *
  24  * $HEADER$
  25  */
  26 /**
  27  * @file
  28  *
  29  * Byte Transfer Layer (BTL)
  30  *
  31  *
  32  * BTL Initialization:
  33  *
  34  * During library initialization, all available BTL components are
  35  * loaded and opened via their mca_base_open_component_fn_t
  36  * function. The BTL open function should register any mca parameters
  37  * used to tune/adjust the behaviour of the BTL (mca_base_var_register() or
  38  * mca_base_component_var_register()). Note that the open function may fail
  39  * if the resources (e.g. shared libraries, etc) required by the network
  40  * transport are not available.
  41  *
  42  * The mca_btl_base_component_init_fn_t() is then called for each of the
  43  * components that are successfully opened. The component init function may
  44  * return either:
  45  *
  46  * (1) a NULL list of BTL modules if the transport is not available, or
  47  * (2) a list containing one or more BTL modules, where each module provides
  48  *     a layer of abstraction over one or more physical devices (e.g. NICs).
  49  *
  50  * During module initialization, the module should post any addressing
  51  * information required by its peers. An example would be the TCP
  52  * listen port opened by the TCP module for incoming connection
  53  * requests. This information is published to peers via the
  54  * modex_send() interface. Note that peer information is not
  55  * guaranteed to be available via modex_recv() during the
  56  * module's init function. However, it will be available during
  57  * BTL selection (mca_btl_base_module_add_procs_fn_t()).
  58  *
  59  * BTL Selection:
  60  *
  61  * The upper layer builds an ordered list of the available BTL modules sorted
  62  * by their exclusivity ranking. This is a relative ranking that is used
  63  * to determine the set of BTLs that may be used to reach a given destination.
  64  * During startup the BTL modules are queried via their
  65  * mca_btl_base_module_add_procs_fn_t() to determine if they are able to reach
  66  * a given destination.  The BTL module with the highest ranking that
  67  * returns success is selected. Subsequent BTL modules are selected only
  68  * if they have the same exclusivity ranking.
  69  *
  70  * An example of how this might be used:
  71  *
  72  * BTL         Exclusivity   Comments
  73  * --------    -----------   ------------------
  74  * LO              100       Selected exclusively for local process
  75  * SM               50       Selected exclusively for other processes on host
  76  * IB                0       Selected based on network reachability
  77  * IB                0       Selected based on network reachability
  78  * TCP               0       Selected based on network reachability
  79  * TCP               0       Selected based on network reachability
  80  *
  81  * When mca_btl_base_module_add_procs_fn_t() is called on a BTL module, the BTL
  82  * will populate an OUT variable with mca_btl_base_endpoint_t pointers.
  83  * Each pointer is treated as an opaque handle by the upper layer and is
  84  * returned to the BTL on subsequent data transfer calls to the
  85  * corresponding destination process.  The actual contents of the
  86  * data structure are defined on a per BTL basis, and may be used to
  87  * cache addressing or connection information, such as a TCP socket
  88  * or IB queue pair.
  89  *
  90  * Progress:
  91  *
  92  * By default, the library provides for polling based progress of outstanding
  93  * requests. The BTL component exports an interface function (btl_progress)
  94  * that is called in a polling mode by the PML during calls into the MPI
  95  * library. Note that the btl_progress() function is called on the BTL component
  96  * rather than each BTL module. This implies that the BTL author is responsible
  97  * for iterating over the pending operations in each of the BTL modules associated
  98  * with the component.
  99  *
 100  * On platforms where threading support is provided, the library provides the
 101  * option of building with asynchronous threaded progress. In this case, the BTL
 102  * author is responsible for providing a thread to progress pending operations.
 103  * A thread is associated with the BTL component/module such that transport specific
 104  * functionality/APIs may be used to block the thread until a pending operation
 105  * completes. This thread MUST NOT poll for completion as this would oversubscribe
 106  * the CPU.
 107  *
 108  * Note that in the threaded case the PML may choose to use a hybrid approach,
 109  * such that polling is implemented from the user thread for a fixed number of
 110  * cycles before relying on the background thread(s) to complete requests. If
 111  * possible the BTL should support the use of both modes concurrently.
 112  *
 113  */
 114 
 115 #ifndef OPAL_MCA_BTL_H
 116 #define OPAL_MCA_BTL_H
 117 
 118 #include "opal_config.h"
 119 #include "opal/types.h"
 120 #include "opal/prefetch.h" /* For OPAL_LIKELY */
 121 #include "opal/class/opal_bitmap.h"
 122 #include "opal/datatype/opal_convertor.h"
 123 #include "opal/mca/mca.h"
 124 #include "opal/mca/mpool/mpool.h"
 125 #include "opal/mca/rcache/rcache.h"
 126 #include "opal/mca/crs/crs.h"
 127 #include "opal/mca/crs/base/base.h"
 128 
 129 BEGIN_C_DECLS
 130 
 131 /*
 132  * BTL types
 133  */
 134 
 135 struct mca_btl_base_module_t;
 136 struct mca_btl_base_endpoint_t;
 137 struct mca_btl_base_descriptor_t;
 138 struct mca_mpool_base_resources_t;
 139 struct opal_proc_t;
 140 
 141 /**
 142  * Opaque registration handle for executing RDMA and atomic
 143  * operations on a memory region.
 144  *
 145  * The data inside this handle is appropriate for passing
 146  * to remote peers to execute RDMA and atomic operations. The
 147  * size needed to send the registration handle can be
 148  * obtained from the btl via the btl_registration_handle_size
 149  * member. If this size is 0 then no registration data is
 150  * needed to execute RDMA or atomic operations.
 151  */
 152 struct mca_btl_base_registration_handle_t;
 153 typedef struct mca_btl_base_registration_handle_t mca_btl_base_registration_handle_t;
 154 
 155 /* Wildcard endpoint for use in the register_mem function. The expansion is
 156  * fully parenthesized so it stays a single operand inside larger expressions. */
 157 #define MCA_BTL_ENDPOINT_ANY ((struct mca_btl_base_endpoint_t *) -1)
 158 
 159 /* send/recv operations require tag matching */
 160 typedef uint8_t mca_btl_base_tag_t;
 161 
 162 #define MCA_BTL_NO_ORDER       255
 163 
 164 /*
 165  * Communication specific defines. There are a number of active message IDs
 166  * that can be shared between all frameworks that need to communicate (i.e.
 167  * use the PML or the BTL directly). These IDs are exchanged between the
 168  * processes, therefore they need to be identical everywhere. The simplest
 169  * approach is to have them defined as constants, and give each framework a
 170  * small number. Here is the rule that defines these IDs (they are 8 bits):
 171  * - the first 3 bits are used to code the framework (i.e. PML, OSC, COLL)
 172  * - the remaining 5 bits are used internally by the framework, and divided
 173  *   based on the components requirements. Therefore, the way the PML and
 174  *   the OSC frameworks use these defines will be different. For more
 175  *   information about how these framework IDs are defined, take a look in the
 176  *   header file associated with the framework.
 177  */
 178 #define MCA_BTL_AM_FRAMEWORK_MASK   0xE0 /* top 3 bits: keeps 0x20/0x40/0x60/0x80 distinct */
 179 #define MCA_BTL_TAG_BTL             0x20
 180 #define MCA_BTL_TAG_PML             0x40
 181 #define MCA_BTL_TAG_OSC_RDMA        0x60
 182 #define MCA_BTL_TAG_USR             0x80
 183 #define MCA_BTL_TAG_MAX             255 /* 1 + highest allowed tag num */
 184 
 185 /*
 186  * Reserved tags for specific BTLs. As multiple BTLs can be active
 187  * simultaneously, their tags should not collide.
 188  */
 189 #define MCA_BTL_TAG_IB                (MCA_BTL_TAG_BTL + 0)
 190 #define MCA_BTL_TAG_UDAPL             (MCA_BTL_TAG_BTL + 1)
 191 #define MCA_BTL_TAG_SMCUDA            (MCA_BTL_TAG_BTL + 2)
 192 #define MCA_BTL_TAG_VADER             (MCA_BTL_TAG_BTL + 3)
 193 
 194 /* preferred protocol */
 195 #define MCA_BTL_FLAGS_SEND            0x0001
 196 #define MCA_BTL_FLAGS_PUT             0x0002
 197 #define MCA_BTL_FLAGS_GET             0x0004
 198 /* btls that set the MCA_BTL_FLAGS_RDMA will always get added to the BML
 199  * rdma_btls list. This allows the updated one-sided component to
 200  * use btls that are not otherwise used for send/recv. */
 201 #define MCA_BTL_FLAGS_RDMA (MCA_BTL_FLAGS_GET|MCA_BTL_FLAGS_PUT)
 202 
 203 /* btl can send directly from user buffer w/out registration */
 204 #define MCA_BTL_FLAGS_SEND_INPLACE    0x0008
 205 
 206 /* btl transport reliability flags - currently used only by the DR PML */
 207 #define MCA_BTL_FLAGS_NEED_ACK        0x0010
 208 #define MCA_BTL_FLAGS_NEED_CSUM       0x0020
 209 
 210 /** deprecated (BTL 3.0) */
 211 #define MCA_BTL_FLAGS_RDMA_MATCHED    0x0040
 212 
 213 /* btl needs local rdma completion */
 214 #define MCA_BTL_FLAGS_RDMA_COMPLETION 0x0080
 215 
 216  /* btl can do heterogeneous rdma operations on byte buffers */
 217 #define MCA_BTL_FLAGS_HETEROGENEOUS_RDMA 0x0100
 218 
 219 /* btl can support failover if enabled */
 220 #define MCA_BTL_FLAGS_FAILOVER_SUPPORT 0x0200
 221 
 222 #define MCA_BTL_FLAGS_CUDA_PUT        0x0400
 223 #define MCA_BTL_FLAGS_CUDA_GET        0x0800
 224 #define MCA_BTL_FLAGS_CUDA_RDMA (MCA_BTL_FLAGS_CUDA_GET|MCA_BTL_FLAGS_CUDA_PUT)
 225 #define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND 0x1000
 226 #define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV 0x2000
 227 
 228 /* btl can support signaled operations. BTLs that support this flag are
 229  * expected to provide a mechanism for asynchronous progress on descriptors
 230  * where the feature is requested. BTLs should also be aware that users can
 231  * (and probably will) turn this flag on and off using the MCA variable
 232  * system.
 233  */
 234 #define MCA_BTL_FLAGS_SIGNALED        0x4000
 235 
 236 /** The BTL supports network atomic operations */
 237 #define MCA_BTL_FLAGS_ATOMIC_OPS      0x08000
 238 /** The BTL supports fetching network atomic operations */
 239 #define MCA_BTL_FLAGS_ATOMIC_FOPS     0x10000
 240 
 241 /** The BTL requires add_procs to be with all procs including non-local. Shared-memory
 242  * BTLs should not set this flag. */
 243 #define MCA_BTL_FLAGS_SINGLE_ADD_PROCS 0x20000
 244 
 245 /* The BTL is using progress thread and need the protection on matching */
 246 #define MCA_BTL_FLAGS_BTL_PROGRESS_THREAD_ENABLED 0x40000
 247 
 248 /* The BTL supports RDMA flush */
 249 #define MCA_BTL_FLAGS_RDMA_FLUSH      0x80000
 250 
 251 /* Default exclusivity levels */
 252 #define MCA_BTL_EXCLUSIVITY_HIGH     (64*1024) /* internal loopback */
 253 #define MCA_BTL_EXCLUSIVITY_DEFAULT  1024      /* GM/IB/etc. */
 254 #define MCA_BTL_EXCLUSIVITY_LOW      0         /* TCP used as a last resort */
 255 
 256 /* error callback flags */
 257 #define MCA_BTL_ERROR_FLAGS_FATAL 0x1
 258 #define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2
 259 #define MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC 0x4
 260 
 261 /** Registration flags. The access flags are a 1-1 mapping with the rcache
 262  * access flags (MCA_RCACHE_ACCESS_*) they are defined from. */
 263 enum {
 264     /** Allow local write on the registered region. A region registered
 265      * with this flag can have its registration used as the local handle
 266      * for a btl_get operation. */
 267     MCA_BTL_REG_FLAG_LOCAL_WRITE   = MCA_RCACHE_ACCESS_LOCAL_WRITE,
 268     /** Allow remote read on the registered region. A region registered
 269      * with this flag can have its registration used as the remote handle
 270      * for a btl_get operation. */
 271     MCA_BTL_REG_FLAG_REMOTE_READ   = MCA_RCACHE_ACCESS_REMOTE_READ,
 272     /** Allow remote write on the registered region. A region registered
 273      * with this flag can have its registration used as the remote handle
 274      * for a btl_put operation. */
 275     MCA_BTL_REG_FLAG_REMOTE_WRITE  = MCA_RCACHE_ACCESS_REMOTE_WRITE,
 276     /** Allow remote atomic operations on the registered region. A region
 277      * registered with this flag can have its registration used as the remote
 278      * handle for a btl_atomic_op or btl_atomic_fop operation. */
 279     MCA_BTL_REG_FLAG_REMOTE_ATOMIC = MCA_RCACHE_ACCESS_REMOTE_ATOMIC,
 280     /** Allow any btl operation on the registered region. A region registered
 281      * with this flag can have its registration used as the local or remote
 282      * handle for any btl operation. */
 283     MCA_BTL_REG_FLAG_ACCESS_ANY    = MCA_RCACHE_ACCESS_ANY,
 284 #if OPAL_CUDA_GDR_SUPPORT
 285     /** Region is in GPU memory (only defined when OPAL_CUDA_GDR_SUPPORT is set) */
 286     MCA_BTL_REG_FLAG_CUDA_GPU_MEM  = 0x00010000,
 287 #endif
 288 };
 289 
 290 /** Bit flags describing which atomic operations a btl supports */
 291 enum {
 292     /** Atomic add is supported */
 293     MCA_BTL_ATOMIC_SUPPORTS_ADD    = 1 << 0,
 294     /** Atomic bitwise and is supported */
 295     MCA_BTL_ATOMIC_SUPPORTS_AND    = 1 << 9,
 296     /** Atomic bitwise or is supported */
 297     MCA_BTL_ATOMIC_SUPPORTS_OR     = 1 << 10,
 298     /** Atomic bitwise exclusive or is supported */
 299     MCA_BTL_ATOMIC_SUPPORTS_XOR    = 1 << 11,
 300 
 301     /** Logical and is supported */
 302     MCA_BTL_ATOMIC_SUPPORTS_LAND   = 1 << 12,
 303     /** Logical or is supported */
 304     MCA_BTL_ATOMIC_SUPPORTS_LOR    = 1 << 13,
 305     /** Logical exclusive or is supported */
 306     MCA_BTL_ATOMIC_SUPPORTS_LXOR   = 1 << 14,
 307 
 308     /** Atomic swap is supported */
 309     MCA_BTL_ATOMIC_SUPPORTS_SWAP   = 1 << 16,
 310 
 311     /** Atomic min is supported */
 312     MCA_BTL_ATOMIC_SUPPORTS_MIN    = 1 << 20,
 313     /** Atomic max is supported */
 314     MCA_BTL_ATOMIC_SUPPORTS_MAX    = 1 << 21,
 315 
 316     /** 32-bit integer operations are supported. Keep in mind the btl may
 317      * support only a subset of the available atomics. */
 318     MCA_BTL_ATOMIC_SUPPORTS_32BIT  = 1 << 24,
 319 
 320     /** Floating-point operations are supported. Keep in mind the btl may
 321      * support only a subset of the available atomics and may not support
 322      * both 64- and 32-bit floating point. */
 323     MCA_BTL_ATOMIC_SUPPORTS_FLOAT  = 1 << 25,
 324 
 325     /** Atomic compare-and-swap is supported */
 326     MCA_BTL_ATOMIC_SUPPORTS_CSWAP  = 1 << 28,
 327 
 328     /** Global atomicity is guaranteed (btl atomics can be mixed with cpu atomics) */
 329     MCA_BTL_ATOMIC_SUPPORTS_GLOB   = 1 << 29,
 330 };
 331 
 332 enum {
 333     /** Perform the operation with 32-bit atomics */
 334     MCA_BTL_ATOMIC_FLAG_32BIT = 1 << 0,
 335     /** Perform the operation with floating-point atomics */
 336     MCA_BTL_ATOMIC_FLAG_FLOAT = 1 << 1,
 337 };
 338 
 339 enum mca_btl_base_atomic_op_t {
 340     /** Add: (*remote_address) = (*remote_address) + operand */
 341     MCA_BTL_ATOMIC_ADD  = 0x01,
 342     /** Bitwise and: (*remote_address) = (*remote_address) & operand */
 343     MCA_BTL_ATOMIC_AND  = 0x11,
 344     /** Bitwise or: (*remote_address) = (*remote_address) | operand */
 345     MCA_BTL_ATOMIC_OR   = 0x12,
 346     /** Bitwise xor: (*remote_address) = (*remote_address) ^ operand */
 347     MCA_BTL_ATOMIC_XOR  = 0x14,
 348     /** Logical and: (*remote_address) = (*remote_address) && operand */
 349     MCA_BTL_ATOMIC_LAND = 0x15,
 350     /** Logical or: (*remote_address) = (*remote_address) || operand */
 351     MCA_BTL_ATOMIC_LOR  = 0x16,
 352     /** Logical xor: (*remote_address) = (*remote_address) != operand */
 353     MCA_BTL_ATOMIC_LXOR = 0x17,
 354     /** Swap: (*remote_address) = operand */
 355     MCA_BTL_ATOMIC_SWAP = 0x1a,
 356     /** Minimum */
 357     MCA_BTL_ATOMIC_MIN  = 0x20,
 358     /** Maximum */
 359     MCA_BTL_ATOMIC_MAX  = 0x21,
 360     /** Sentinel: takes the value following MCA_BTL_ATOMIC_MAX */
 361     MCA_BTL_ATOMIC_LAST,
 362 };
 363 typedef enum mca_btl_base_atomic_op_t mca_btl_base_atomic_op_t;
 364 
 365 /**
 366  * Asynchronous callback invoked when an operation completes.
 367  * Completion Semantics: The descriptor can be reused or returned to the
 368  *  BTL via mca_btl_base_module_free_fn_t. The operation has been queued to
 369  *  the network device or will otherwise make asynchronous progress without
 370  *  subsequent calls to btl_progress.
 371  *
 372  * @param[IN] module      the BTL module
 373  * @param[IN] endpoint    the BTL endpoint
 374  * @param[IN] descriptor  the BTL descriptor
 375  * @param[IN] status      status of the operation
 376  */
 377 typedef void (*mca_btl_base_completion_fn_t)(
 378     struct mca_btl_base_module_t *module,
 379     struct mca_btl_base_endpoint_t *endpoint,
 380     struct mca_btl_base_descriptor_t *descriptor,
 381     int status);
 382 
 383 
 384 /**
 385  * Asynchronous callback invoked when an rdma or atomic operation completes.
 386  * Completion Semantics: The rdma or atomic memory operation has completed
 387  * remotely (i.e. is remotely visible) and the caller is free to deregister
 388  * the local_handle or modify the memory in local_address.
 389  *
 390  * @param[IN] module        the BTL module
 391  * @param[IN] endpoint      the BTL endpoint
 392  * @param[IN] local_address local address for the operation (if any)
 393  * @param[IN] local_handle  local handle associated with the local_address
 394  * @param[IN] context       callback context supplied to the rdma/atomic operation
 395  * @param[IN] cbdata        callback data supplied to the rdma/atomic operation
 396  * @param[IN] status        status of the operation
 397  *
 398  */
 399 typedef void (*mca_btl_base_rdma_completion_fn_t)(
 400     struct mca_btl_base_module_t *module,
 401     struct mca_btl_base_endpoint_t *endpoint,
 402     void *local_address,
 403     struct mca_btl_base_registration_handle_t *local_handle,
 404     void *context,
 405     void *cbdata,
 406     int status);
 407 
 408 
 409 /**
 410  * Describes a region/segment of memory that is addressable
 411  * by a BTL.
 412  *
 413  * Note: In many cases the alloc and prepare methods of BTLs
 414  * do not return a mca_btl_base_segment_t but instead return a
 415  * subclass. Take extreme care when modifying BTL segments so
 416  * that internal BTL data is not overwritten.
 417  *
 418  * All BTLs MUST use base segments when calling registered
 419  * callbacks.
 420  *
 421  * BTLs MUST use mca_btl_base_segment_t or a subclass and
 422  * MUST store their segment length in btl_seg_size. BTLs
 423  * MUST specify a segment no larger than MCA_BTL_SEG_MAX_SIZE.
 424  */
 425 
 426 struct mca_btl_base_segment_t {
 427     /** Address of the memory region */
 428     opal_ptr_t seg_addr;
 429     /** Length of the region in bytes */
 430     uint64_t seg_len;
 431 };
 432 typedef struct mca_btl_base_segment_t mca_btl_base_segment_t;
 433 
 434 
 435 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && !defined(WORDS_BIGENDIAN)
 436 #define MCA_BTL_BASE_SEGMENT_HTON(s)                   \
 437         (s).seg_addr.lval = hton64((s).seg_addr.lval); \
 438         (s).seg_len = hton64((s).seg_len);
 439 #define MCA_BTL_BASE_SEGMENT_NTOH(s)                   \
 440         (s).seg_addr.lval = ntoh64((s).seg_addr.lval); \
 441         (s).seg_len = ntoh64((s).seg_len);
 442 #else
 443 #define MCA_BTL_BASE_SEGMENT_HTON(s)
 444 #define MCA_BTL_BASE_SEGMENT_NTOH(s)
 445 #endif
 446 /**
 447  * Descriptor carrying the parameters of a send/put/get
 448  * operation together with the callback routine that is
 449  * invoked when the request completes.
 450  * Note: receive callbacks will store the incoming data segments in
 451  *       des_segments
 452  */
 453 
 454 struct mca_btl_base_descriptor_t {
 455     opal_free_list_item_t super;
 456     mca_btl_base_segment_t *des_segments;     /**< local segments */
 457     size_t des_segment_count;                 /**< how many local segments there are */
 458     mca_btl_base_completion_fn_t des_cbfunc;  /**< local completion callback */
 459     void *des_cbdata;                         /**< opaque data for the callback */
 460     void *des_context;                        /**< additional opaque callback data */
 461     uint32_t des_flags;                       /**< hints to the BTL */
 462     /** Order value. It is only valid in
 463         the local completion callback and
 464         may be passed to subsequent calls to
 465         btl_alloc, btl_prepare_src to request
 466         a descriptor that will be ordered w.r.t.
 467         this descriptor.
 468     */
 469     uint8_t order;
 470 };
 471 typedef struct mca_btl_base_descriptor_t mca_btl_base_descriptor_t;
 472 
 473 OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
 474 
 475 #define MCA_BTL_DES_FLAGS_PRIORITY          0x0001
 476 /* Allow the BTL to dispose the descriptor once the callback
 477  * associated was triggered.
 478  */
 479 #define MCA_BTL_DES_FLAGS_BTL_OWNERSHIP     0x0002
 480 /* Allow the BTL to avoid calling the descriptor callback
 481  * if the send succeeded in the btl_send (i.e. in the fast path).
 482  */
 483 #define MCA_BTL_DES_SEND_ALWAYS_CALLBACK    0x0004
 484 
 485 /* Tell the PML that the copy is being done asynchronously
 486  */
 487 #define MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC   0x0008
 488 
 489 /* Type of transfer that will be done with this frag.
 490  */
 491 #define MCA_BTL_DES_FLAGS_PUT               0x0010
 492 #define MCA_BTL_DES_FLAGS_GET               0x0020
 493 
 494 /* Ask the BTL to wake the remote process (send/sendi) or local process
 495  * (put/get) to handle this message. The BTL may ignore this flag if
 496  * signaled operations are not supported.
 497  */
 498 #define MCA_BTL_DES_FLAGS_SIGNAL            0x0040
 499 
 500 /**
 501  * Maximum number of allowed segments in src/dst fields of a descriptor.
 502  */
 503 #define MCA_BTL_DES_MAX_SEGMENTS 16
 504 
 505 /**
 506  * Maximum size of a BTL segment (NTH: does it really save us anything
 507  * to hardcode this?)
 508  */
 509 #define MCA_BTL_SEG_MAX_SIZE 256
 510 
 511 /**
 512  * Maximum size of a BTL registration handle in bytes
 513  */
 514 #define MCA_BTL_REG_HANDLE_MAX_SIZE 256
 515 
 516 /*
 517  *  Base header shared by BTLs; at a minimum it stores the tag
 518  */
 519 struct mca_btl_base_header_t {
 520     mca_btl_base_tag_t tag;
 521 };
 522 typedef struct mca_btl_base_header_t mca_btl_base_header_t;
 523 /* The header conversion macros below expand to nothing */
 524 #define MCA_BTL_BASE_HEADER_HTON(hdr)
 525 #define MCA_BTL_BASE_HEADER_NTOH(hdr)
 526 
 527 /*
 528  *  BTL component interface functions and datatype.
 529  */
 530 
 531 /**
 532  * MCA->BTL Initializes the BTL component and creates the specific BTL
 533  * module(s).
 534  *
 535  * @param num_btls (OUT) Receives the number of btl modules created, or 0
 536  *                       if the transport is not available.
 537  *
 538  * @param enable_progress_threads (IN) Whether this component is
 539  * allowed to run a hidden/progress thread.
 540  *
 541  * @param enable_mpi_threads (IN) Whether support for multiple MPI
 542  * threads is enabled (i.e., MPI_THREAD_MULTIPLE), which
 543  * indicates whether multiple threads may invoke this component
 544  * simultaneously.
 545  *
 546  * @return Array of pointers to BTL modules, or NULL if the transport
 547  *         is not available.
 548  *
 549  * During component initialization, the BTL component should discover
 550  * the physical devices that are available for the given transport,
 551  * and create a BTL module to represent each device. Any addressing
 552  * information that peers need to reach the device should be published
 553  * from within this function via the modex_send() interface.
 554  *
 555  */
 556 
 557 typedef struct mca_btl_base_module_t **(*mca_btl_base_component_init_fn_t)(
 558     int *num_btls,
 559     bool enable_progress_threads,
 560     bool enable_mpi_threads
 561 );
 562 
 563 /**
 564  * MCA->BTL Called to progress outstanding requests in
 565  * non-threaded polling environments.
 566  *
 567  * @return           Count of "completions", a metric of
 568  *                   how many items were completed in the call
 569  *                   to progress.
 570  */
 571 
 572 typedef int (*mca_btl_base_component_progress_fn_t)(void);
 573 
 574 
 575 /**
 576  * Callback function that is invoked asynchronously when the
 577  * transport layer receives data.
 578  * Note that the mca_btl_base_descriptor_t is only valid within the
 579  * completion function. This implies that all data payload in the
 580  * mca_btl_base_descriptor_t must be copied out within this callback or
 581  * forfeited back to the BTL.
 582  * Note also that descriptor segments (des_segments) must be base
 583  * segments for all callbacks.
 584  *
 585  * @param[IN] btl        BTL module
 586  * @param[IN] tag        The active message receive callback tag value
 587  * @param[IN] descriptor The BTL descriptor (contains the receive payload)
 588  * @param[IN] cbdata     Opaque callback data
 589  */
 590 
 591 typedef void (*mca_btl_base_module_recv_cb_fn_t)(
 592     struct mca_btl_base_module_t *btl,
 593     mca_btl_base_tag_t tag,
 594     mca_btl_base_descriptor_t *descriptor,
 595     void *cbdata
 596 );
 597 /* Pairs a receive callback with the opaque data handed back to it */
 598 typedef struct mca_btl_active_message_callback_t {
 599     mca_btl_base_module_recv_cb_fn_t cbfunc;
 600     void *cbdata;
 601 } mca_btl_active_message_callback_t;
 602 /* Callback table holding MCA_BTL_TAG_MAX entries (presumably indexed by tag) */
 603 OPAL_DECLSPEC extern
 604 mca_btl_active_message_callback_t mca_btl_base_active_message_trigger[MCA_BTL_TAG_MAX];
 605 
 606 /**
 607  *  Descriptor for a BTL component. It holds the component version
 608  *  information and the component open/close/init functions.
 609  */
 610 
 611 struct mca_btl_base_component_3_0_0_t {
 612     mca_base_component_t btl_version;
 613     mca_base_component_data_t btl_data;
 614     mca_btl_base_component_init_fn_t btl_init;         /**< component init function */
 615     mca_btl_base_component_progress_fn_t btl_progress; /**< component progress function */
 616 };
 617 typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_3_0_0_t;
 618 typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_t;
 619 
 620 /*  add the 2_0_0_t typedef for source compatibility
 621  *  we can do this safely because 2_0_0 components are the same as
 622  *  3_0_0 components, the difference is in the btl module.
 623  *  Unfortunately 2_0_0 modules are not compatible with BTL 3_0_0 and
 624  *  can not be used with the new interface.
 625  */
 626 typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_2_0_0_t;
 627 
 628 
 629 /*
 630  * BTL module interface functions and datatype.
 631  */
 632 
 633 /**
 634  * MCA->BTL Release any resources held by the BTL module
 635  * before the module is unloaded.
 636  *
 637  * @param btl (IN)   BTL module.
 638  * @return           OPAL_SUCCESS or error status on failure.
 639  *
 640  * Before a BTL module is unloaded, the MCA framework calls the
 641  * module's BTL finalize method. The BTL should release every
 642  * resource it holds and, if required, free the memory that
 643  * corresponds to the BTL module.
 644  *
 645  */
 646 typedef int (*mca_btl_base_module_finalize_fn_t)(
 647     struct mca_btl_base_module_t *btl
 648 );
 649 
 650 /**
 651  * BML->BTL notification of change in the process list.
 652  *
 653  * @param btl (IN)            BTL module
 654  * @param nprocs (IN)         Number of processes
 655  * @param procs (IN)          Array of processes
 656  * @param endpoints (OUT)     Array of mca_btl_base_endpoint_t pointers returned by the BTL.
 657  * @param reachable (OUT)     Bitmask indicating set of peer processes that are reachable by this BTL.
 658  * @return                    OPAL_SUCCESS or error status on failure.
 659  *
 660  * The BML calls mca_btl_base_module_add_procs_fn_t() to determine
 661  * the set of BTLs that should be used to reach each process.
 662  * Any addressing information exported by the peer via the modex_send()
 663  * function should be available during this call via the corresponding
 664  * modex_recv() function. The BTL may utilize this information to
 665  * determine reachability of each peer process.
 666  *
 667  * The caller may pass a "reachable" bitmap pointer.  If it is not
 668  * NULL, for each process that is reachable by the BTL, the bit
 669  * corresponding to the index into the proc array (nprocs) should be
 670  * set in the reachable bitmask. The BTL will return an array of
 671  * pointers to a data structure defined by the BTL that is then
 672  * handed back to the BTL on subsequent calls to the BTL data transfer
 673  * functions (e.g. btl_send). The BTL may use it to cache any
 674  * addressing or connection information (e.g. TCP socket, IB queue
 675  * pair).
 676  */
 677 typedef int (*mca_btl_base_module_add_procs_fn_t)(
 678     struct mca_btl_base_module_t *btl,
 679     size_t nprocs,
 680     struct opal_proc_t **procs,
 681     struct mca_btl_base_endpoint_t **endpoints,
 682     struct opal_bitmap_t *reachable
 683 );
 684 
 685 /**
 686  * Notification of change to the process list.
 687  *
 688  * @param btl (IN)     BTL module
 689  * @param nprocs (IN)  Number of processes
 690  * @param procs (IN)   Set of processes
 691  * @param peer (IN)    Set of peer addressing information.
 692  * @return             Status indicating if cleanup was successful
 693  *
 694  * Whenever the process list changes, the BML notifies the BTL of
 695  * the change so that it has the opportunity to clean up or release
 696  * any resources associated with the peer.
 697  */
 698 typedef int (*mca_btl_base_module_del_procs_fn_t)(
 699     struct mca_btl_base_module_t *btl,
 700     size_t nprocs,
 701     struct opal_proc_t **procs,
 702     struct mca_btl_base_endpoint_t **peer
 703 );
 704 
 705 /**
 706  * Register a callback function that is invoked when a
 707  * fragment is received.
 708  *
 709  * @param[IN] btl      BTL module
 710  * @param[IN] tag      tag value of this callback
 711  *                     (specified on subsequent send operations)
 712  * @param[IN] cbfunc   The callback function
 713  * @param[IN] cbdata   Opaque callback data
 714  *
 715  * @return OPAL_SUCCESS The callback was registered successfully
 716  * @return OPAL_ERROR   The callback was NOT registered successfully
 717  *
 718  */
 719 typedef int (*mca_btl_base_module_register_fn_t)(
 720     struct mca_btl_base_module_t *btl,
 721     mca_btl_base_tag_t tag,
 722     mca_btl_base_module_recv_cb_fn_t cbfunc,
 723     void *cbdata
 724 );
 725 
 726 
 727 /**
 728  * Callback function that is called asynchronously on receipt
 729  * of an error from the transport layer. It returns no value.
 730  *
 731  * @param[IN] btl     BTL module
 732  * @param[IN] flags   Type of error
 733  * @param[IN] errproc Process that had an error
 734  * @param[IN] btlinfo Descriptive string from the BTL
 735  */
 736 
 737 typedef void (*mca_btl_base_module_error_cb_fn_t)(
 738         struct mca_btl_base_module_t* btl,
 739         int32_t flags,
 740         struct opal_proc_t* errproc,
 741         char* btlinfo
 742 );
 743 
 744 
 745 /**
 746  * Register a callback function that is called on receipt
 747  * of an error.
 748  *
 749  * @param[IN] btl       BTL module
 750  * @param[IN] cbfunc    The callback function (mca_btl_base_module_error_cb_fn_t)
 751  *
 752  * @retval OPAL_SUCCESS The callback was registered successfully
 753  * @retval OPAL_ERROR   The callback was NOT registered successfully
 754  *
 755  */
 756 typedef int (*mca_btl_base_module_register_error_fn_t)(
 757     struct mca_btl_base_module_t* btl,
 758     mca_btl_base_module_error_cb_fn_t cbfunc
 759 );
 760 
 761 
 762 /**
 763  * Allocate a descriptor with a segment of the requested size. Note that
 764  * the BTL layer may choose to return a smaller size if it cannot support
 765  * the request. The order tag value ensures that operations on the
 766  * descriptor that is allocated will be ordered w.r.t. a previous operation
 767  * on a particular descriptor. Ordering is only guaranteed if the previous
 768  * descriptor had its local completion callback function called, and the
 769  * order tag of that descriptor is only valid upon that callback.
 770  *
 771  * @param btl (IN)      BTL module
 772  * @param endpoint (IN) BTL addressing information
 773  * @param order (IN)    The ordering tag (may be MCA_BTL_NO_ORDER)
 774  * @param size (IN)     Requested segment size
 775  * @param flags (IN)    Flags for this allocation
 776  */
 777 
 778 typedef mca_btl_base_descriptor_t* (*mca_btl_base_module_alloc_fn_t)(
 779     struct mca_btl_base_module_t* btl,
 780     struct mca_btl_base_endpoint_t* endpoint,
 781     uint8_t order,
 782     size_t size,
 783     uint32_t flags
 784 );
 785 
 786 /**
 787  * Return a descriptor allocated from this BTL via alloc/prepare.
 788  * A descriptor can only be deallocated after its local completion
 789  * callback function has been called for all send/put/get operations.
 790  *
 791  * @param btl (IN)         BTL module
 792  * @param descriptor (IN)  Descriptor allocated from the BTL
 793  */
 794 typedef int (*mca_btl_base_module_free_fn_t)(
 795     struct mca_btl_base_module_t* btl,
 796     mca_btl_base_descriptor_t* descriptor
 797 );
 798 
 799 
 800 /**
 801  * Prepare a descriptor for send using the supplied convertor. If the convertor
 802  * references data that is contiguous, the descriptor may simply point to the
 803  * user buffer. Otherwise, this routine is responsible for allocating buffer
 804  * space and packing if required.
 805  *
 806  * The order tag value ensures that operations on the
 807  * descriptor that is prepared will be ordered w.r.t. a previous
 808  * operation on a particular descriptor. Ordering is only guaranteed if
 809  * the previous descriptor had its local completion callback function
 810  * called and the order tag of that descriptor is only valid upon the local
 811  * completion callback function.
 812  *
 813  * @param btl (IN)          BTL module
 814  * @param endpoint (IN)     BTL peer addressing
 815  * @param convertor (IN)    Data type convertor
 816  * @param order (IN)        The ordering tag (may be MCA_BTL_NO_ORDER)
 817  * @param reserve (IN)      Additional bytes requested by upper layer to precede user data
 818  * @param size (IN/OUT)     Number of bytes to prepare (IN),
 819  *                          number of bytes actually prepared (OUT)
 820  * @param flags (IN)        Flags for this prepare operation
 821  *
 822  */
 823 typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)(
 824     struct mca_btl_base_module_t* btl,
 825     struct mca_btl_base_endpoint_t* endpoint,
 826     struct opal_convertor_t* convertor,
 827     uint8_t order,
 828     size_t reserve,
 829     size_t* size,
 830     uint32_t flags
 831 );
 832 
 833 /**
 834  * @brief Register a memory region for put/get/atomic operations.
 835  *
 836  * @param btl (IN)         BTL module
 837  * @param endpoint (IN)    BTL addressing information (or NULL for all endpoints)
 838  * @param base (IN)        Pointer to start of region
 839  * @param size (IN)        Size of region
 840  * @param flags (IN)       Flags including access permissions
 841  *
 842  * @returns a memory registration handle valid for both local and remote operations
 843  * @returns NULL if the region could not be registered
 844  *
 845  * This function registers the specified region with the hardware for use with
 846  * the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
 847  * functions. Care should be taken to not hold an excessive number of registrations
 848  * as they may use limited system/NIC resources.
 849  *
 850  * Ownership of the memory pointed to by the returned (struct
 851  * mca_btl_base_registration_handle_t*) is passed to the caller.  The
 852  * BTL module cannot free or reuse the handle until it is returned via
 853  * the mca_btl_base_module_deregister_mem_fn_t function.
 854  */
 855 typedef struct mca_btl_base_registration_handle_t *(*mca_btl_base_module_register_mem_fn_t)(
 856     struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
 857     size_t size, uint32_t flags);
 858 
 859 /**
 860  * @brief Deregister a memory region
 861  *
 862  * @param btl (IN)         BTL module region was registered with
 863  * @param handle (IN)      BTL registration handle to deregister
 864  *
 865  * This function deregisters the memory region associated with the specified handle. Care
 866  * should be taken to not perform any RDMA or atomic operation on this memory region
 867  * after it is deregistered. It is erroneous to specify a memory handle associated with
 868  * a remote node.
 869  *
 870  * The handle passed in will be a value previously returned by the
 871  * mca_btl_base_module_register_mem_fn_t function.  Ownership of the
 872  * memory pointed to by handle passes to the BTL module; this function
 873  * is now allowed to free the memory, return it to a freelist, etc.
 874  */
 875 typedef int (*mca_btl_base_module_deregister_mem_fn_t)(
 876     struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle);
 877 
 878 /**
 879  * Initiate an asynchronous send.
 880  * Completion Semantics: the descriptor has been queued for a send operation;
 881  *                       the BTL now controls the descriptor until local
 882  *                       completion callback is made on the descriptor
 883  *
 884  * All BTLs allow multiple concurrent asynchronous send operations on a descriptor
 885  *
 886  * @param btl (IN)         BTL module
 887  * @param endpoint (IN)    BTL addressing information
 888  * @param descriptor (IN)  Description of the data to be transferred
 889  * @param tag (IN)         The tag value used to notify the peer.
 890  *
 891  * @retval OPAL_SUCCESS    The descriptor was successfully queued for a send
 892  * @retval OPAL_ERROR      The descriptor was NOT successfully queued for a send
 893  * @retval OPAL_ERR_UNREACH The endpoint is not reachable
 894  */
 895 typedef int (*mca_btl_base_module_send_fn_t)(
 896     struct mca_btl_base_module_t* btl,
 897     struct mca_btl_base_endpoint_t* endpoint,
 898     struct mca_btl_base_descriptor_t* descriptor,
 899     mca_btl_base_tag_t tag
 900 );
 901 
 902 /**
 903  * Initiate an immediate blocking send.
 904  * Completion Semantics: the BTL will make a best effort
 905  *  to send the header and "payload_size" bytes from the datatype using the convertor.
 906  *  The header is guaranteed to be delivered entirely in the first segment.
 907  *  Should the BTL be unable to deliver the data due to resource constraints,
 908  *  the BTL will return a descriptor (via the OUT param)
 909  *  of size "payload_size + header_size".
 910  *
 911  * @param btl (IN)             BTL module
 912  * @param endpoint (IN)        BTL addressing information
 913  * @param convertor (IN)       Data type convertor
 914  * @param header (IN)          Pointer to header.
 915  * @param header_size (IN)     Size of header.
 916  * @param payload_size (IN)    Size of payload (from convertor).
 917  * @param order (IN)           The ordering tag (may be MCA_BTL_NO_ORDER)
 918  * @param flags (IN)           Flags.
 919  * @param tag (IN)             The tag value used to notify the peer.
 920  * @param descriptor (OUT)     The descriptor returned if the data could not be sent
 921  *                             immediately (may be NULL).
 922  *
 923  * @retval OPAL_SUCCESS           The send was successfully queued
 924  * @retval OPAL_ERROR             The send failed
 925  * @retval OPAL_ERR_UNREACH       The endpoint is not reachable
 926  * @retval OPAL_ERR_RESOURCE_BUSY The BTL is busy; a descriptor will be returned
 927  *                                (via the OUT param) if descriptors are available
 928  */
 929 
 930 typedef int (*mca_btl_base_module_sendi_fn_t)(
 931     struct mca_btl_base_module_t* btl,
 932     struct mca_btl_base_endpoint_t* endpoint,
 933     struct opal_convertor_t* convertor,
 934     void* header,
 935     size_t header_size,
 936     size_t payload_size,
 937     uint8_t order,
 938     uint32_t flags,
 939     mca_btl_base_tag_t tag,
 940     mca_btl_base_descriptor_t** descriptor
 941  );
 942 
 943 /**
 944  * Initiate an asynchronous put.
 945  * Completion Semantics: if this function returns a 1 then the operation
 946  *                       is complete. A return of OPAL_SUCCESS indicates
 947  *                       the put operation has been queued with the
 948  *                       network. The local_handle cannot be deregistered
 949  *                       until all outstanding operations on that handle
 950  *                       have been completed.
 951  *
 952  * @param btl (IN)            BTL module
 953  * @param endpoint (IN)       BTL addressing information
 954  * @param local_address (IN)  Local address to put from (registered)
 955  * @param remote_address (IN) Remote address to put to (registered remotely)
 956  * @param local_handle (IN)   Registration handle for region containing
 957  *                            (local_address, local_address + size)
 958  * @param remote_handle (IN)  Remote registration handle for region containing
 959  *                            (remote_address, remote_address + size)
 960  * @param size (IN)           Number of bytes to put
 961  * @param flags (IN)          Flags for this put operation
 962  * @param order (IN)          Ordering
 963  * @param cbfunc (IN)         Function to call on completion (if queued)
 964  * @param cbcontext (IN)      Context for the callback
 965  * @param cbdata (IN)         Data for callback
 966  *
 967  * @retval OPAL_SUCCESS    The put operation was successfully queued
 968  * @retval OPAL_ERROR      The put operation was NOT successfully queued
 969  * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the put
 970  *                         operation. Try again later
 971  * @retval OPAL_ERR_NOT_AVAILABLE  Put cannot be performed due to size or
 972  *                         alignment restrictions.
 973  */
 974 typedef int (*mca_btl_base_module_put_fn_t) (struct mca_btl_base_module_t *btl,
 975     struct mca_btl_base_endpoint_t *endpoint, void *local_address,
 976     uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
 977     struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
 978     int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
 979 
 980 /**
 981  * Initiate an asynchronous get.
 982  * Completion Semantics: if this function returns a 1 then the operation
 983  *                       is complete. A return of OPAL_SUCCESS indicates
 984  *                       the get operation has been queued with the
 985  *                       network. The local_handle cannot be deregistered
 986  *                       until all outstanding operations on that handle
 987  *                       have been completed.
 988  *
 989  * @param btl (IN)            BTL module
 990  * @param endpoint (IN)       BTL addressing information
 991  * @param local_address (IN)  Local address to get into (registered)
 992  * @param remote_address (IN) Remote address to get from (registered remotely)
 993  * @param local_handle (IN)   Registration handle for region containing
 994  *                            (local_address, local_address + size)
 995  * @param remote_handle (IN)  Remote registration handle for region containing
 996  *                            (remote_address, remote_address + size)
 997  * @param size (IN)           Number of bytes to get
 998  * @param flags (IN)          Flags for this get operation
 999  * @param order (IN)          Ordering
1000  * @param cbfunc (IN)         Function to call on completion (if queued)
1001  * @param cbcontext (IN)      Context for the callback
1002  * @param cbdata (IN)         Data for callback
1003  *
1004  * @retval OPAL_SUCCESS    The get operation was successfully queued
1005  * @retval OPAL_ERROR      The get operation was NOT successfully queued
1006  * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the get
1007  *                         operation. Try again later
1008  * @retval OPAL_ERR_NOT_AVAILABLE  Get cannot be performed due to size or
1009  *                         alignment restrictions.
1010  */
1011 typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl,
1012     struct mca_btl_base_endpoint_t *endpoint, void *local_address,
1013     uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
1014     struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
1015     int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
1016 
1017 /**
1018  * Initiate an asynchronous atomic operation.
1019  * Completion Semantics: if this function returns a 1 then the operation
1020  *                       is complete. A return of OPAL_SUCCESS indicates
1021  *                       the atomic operation has been queued with the
1022  *                       network.
1023  *
1024  * @param btl (IN)            BTL module
1025  * @param endpoint (IN)       BTL addressing information
1026  * @param remote_address (IN) Remote address to perform the operation on (registered remotely)
1027  * @param remote_handle (IN)  Remote registration handle for region containing
1028  *                            (remote_address, remote_address + 8)
1029  * @param op (IN)             Operation to perform
1030  * @param operand (IN)        Operand for the operation
1031  * @param flags (IN)          Flags for this atomic operation
1032  * @param order (IN)          Ordering
1033  * @param cbfunc (IN)         Function to call on completion (if queued)
1034  * @param cbcontext (IN)      Context for the callback
1035  * @param cbdata (IN)         Data for callback
1036  *
1037  * @retval OPAL_SUCCESS    The operation was successfully queued
1038  * @retval 1               The operation is complete
1039  * @retval OPAL_ERROR      The operation was NOT successfully queued
1040  * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the atomic
1041  *                         operation. Try again later
1042  * @retval OPAL_ERR_NOT_AVAILABLE  Atomic operation cannot be performed due to
1043  *                         alignment restrictions or the operation {op} is not supported
1044  *                         by the hardware.
1045  *
1046  * After the operation is complete the remote address specified by {remote_address} and
1047  * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
1048  * The btl will guarantee consistency of atomic operations performed via the btl. Note,
1049  * however, that not all btls will provide consistency between btl atomic operations and
1050  * cpu or other btl atomics.
1051  */
1052 typedef int (*mca_btl_base_module_atomic_op64_fn_t) (struct mca_btl_base_module_t *btl,
1053     struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address,
1054     struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
1055     uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
1056     void *cbcontext, void *cbdata);
1057 
1058 /**
1059  * Initiate an asynchronous fetching atomic operation.
1060  * Completion Semantics: if this function returns a 1 then the operation
1061  *                       is complete. A return of OPAL_SUCCESS indicates
1062  *                       the atomic operation has been queued with the
1063  *                       network.
1064  *
1065  * @param btl (IN)            BTL module
1066  * @param endpoint (IN)       BTL addressing information
1067  * @param local_address (OUT) Local address to store the result in
1068  * @param remote_address (IN) Remote address to perform the operation on (registered remotely)
1069  * @param local_handle (IN)   Local registration handle for region containing
1070  *                            (local_address, local_address + 8)
1071  * @param remote_handle (IN)  Remote registration handle for region containing
1072  *                            (remote_address, remote_address + 8)
1073  * @param op (IN)             Operation to perform
1074  * @param operand (IN)        Operand for the operation
1075  * @param flags (IN)          Flags for this atomic operation
1076  * @param order (IN)          Ordering
1077  * @param cbfunc (IN)         Function to call on completion (if queued)
1078  * @param cbcontext (IN)      Context for the callback
1079  * @param cbdata (IN)         Data for callback
1080  *
1081  * @retval OPAL_SUCCESS    The operation was successfully queued
1082  * @retval 1               The operation is complete
1083  * @retval OPAL_ERROR      The operation was NOT successfully queued
1084  * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the atomic
1085  *                         operation. Try again later
1086  * @retval OPAL_ERR_NOT_AVAILABLE  Atomic operation cannot be performed due to
1087  *                         alignment restrictions or the operation {op} is not supported
1088  *                         by the hardware.
1089  *
1090  * After the operation is complete the remote address specified by {remote_address} and
1091  * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
1092  * {local_address} will be updated with the previous value stored in {remote_address}.
1093  * The btl will guarantee consistency of atomic operations performed via the btl. Note,
1094  * however, that not all btls will provide consistency between btl atomic operations and
1095  * cpu or other btl atomics.
1096  */
1097 typedef int (*mca_btl_base_module_atomic_fop64_fn_t) (struct mca_btl_base_module_t *btl,
1098     struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
1099     struct mca_btl_base_registration_handle_t *local_handle,
1100     struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
1101     uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
1102     void *cbcontext, void *cbdata);
1103 
1104 /**
1105  * Initiate an asynchronous compare and swap operation.
1106  * Completion Semantics: if this function returns a 1 then the operation
1107  *                       is complete. A return of OPAL_SUCCESS indicates
1108  *                       the atomic operation has been queued with the
1109  *                       network.
1110  *
1111  * @param btl (IN)            BTL module
1112  * @param endpoint (IN)       BTL addressing information
1113  * @param local_address (OUT) Local address to store the result in
1114  * @param remote_address (IN) Remote address to perform the operation on (registered remotely)
1115  * @param local_handle (IN)   Local registration handle for region containing
1116  *                            (local_address, local_address + 8)
1117  * @param remote_handle (IN)  Remote registration handle for region containing
1118  *                            (remote_address, remote_address + 8)
1119  * @param compare (IN)        Value compared against *remote_address
1120  * @param value (IN)          Value to store on success
1121  * @param flags (IN)          Flags for this atomic operation
1122  * @param order (IN)          Ordering
1123  * @param cbfunc (IN)         Function to call on completion (if queued)
1124  * @param cbcontext (IN)      Context for the callback
1125  * @param cbdata (IN)         Data for callback
1126  *
1127  * @retval OPAL_SUCCESS    The operation was successfully queued
1128  * @retval 1               The operation is complete
1129  * @retval OPAL_ERROR      The operation was NOT successfully queued
1130  * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the atomic
1131  *                         operation. Try again later
1132  * @retval OPAL_ERR_NOT_AVAILABLE  Atomic operation cannot be performed due to
1133  *                         alignment restrictions or the operation is not supported
1134  *                         by the hardware.
1135  *
1136  * After the operation is complete the remote address specified by {remote_address} and
1137  * {remote_handle} will be updated with {value} if *remote_address == compare.
1138  * {local_address} will be updated with the previous value stored in {remote_address}.
1139  * The btl will guarantee consistency of atomic operations performed via the btl. Note,
1140  * however, that not all btls will provide consistency between btl atomic operations and
1141  * cpu atomics.
1142  */
1143 typedef int (*mca_btl_base_module_atomic_cswap64_fn_t) (struct mca_btl_base_module_t *btl,
1144     struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
1145     struct mca_btl_base_registration_handle_t *local_handle,
1146     struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare,
1147     uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
1148     void *cbcontext, void *cbdata);
1149 
1150 /**
1151  * Diagnostic dump of BTL state. Returns no value.
1152  *
1153  * @param btl (IN)         BTL module
1154  * @param endpoint (IN)    BTL endpoint
1155  * @param verbose (IN)     Verbosity level
1156  */
1157 
1158 typedef void (*mca_btl_base_module_dump_fn_t)(
1159     struct mca_btl_base_module_t* btl,
1160     struct mca_btl_base_endpoint_t* endpoint,
1161     int verbose
1162 );
1163 
1164 /**
1165  * Fault Tolerance Event Notification Function
1166  * @param state (IN) Checkpoint status
1167  * @return OPAL_SUCCESS or failure status
1168  */
1169 typedef int (*mca_btl_base_module_ft_event_fn_t)(int state);
1170 
1171 /**
1172  * Flush all outstanding RDMA operations on an endpoint or all endpoints.
1173  *
1174  * @param btl (IN)         BTL module
1175  * @param endpoint (IN)    Endpoint to flush (NULL == all)
1176  *
1177  * This function returns when all outstanding RDMA (put, get, atomic) operations
1178  * that were started prior to the flush call have completed. This call does
1179  * NOT guarantee that all BTL callbacks have completed.
1180  *
1181  * The BTL is allowed to ignore the endpoint parameter and flush *all* endpoints.
1182  */
1183 typedef int (*mca_btl_base_module_flush_fn_t) (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint);
1184 
1185 /**
1186  * BTL module interface functions and attributes.
1187  */
1188 struct mca_btl_base_module_t {
1189 
1190     /* BTL common attributes */
1191     mca_btl_base_component_t* btl_component; /**< pointer back to the BTL component structure */
1192     size_t      btl_eager_limit;      /**< maximum size of first fragment -- eager send */
1193     size_t      btl_rndv_eager_limit;    /**< size of the data sent in the first fragment of the rendezvous protocol */
1194     size_t      btl_max_send_size;    /**< maximum send fragment size supported by the BTL */
1195     size_t      btl_rdma_pipeline_send_length; /**< number of bytes that should be sent by the pipeline protocol */
1196     size_t      btl_rdma_pipeline_frag_size; /**< maximum rdma fragment size supported by the BTL */
1197     size_t      btl_min_rdma_pipeline_size; /**< minimum packet size for pipeline protocol  */
1198     uint32_t    btl_exclusivity;      /**< indicates this BTL should be used exclusively */
1199     uint32_t    btl_latency;          /**< relative ranking of latency used to prioritize btls */
1200     uint32_t    btl_bandwidth;        /**< bandwidth (Mbytes/sec) supported by each endpoint */
1201     uint32_t    btl_flags;            /**< flags (put/get...) */
1202     uint32_t    btl_atomic_flags;     /**< atomic operations supported (add, and, xor, etc) */
1203     size_t      btl_registration_handle_size; /**< size of the BTL's registration handles */
1204 
1205     /* One-sided limitations (0 for no alignment, SIZE_MAX for no limit) */
1206     size_t      btl_get_limit;        /**< maximum size supported by the btl_get function */
1207     size_t      btl_get_alignment;    /**< minimum alignment/size needed by btl_get (power of 2) */
1208     size_t      btl_put_limit;        /**< maximum size supported by the btl_put function */
1209     size_t      btl_put_alignment;    /**< minimum alignment/size needed by btl_put (power of 2) */
1210 
1211     /* minimum transaction sizes for which registration is required for local memory */
1212     size_t      btl_get_local_registration_threshold;
1213     size_t      btl_put_local_registration_threshold;
1214 
1215     /* BTL function table */
1216     mca_btl_base_module_add_procs_fn_t      btl_add_procs;
1217     mca_btl_base_module_del_procs_fn_t      btl_del_procs; /**< notification of change to the process list */
1218     mca_btl_base_module_register_fn_t       btl_register; /**< register a fragment-receipt callback for a tag */
1219     mca_btl_base_module_finalize_fn_t       btl_finalize;
1220 
1221     mca_btl_base_module_alloc_fn_t          btl_alloc; /**< allocate a descriptor */
1222     mca_btl_base_module_free_fn_t           btl_free; /**< return a descriptor obtained via alloc/prepare */
1223     mca_btl_base_module_prepare_fn_t        btl_prepare_src; /**< prepare a descriptor for send using a convertor */
1224     mca_btl_base_module_send_fn_t           btl_send; /**< initiate an asynchronous send */
1225     mca_btl_base_module_sendi_fn_t          btl_sendi; /**< initiate an immediate blocking send */
1226     mca_btl_base_module_put_fn_t            btl_put; /**< initiate an asynchronous put */
1227     mca_btl_base_module_get_fn_t            btl_get; /**< initiate an asynchronous get */
1228     mca_btl_base_module_dump_fn_t           btl_dump; /**< diagnostic dump of btl state */
1229 
1230     /* atomic operations */
1231     mca_btl_base_module_atomic_op64_fn_t    btl_atomic_op; /**< asynchronous atomic operation */
1232     mca_btl_base_module_atomic_fop64_fn_t   btl_atomic_fop; /**< asynchronous fetching atomic operation */
1233     mca_btl_base_module_atomic_cswap64_fn_t btl_atomic_cswap; /**< asynchronous compare and swap */
1234 
1235     /* new memory registration functions */
1236     mca_btl_base_module_register_mem_fn_t   btl_register_mem;   /**< memory registration function (NULL if not needed) */
1237     mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem; /**< memory deregistration function (NULL if not needed) */
1238 
1239     /** the mpool associated with this btl (optional) */
1240     mca_mpool_base_module_t*             btl_mpool;
1241     /** register a default error handler */
1242     mca_btl_base_module_register_error_fn_t btl_register_error;
1243     /** fault tolerance event notification */
1244     mca_btl_base_module_ft_event_fn_t btl_ft_event;
1245 #if OPAL_CUDA_GDR_SUPPORT
1246     size_t      btl_cuda_eager_limit;  /**< switch from eager to RDMA */
1247     size_t      btl_cuda_rdma_limit;   /**< switch from RDMA to rndv pipeline */
1248 #endif /* OPAL_CUDA_GDR_SUPPORT */
1249 #if OPAL_CUDA_SUPPORT
1250     size_t      btl_cuda_max_send_size;   /**< set if CUDA max send_size is different from host max send size */
1251 #endif /* OPAL_CUDA_SUPPORT */
1252 
1253     mca_btl_base_module_flush_fn_t btl_flush; /**< flush all previous operations on an endpoint */
1254 
1255     unsigned char padding[256]; /**< padding to future-proof the btl module */
1256 };
1257 typedef struct mca_btl_base_module_t mca_btl_base_module_t;
1258 
1259 /*
1260  * Macro for use in modules that are of type btl v3.1.0
1261  */
1262 #define MCA_BTL_BASE_VERSION_3_1_0              \
1263     OPAL_MCA_BASE_VERSION_2_1_0("btl", 3, 1, 0)
1264 /* Expands to the BTL base version, the component name, and a version built from the OPAL version numbers */
1265 #define MCA_BTL_DEFAULT_VERSION(name)                       \
1266     MCA_BTL_BASE_VERSION_3_1_0,                             \
1267     .mca_component_name = name,                             \
1268     MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, \
1269                           OPAL_RELEASE_VERSION)
1270 
1271 /**
1272  * Convenience macro for detecting the BTL interface version.
1273  */
1274 #define BTL_VERSION 310
1275 
1276 END_C_DECLS
1277 
1278 #endif /* OPAL_MCA_BTL_H */

/* [<][>][^][v][top][bottom][index][help] */