1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2016 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2005 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2006-2018 Los Alamos National Security, LLC. All rights
14 * reserved.
15 * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
16 * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
17 * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
18 * Copyright (c) 2015 Research Organization for Information Science
19 * and Technology (RIST). All rights reserved.
20 * $COPYRIGHT$
21 *
22 * Additional copyrights may follow
23 *
24 * $HEADER$
25 */
26 /**
27 * @file
28 *
29 * Byte Transfer Layer (BTL)
30 *
31 *
32 * BTL Initialization:
33 *
34 * During library initialization, all available BTL components are
35 * loaded and opened via their mca_base_open_component_fn_t
36 * function. The BTL open function should register any mca parameters
 * used to tune/adjust the behaviour of the BTL (via mca_base_var_register()
 * or mca_base_component_var_register()). Note that the open function may fail
39 * if the resources (e.g. shared libraries, etc) required by the network
40 * transport are not available.
41 *
42 * The mca_btl_base_component_init_fn_t() is then called for each of the
 * components that are successfully opened. The component init function may
44 * return either:
45 *
46 * (1) a NULL list of BTL modules if the transport is not available,
 * (2) a list containing one or more BTL modules, where the BTL provides
 * a layer of abstraction over one or more physical devices (e.g. NICs).
49 *
50 * During module initialization, the module should post any addressing
51 * information required by its peers. An example would be the TCP
52 * listen port opened by the TCP module for incoming connection
53 * requests. This information is published to peers via the
54 * modex_send() interface. Note that peer information is not
55 * guaranteed to be available via modex_recv() during the
56 * module's init function. However, it will be available during
57 * BTL selection (mca_btl_base_add_proc_fn_t()).
58 *
59 * BTL Selection:
60 *
61 * The upper layer builds an ordered list of the available BTL modules sorted
62 * by their exclusivity ranking. This is a relative ranking that is used
63 * to determine the set of BTLs that may be used to reach a given destination.
64 * During startup the BTL modules are queried via their
65 * mca_btl_base_add_proc_fn_t() to determine if they are able to reach
66 * a given destination. The BTL module with the highest ranking that
67 * returns success is selected. Subsequent BTL modules are selected only
68 * if they have the same exclusivity ranking.
69 *
70 * An example of how this might be used:
71 *
72 * BTL Exclusivity Comments
73 * -------- ----------- ------------------
74 * LO 100 Selected exclusively for local process
75 * SM 50 Selected exclusively for other processes on host
76 * IB 0 Selected based on network reachability
77 * IB 0 Selected based on network reachability
78 * TCP 0 Selected based on network reachability
79 * TCP 0 Selected based on network reachability
80 *
81 * When mca_btl_base_add_proc_fn_t() is called on a BTL module, the BTL
82 * will populate an OUT variable with mca_btl_base_endpoint_t pointers.
83 * Each pointer is treated as an opaque handle by the upper layer and is
84 * returned to the BTL on subsequent data transfer calls to the
85 * corresponding destination process. The actual contents of the
86 * data structure are defined on a per BTL basis, and may be used to
87 * cache addressing or connection information, such as a TCP socket
88 * or IB queue pair.
89 *
90 * Progress:
91 *
92 * By default, the library provides for polling based progress of outstanding
93 * requests. The BTL component exports an interface function (btl_progress)
94 * that is called in a polling mode by the PML during calls into the MPI
95 * library. Note that the btl_progress() function is called on the BTL component
96 * rather than each BTL module. This implies that the BTL author is responsible
97 * for iterating over the pending operations in each of the BTL modules associated
98 * with the component.
99 *
100 * On platforms where threading support is provided, the library provides the
101 * option of building with asynchronous threaded progress. In this case, the BTL
102 * author is responsible for providing a thread to progress pending operations.
103 * A thread is associated with the BTL component/module such that transport specific
104 * functionality/APIs may be used to block the thread until a pending operation
105 * completes. This thread MUST NOT poll for completion as this would oversubscribe
106 * the CPU.
107 *
108 * Note that in the threaded case the PML may choose to use a hybrid approach,
109 * such that polling is implemented from the user thread for a fixed number of
110 * cycles before relying on the background thread(s) to complete requests. If
111 * possible the BTL should support the use of both modes concurrently.
112 *
113 */
114
115 #ifndef OPAL_MCA_BTL_H
116 #define OPAL_MCA_BTL_H
117
118 #include "opal_config.h"
119 #include "opal/types.h"
120 #include "opal/prefetch.h" /* For OPAL_LIKELY */
121 #include "opal/class/opal_bitmap.h"
122 #include "opal/datatype/opal_convertor.h"
123 #include "opal/mca/mca.h"
124 #include "opal/mca/mpool/mpool.h"
125 #include "opal/mca/rcache/rcache.h"
126 #include "opal/mca/crs/crs.h"
127 #include "opal/mca/crs/base/base.h"
128
129 BEGIN_C_DECLS
130
/*
 * BTL types
 *
 * Incomplete (forward) struct declarations, so that the declarations
 * which follow can name these types through pointers.
 */

struct opal_proc_t;
struct mca_mpool_base_resources_t;
struct mca_btl_base_module_t;
struct mca_btl_base_endpoint_t;
struct mca_btl_base_descriptor_t;
140
/**
 * Opaque registration handle used to execute RDMA and atomic
 * operations on a memory region.
 *
 * The data inside this handle is appropriate for passing to remote
 * peers so that they can execute RDMA and atomic operations. The number
 * of bytes needed to send the registration handle can be obtained from
 * the btl via the btl_registration_handle_size member. If that size is
 * 0 then no registration data is needed to execute RDMA or atomic
 * operations.
 *
 * The structure is only declared here (incomplete type); this single
 * typedef both introduces the struct tag and names it.
 */
typedef struct mca_btl_base_registration_handle_t mca_btl_base_registration_handle_t;
154
155
/* Wildcard endpoint for use in the register_mem function.
 *
 * The expansion is fully parenthesized so that it always behaves as a
 * single operand. Without the outer parentheses an expression such as
 * `sizeof MCA_BTL_ENDPOINT_ANY` would parse as `sizeof(type) - 1`. */
#define MCA_BTL_ENDPOINT_ANY ((struct mca_btl_base_endpoint_t *) -1)
158
/* Tag type: send/recv operations require tag matching, and a tag is a
 * single unsigned byte. */
typedef uint8_t mca_btl_base_tag_t;

/* Value of an ordering tag that requests no particular ordering. */
#define MCA_BTL_NO_ORDER       255
163
/*
 * Communication specific defines. There are a number of active message IDs
 * that can be shared between all frameworks that need to communicate (i.e.
 * use the PML or the BTL directly). These IDs are exchanged between the
 * processes, therefore they need to be identical everywhere. The simplest
 * approach is to have them defined as constants, and give each framework a
 * small number. Here is the rule that defines these IDs (they are 8 bits):
 * - the first (most significant) 3 bits are used to code the framework
 *   (i.e. PML, OSC, COLL)
 * - the remaining 5 bits are used internally by the framework, and divided
 *   based on the components requirements. Therefore, the way the PML and
 *   the OSC frameworks use these defines will be different. For more
 *   information about how these framework IDs are defined, take a look in
 *   the header file associated with the framework.
 *
 * MCA_BTL_AM_FRAMEWORK_MASK selects exactly those 3 framework bits
 * (0xE0 == binary 1110 0000), so that (tag & mask) yields one of the
 * framework base tags below for every tag in that framework's range.
 */
#define MCA_BTL_AM_FRAMEWORK_MASK   0xE0
#define MCA_BTL_TAG_BTL             0x20
#define MCA_BTL_TAG_PML             0x40
#define MCA_BTL_TAG_OSC_RDMA        0x60
#define MCA_BTL_TAG_USR             0x80
#define MCA_BTL_TAG_MAX             255 /* 1 + highest allowed tag num */
184
/*
 * Tags reserved for individual BTLs. Several BTLs can be active at the
 * same time, so each one gets its own offset from MCA_BTL_TAG_BTL to
 * keep their tags from colliding.
 */
#define MCA_BTL_TAG_IB       (MCA_BTL_TAG_BTL + 0)
#define MCA_BTL_TAG_UDAPL    (MCA_BTL_TAG_BTL + 1)
#define MCA_BTL_TAG_SMCUDA   (MCA_BTL_TAG_BTL + 2)
#define MCA_BTL_TAG_VADER    (MCA_BTL_TAG_BTL + 3)
193
/*
 * BTL capability flags (bit mask).
 */

/* Preferred protocol */
#define MCA_BTL_FLAGS_SEND                         0x00001
#define MCA_BTL_FLAGS_PUT                          0x00002
#define MCA_BTL_FLAGS_GET                          0x00004
/* BTLs that set MCA_BTL_FLAGS_RDMA will always get added to the BML
 * rdma_btls list. This allows the updated one-sided component to
 * use BTLs that are not otherwise used for send/recv. */
#define MCA_BTL_FLAGS_RDMA                         (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET)

/* The BTL can send directly from the user buffer without registration */
#define MCA_BTL_FLAGS_SEND_INPLACE                 0x00008

/* Transport reliability flags - currently used only by the DR PML */
#define MCA_BTL_FLAGS_NEED_ACK                     0x00010
#define MCA_BTL_FLAGS_NEED_CSUM                    0x00020

/** Deprecated (BTL 3.0) */
#define MCA_BTL_FLAGS_RDMA_MATCHED                 0x00040

/* The BTL needs local RDMA completion */
#define MCA_BTL_FLAGS_RDMA_COMPLETION              0x00080

/* The BTL can do heterogeneous RDMA operations on byte buffers */
#define MCA_BTL_FLAGS_HETEROGENEOUS_RDMA           0x00100

/* The BTL can support failover if enabled */
#define MCA_BTL_FLAGS_FAILOVER_SUPPORT             0x00200

/* CUDA related capabilities */
#define MCA_BTL_FLAGS_CUDA_PUT                     0x00400
#define MCA_BTL_FLAGS_CUDA_GET                     0x00800
#define MCA_BTL_FLAGS_CUDA_RDMA                    (MCA_BTL_FLAGS_CUDA_PUT | MCA_BTL_FLAGS_CUDA_GET)
#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND         0x01000
#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV         0x02000

/* The BTL can support signaled operations. BTLs that support this flag
 * are expected to provide a mechanism for asynchronous progress on
 * descriptors where the feature is requested. BTLs should also be aware
 * that users can (and probably will) turn this flag on and off using the
 * MCA variable system.
 */
#define MCA_BTL_FLAGS_SIGNALED                     0x04000

/** The BTL supports network atomic operations */
#define MCA_BTL_FLAGS_ATOMIC_OPS                   0x08000
/** The BTL supports fetching network atomic operations */
#define MCA_BTL_FLAGS_ATOMIC_FOPS                  0x10000

/** The BTL requires add_procs to be called with all procs, including
 * non-local ones. Shared-memory BTLs should not set this flag. */
#define MCA_BTL_FLAGS_SINGLE_ADD_PROCS             0x20000

/* The BTL is using a progress thread and needs protection on matching */
#define MCA_BTL_FLAGS_BTL_PROGRESS_THREAD_ENABLED  0x40000

/* The BTL supports RDMA flush */
#define MCA_BTL_FLAGS_RDMA_FLUSH                   0x80000
250
/* Default exclusivity levels, from most to least exclusive. */
#define MCA_BTL_EXCLUSIVITY_HIGH     (64*1024)  /* internal loopback */
#define MCA_BTL_EXCLUSIVITY_DEFAULT  1024       /* GM/IB/etc. */
#define MCA_BTL_EXCLUSIVITY_LOW      0          /* TCP used as a last resort */
255
/* Flags handed to the error callback (bit mask). */
#define MCA_BTL_ERROR_FLAGS_FATAL         0x1
#define MCA_BTL_ERROR_FLAGS_NONFATAL      0x2
#define MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC  0x4
260
261 /** registration flags. the access flags are a 1-1 mapping with the mpool
262 * access flags. */
263 enum {
264 /** Allow local write on the registered region. If a region is registered
265 * with this flag the registration can be used as the local handle for a
266 * btl_get operation. */
267 MCA_BTL_REG_FLAG_LOCAL_WRITE = MCA_RCACHE_ACCESS_LOCAL_WRITE,
268 /** Allow remote read on the registered region. If a region is registered
269 * with this flag the registration can be used as the remote handle for a
270 * btl_get operation. */
271 MCA_BTL_REG_FLAG_REMOTE_READ = MCA_RCACHE_ACCESS_REMOTE_READ,
272 /** Allow remote write on the registered region. If a region is registered
273 * with this flag the registration can be used as the remote handle for a
274 * btl_put operation. */
275 MCA_BTL_REG_FLAG_REMOTE_WRITE = MCA_RCACHE_ACCESS_REMOTE_WRITE,
276 /** Allow remote atomic operations on the registered region. If a region is
277 * registered with this flag the registration can be used as the remote
278 * handle for a btl_atomic_op or btl_atomic_fop operation. */
279 MCA_BTL_REG_FLAG_REMOTE_ATOMIC = MCA_RCACHE_ACCESS_REMOTE_ATOMIC,
280 /** Allow any btl operation on the registered region. If a region is registered
281 * with this flag the registration can be used as the local or remote handle for
282 * any btl operation. */
283 MCA_BTL_REG_FLAG_ACCESS_ANY = MCA_RCACHE_ACCESS_ANY,
284 #if OPAL_CUDA_GDR_SUPPORT
285 /** Region is in GPU memory */
286 MCA_BTL_REG_FLAG_CUDA_GPU_MEM = 0x00010000,
287 #endif
288 };
289
/** Atomic operations (and operand kinds) a btl can advertise support for */
enum {
    /* -- arithmetic -- */
    /** atomic add is supported */
    MCA_BTL_ATOMIC_SUPPORTS_ADD   = 0x00000001,

    /* -- bitwise -- */
    /** atomic bitwise and is supported */
    MCA_BTL_ATOMIC_SUPPORTS_AND   = 0x00000200,
    /** atomic bitwise or is supported */
    MCA_BTL_ATOMIC_SUPPORTS_OR    = 0x00000400,
    /** atomic bitwise exclusive or is supported */
    MCA_BTL_ATOMIC_SUPPORTS_XOR   = 0x00000800,

    /* -- logical -- */
    /** logical and is supported */
    MCA_BTL_ATOMIC_SUPPORTS_LAND  = 0x00001000,
    /** logical or is supported */
    MCA_BTL_ATOMIC_SUPPORTS_LOR   = 0x00002000,
    /** logical exclusive or is supported */
    MCA_BTL_ATOMIC_SUPPORTS_LXOR  = 0x00004000,

    /** atomic swap is supported */
    MCA_BTL_ATOMIC_SUPPORTS_SWAP  = 0x00010000,

    /** atomic min is supported */
    MCA_BTL_ATOMIC_SUPPORTS_MIN   = 0x00100000,
    /** atomic max is supported */
    MCA_BTL_ATOMIC_SUPPORTS_MAX   = 0x00200000,

    /** 32-bit integer operations are supported. Keep in mind the btl
     * may support only a subset of the available atomics. */
    MCA_BTL_ATOMIC_SUPPORTS_32BIT = 0x01000000,

    /** Floating-point operations are supported. Keep in mind the btl
     * may support only a subset of the available atomics and may not
     * support both 64 and 32-bit floating point. */
    MCA_BTL_ATOMIC_SUPPORTS_FLOAT = 0x02000000,

    /** atomic compare-and-swap is supported */
    MCA_BTL_ATOMIC_SUPPORTS_CSWAP = 0x10000000,

    /** global atomicity is guaranteed (btl atomics can be mixed with
     * cpu atomics) */
    MCA_BTL_ATOMIC_SUPPORTS_GLOB  = 0x20000000,
};
331
/** Flags selecting the operand kind of an atomic operation */
enum {
    /** Operate on 32-bit values */
    MCA_BTL_ATOMIC_FLAG_32BIT = 0x1,
    /** Operate on floating-point values */
    MCA_BTL_ATOMIC_FLAG_FLOAT = 0x2,
};
338
/**
 * Identifiers of the atomic operations. In the formulas below,
 * "target" stands for (*remote_address).
 */
typedef enum mca_btl_base_atomic_op_t {
    /** Atomic add: target = target + operand */
    MCA_BTL_ATOMIC_ADD  = 0x01,
    /** Atomic bitwise and: target = target & operand */
    MCA_BTL_ATOMIC_AND  = 0x11,
    /** Atomic bitwise or: target = target | operand */
    MCA_BTL_ATOMIC_OR   = 0x12,
    /** Atomic bitwise xor: target = target ^ operand */
    MCA_BTL_ATOMIC_XOR  = 0x14,
    /** Atomic logical and: target = target && operand */
    MCA_BTL_ATOMIC_LAND = 0x15,
    /** Atomic logical or: target = target || operand */
    MCA_BTL_ATOMIC_LOR  = 0x16,
    /** Atomic logical xor: target = target != operand */
    MCA_BTL_ATOMIC_LXOR = 0x17,
    /** Atomic swap: target = operand */
    MCA_BTL_ATOMIC_SWAP = 0x1a,
    /** Atomic min */
    MCA_BTL_ATOMIC_MIN  = 0x20,
    /** Atomic max */
    MCA_BTL_ATOMIC_MAX  = 0x21,

    /** End marker: one past MCA_BTL_ATOMIC_MAX */
    MCA_BTL_ATOMIC_LAST,
} mca_btl_base_atomic_op_t;
364
/**
 * Callback invoked asynchronously when an operation completes.
 *
 * Completion semantics: the descriptor can be reused or handed back to
 * the BTL via mca_btl_base_module_free_fn_t. The operation has been
 * queued to the network device or will otherwise make asynchronous
 * progress without subsequent calls to btl_progress.
 *
 * @param[IN] module      the BTL module
 * @param[IN] endpoint    the BTL endpoint
 * @param[IN] descriptor  the BTL descriptor
 * @param[IN] status      status reported for the operation
 */
typedef void (*mca_btl_base_completion_fn_t)(struct mca_btl_base_module_t *module,
                                             struct mca_btl_base_endpoint_t *endpoint,
                                             struct mca_btl_base_descriptor_t *descriptor,
                                             int status);
382
383
/**
 * Callback invoked asynchronously when an rdma or atomic operation
 * completes.
 *
 * Completion semantics: the rdma or atomic memory operation has
 * completed remotely (i.e. it is remotely visible) and the caller is
 * free to deregister the local_handle or to modify the memory at
 * local_address.
 *
 * @param[IN] module         the BTL module
 * @param[IN] endpoint       the BTL endpoint
 * @param[IN] local_address  local address for the operation (if any)
 * @param[IN] local_handle   local handle associated with the local_address
 * @param[IN] context        callback context supplied to the rdma/atomic operation
 * @param[IN] cbdata         callback data supplied to the rdma/atomic operation
 * @param[IN] status         status of the operation
 */
typedef void (*mca_btl_base_rdma_completion_fn_t)(struct mca_btl_base_module_t *module,
                                                  struct mca_btl_base_endpoint_t *endpoint,
                                                  void *local_address,
                                                  struct mca_btl_base_registration_handle_t *local_handle,
                                                  void *context,
                                                  void *cbdata,
                                                  int status);
407
408
409 /**
410 * Describes a region/segment of memory that is addressable
411 * by an BTL.
412 *
413 * Note: In many cases the alloc and prepare methods of BTLs
414 * do not return a mca_btl_base_segment_t but instead return a
415 * subclass. Extreme care should be used when modifying
416 * BTL segments to prevent overwriting internal BTL data.
417 *
418 * All BTLs MUST use base segments when calling registered
419 * Callbacks.
420 *
421 * BTL MUST use mca_btl_base_segment_t or a subclass and
422 * MUST store their segment length in btl_seg_size. BTLs
423 * MUST specify a segment no larger than MCA_BTL_SEG_MAX_SIZE.
424 */
425
426 struct mca_btl_base_segment_t {
427 /** Address of the memory */
428 opal_ptr_t seg_addr;
429 /** Length in bytes */
430 uint64_t seg_len;
431 };
432 typedef struct mca_btl_base_segment_t mca_btl_base_segment_t;
433
434
/*
 * Byte-order conversion of a base segment (address and length) in place.
 *
 * MCA_BTL_BASE_SEGMENT_HTON(s) passes both fields of s through hton64(),
 * MCA_BTL_BASE_SEGMENT_NTOH(s) passes them through ntoh64(). Both expand
 * to nothing unless heterogeneous support is enabled on a host where
 * WORDS_BIGENDIAN is not defined.
 *
 * The converting variants are wrapped in do { } while (0) so that the two
 * assignments form ONE statement: the macro is then safe as the body of an
 * unbraced if/else or loop. Invocations must be terminated with a
 * semicolon. Note that the argument s is evaluated more than once, so it
 * must not have side effects.
 */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && !defined(WORDS_BIGENDIAN)
#define MCA_BTL_BASE_SEGMENT_HTON(s)                       \
    do {                                                   \
        (s).seg_addr.lval = hton64((s).seg_addr.lval);     \
        (s).seg_len = hton64((s).seg_len);                 \
    } while (0)
#define MCA_BTL_BASE_SEGMENT_NTOH(s)                       \
    do {                                                   \
        (s).seg_addr.lval = ntoh64((s).seg_addr.lval);     \
        (s).seg_len = ntoh64((s).seg_len);                 \
    } while (0)
#else
#define MCA_BTL_BASE_SEGMENT_HTON(s)
#define MCA_BTL_BASE_SEGMENT_NTOH(s)
#endif
446 /**
447 * A descriptor that holds the parameters to a send/put/get
448 * operation along w/ a callback routine that is called on
449 * completion of the request.
450 * Note: receive callbacks will store the incomming data segments in
451 * des_segments
452 */
453
454 struct mca_btl_base_descriptor_t {
455 opal_free_list_item_t super;
456 mca_btl_base_segment_t *des_segments; /**< local segments */
457 size_t des_segment_count; /**< number of local segments */
458 mca_btl_base_completion_fn_t des_cbfunc; /**< local callback function */
459 void* des_cbdata; /**< opaque callback data */
460 void* des_context; /**< more opaque callback data */
461 uint32_t des_flags; /**< hints to BTL */
462 /** order value, this is only
463 valid in the local completion callback
464 and may be used in subsequent calls to
465 btl_alloc, btl_prepare_src to request
466 a descriptor that will be ordered w.r.t.
467 this descriptor
468 */
469 uint8_t order;
470 };
471 typedef struct mca_btl_base_descriptor_t mca_btl_base_descriptor_t;
472
473 OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
474
/*
 * Descriptor flags (bit mask, see des_flags).
 */

#define MCA_BTL_DES_FLAGS_PRIORITY          0x01

/* The BTL is allowed to dispose of the descriptor once the associated
 * callback has been triggered.
 */
#define MCA_BTL_DES_FLAGS_BTL_OWNERSHIP     0x02

/* Legacy description: "Allow the BTL to avoid calling the descriptor
 * callback if the send succeeded in the btl_send (i.e. in the fast path)."
 * NOTE(review): the flag name suggests that SETTING it forces the callback
 * and that the skip described above applies when it is NOT set - confirm
 * against the BTL implementations.
 */
#define MCA_BTL_DES_SEND_ALWAYS_CALLBACK    0x04

/* Tells the PML that the copy is being done asynchronously.
 */
#define MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC   0x08

/* Type of transfer that will be done with this frag.
 */
#define MCA_BTL_DES_FLAGS_PUT               0x10
#define MCA_BTL_DES_FLAGS_GET               0x20

/* Asks the BTL to wake the remote process (send/sendi) or the local
 * process (put/get) to handle this message. The BTL may ignore this flag
 * if signaled operations are not supported.
 */
#define MCA_BTL_DES_FLAGS_SIGNAL            0x40
499
/**
 * Upper bound on the number of segments allowed in the src/dst fields
 * of a descriptor.
 */
#define MCA_BTL_DES_MAX_SEGMENTS      16

/**
 * Upper bound on the size of a BTL segment (NTH: does it really save
 * us anything to hardcode this?)
 */
#define MCA_BTL_SEG_MAX_SIZE          256

/**
 * Upper bound, in bytes, on the size of a BTL registration handle
 */
#define MCA_BTL_REG_HANDLE_MAX_SIZE   256
515
516 /*
517 * BTL base header, stores the tag at a minimum
518 */
519 struct mca_btl_base_header_t{
520 mca_btl_base_tag_t tag;
521 };
522 typedef struct mca_btl_base_header_t mca_btl_base_header_t;
523
524 #define MCA_BTL_BASE_HEADER_HTON(hdr)
525 #define MCA_BTL_BASE_HEADER_NTOH(hdr)
526
/*
 * BTL component interface functions and datatype.
 */

/**
 * MCA->BTL Initializes the BTL component and creates the specific BTL
 * module(s).
 *
 * While the component is being initialized, it should discover the
 * physical devices that are available for the given transport and
 * create one BTL module to represent each device. Any addressing
 * information that peers need in order to reach the device should be
 * published during this function via the modex_send() interface.
 *
 * @param num_btls (OUT) Receives the number of btl modules created, or
 * 0 if the transport is not available.
 *
 * @param enable_progress_threads (IN) Whether this component is
 * allowed to run a hidden/progress thread or not.
 *
 * @param enable_mpi_threads (IN) Whether support for multiple MPI
 * threads is enabled or not (i.e., MPI_THREAD_MULTIPLE), which
 * indicates whether multiple threads may invoke this component
 * simultaneously or not.
 *
 * @return Array of pointers to BTL modules, or NULL if the transport
 * is not available.
 */
typedef struct mca_btl_base_module_t **(*mca_btl_base_component_init_fn_t)(int *num_btls,
                                                                           bool enable_progress_threads,
                                                                           bool enable_mpi_threads);
562
/**
 * MCA->BTL Invoked to progress outstanding requests in non-threaded
 * polling environments.
 *
 * @return Count of "completions", i.e. a metric of how many items were
 * completed during the call to progress.
 */
typedef int (*mca_btl_base_component_progress_fn_t) (void);
573
574
575 /**
576 * Callback function that is called asynchronously on receipt
577 * of data by the transport layer.
578 * Note that the the mca_btl_base_descriptor_t is only valid within the
579 * completion function, this implies that all data payload in the
580 * mca_btl_base_descriptor_t must be copied out within this callback or
581 * forfeited back to the BTL.
582 * Note also that descriptor segments (des_segments) must be base
583 * segments for all callbacks.
584 *
585 * @param[IN] btl BTL module
586 * @param[IN] tag The active message receive callback tag value
587 * @param[IN] descriptor The BTL descriptor (contains the receive payload)
588 * @param[IN] cbdata Opaque callback data
589 */
590
591 typedef void (*mca_btl_base_module_recv_cb_fn_t)(
592 struct mca_btl_base_module_t* btl,
593 mca_btl_base_tag_t tag,
594 mca_btl_base_descriptor_t* descriptor,
595 void* cbdata
596 );
597
598 typedef struct mca_btl_active_message_callback_t {
599 mca_btl_base_module_recv_cb_fn_t cbfunc;
600 void* cbdata;
601 } mca_btl_active_message_callback_t;
602
603 OPAL_DECLSPEC extern
604 mca_btl_active_message_callback_t mca_btl_base_active_message_trigger[MCA_BTL_TAG_MAX];
605
606 /**
607 * BTL component descriptor. Contains component version information
608 * and component open/close/init functions.
609 */
610
611 struct mca_btl_base_component_3_0_0_t {
612 mca_base_component_t btl_version;
613 mca_base_component_data_t btl_data;
614 mca_btl_base_component_init_fn_t btl_init;
615 mca_btl_base_component_progress_fn_t btl_progress;
616 };
617 typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_3_0_0_t;
618 typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_t;
619
620 /* add the 2_0_0_t typedef for source compatibility
621 * we can do this safely because 2_0_0 components are the same as
622 * 3_0_0 components, the difference is in the btl module.
623 * Unfortunately 2_0_0 modules are not compatible with BTL 3_0_0 and
624 * can not be used with the new interface.
625 */
626 typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_2_0_0_t;
627
628
/*
 * BTL module interface functions and datatype.
 */

/**
 * MCA->BTL Releases any resources held by the BTL module before the
 * module is unloaded.
 *
 * The MCA framework calls the finalize method of a BTL module prior to
 * unloading it. Any resources held by the BTL should be released and,
 * if required, the memory corresponding to the BTL module freed.
 *
 * @param btl (IN) BTL module.
 * @return OPAL_SUCCESS or error status on failure.
 */
typedef int (*mca_btl_base_module_finalize_fn_t)(struct mca_btl_base_module_t *btl);
649
/**
 * BML->BTL notification of a change in the process list.
 *
 * The BML calls mca_btl_base_module_add_procs_fn_t() to determine the
 * set of BTLs that should be used to reach each process. Any addressing
 * information exported by the peer via the modex_send() function should
 * be available during this call via the corresponding modex_recv()
 * function. The BTL may use this information to determine the
 * reachability of each peer process.
 *
 * The caller may pass a "reachable" bitmap pointer. If it is not NULL,
 * then for each process that is reachable by the BTL the bit
 * corresponding to its index in the proc array (nprocs entries) should
 * be set in the reachable bitmask. The BTL returns an array of pointers
 * to a data structure defined by the BTL; these pointers are handed
 * back to the BTL on subsequent calls to the BTL data transfer
 * functions (e.g. btl_send). The BTL may use them to cache any
 * addressing or connection information (e.g. TCP socket, IB queue
 * pair).
 *
 * @param btl (IN)        BTL module
 * @param nprocs (IN)     Number of processes
 * @param procs (IN)      Array of processes
 * @param endpoints (OUT) Array of mca_btl_base_endpoint_t structures by BTL.
 * @param reachable (OUT) Bitmask indicating set of peer processes that are reachable by this BTL.
 * @return OPAL_SUCCESS or error status on failure.
 */
typedef int (*mca_btl_base_module_add_procs_fn_t)(struct mca_btl_base_module_t *btl,
                                                  size_t nprocs,
                                                  struct opal_proc_t **procs,
                                                  struct mca_btl_base_endpoint_t **endpoints,
                                                  struct opal_bitmap_t *reachable);
684
/**
 * Notification of a change to the process list.
 *
 * Whenever the process list changes, the BML notifies the BTL of the
 * change, which gives the BTL the opportunity to clean up or release
 * any resources associated with the peer.
 *
 * @param btl (IN)     BTL module
 * @param nprocs (IN)  Number of processes
 * @param procs (IN)   Set of processes
 * @param peer (IN)    Set of peer addressing information.
 * @return Status indicating if cleanup was successful
 */
typedef int (*mca_btl_base_module_del_procs_fn_t)(struct mca_btl_base_module_t *btl,
                                                  size_t nprocs,
                                                  struct opal_proc_t **procs,
                                                  struct mca_btl_base_endpoint_t **peer);
704
705 /**
706 * Register a callback function that is called on receipt
707 * of a fragment.
708 *
709 * @param[IN] btl BTL module
710 * @param[IN] tag tag value of this callback
711 * (specified on subsequent send operations)
712 * @param[IN] cbfunc The callback function
713 * @param[IN] cbdata Opaque callback data
714 *
715 * @return OPAL_SUCCESS The callback was registered successfully
716 * @return OPAL_ERROR The callback was NOT registered successfully
717 *
718 */
719 typedef int (*mca_btl_base_module_register_fn_t)(
720 struct mca_btl_base_module_t* btl,
721 mca_btl_base_tag_t tag,
722 mca_btl_base_module_recv_cb_fn_t cbfunc,
723 void* cbdata
724 );
725
726
/**
 * Callback invoked asynchronously when the transport layer reports an
 * error.
 *
 * @param[IN] btl      BTL module
 * @param[IN] flags    type of error
 * @param[IN] errproc  process that had an error
 * @param[IN] btlinfo  descriptive string from the BTL
 */
typedef void (*mca_btl_base_module_error_cb_fn_t)(struct mca_btl_base_module_t *btl,
                                                  int32_t flags,
                                                  struct opal_proc_t *errproc,
                                                  char *btlinfo);
743
744
745 /**
746 * Register a callback function that is called on receipt
747 * of an error.
748 *
749 * @param[IN] btl BTL module
750 * @param[IN] cbfunc The callback function
751 *
752 * @return OPAL_SUCCESS The callback was registered successfully
753 * @return OPAL_ERROR The callback was NOT registered successfully
754 *
755 */
756 typedef int (*mca_btl_base_module_register_error_fn_t)(
757 struct mca_btl_base_module_t* btl,
758 mca_btl_base_module_error_cb_fn_t cbfunc
759 );
760
761
762 /**
763 * Allocate a descriptor with a segment of the requested size.
764 * Note that the BTL layer may choose to return a smaller size
765 * if it cannot support the request. The order tag value ensures that
766 * operations on the descriptor that is allocated will be
767 * ordered w.r.t. a previous operation on a particular descriptor.
768 * Ordering is only guaranteed if the previous descriptor had its
769 * local completion callback function called and the order tag of
770 * that descriptor is only valid upon the local completion callback function.
771 *
772 *
773 * @param btl (IN) BTL module
774 * @param size (IN) Request segment size.
775 * @param order (IN) The ordering tag (may be MCA_BTL_NO_ORDER)
776 */
777
778 typedef mca_btl_base_descriptor_t* (*mca_btl_base_module_alloc_fn_t)(
779 struct mca_btl_base_module_t* btl,
780 struct mca_btl_base_endpoint_t* endpoint,
781 uint8_t order,
782 size_t size,
783 uint32_t flags
784 );
785
786 /**
787 * Return a descriptor allocated from this BTL via alloc/prepare.
788 * A descriptor can only be deallocated after its local completion
789 * callback function has called for all send/put/get operations.
790 *
791 * @param btl (IN) BTL module
792 * @param segment (IN) Descriptor allocated from the BTL
793 */
794 typedef int (*mca_btl_base_module_free_fn_t)(
795 struct mca_btl_base_module_t* btl,
796 mca_btl_base_descriptor_t* descriptor
797 );
798
799
/**
 * Prepares a descriptor for send using the supplied convertor.
 *
 * If the convertor references data that is contiguous, the descriptor
 * may simply point to the user buffer. Otherwise, this routine is
 * responsible for allocating buffer space and packing if required.
 *
 * The order tag value ensures that operations on the prepared
 * descriptor will be ordered w.r.t. a previous operation on a
 * particular descriptor. Ordering is only guaranteed if the previous
 * descriptor had its local completion callback function called; the
 * order tag of that descriptor is only valid upon the local completion
 * callback function.
 *
 * @param btl (IN)        BTL module
 * @param endpoint (IN)   BTL peer addressing
 * @param convertor (IN)  Data type convertor
 * @param order (IN)      The ordering tag (may be MCA_BTL_NO_ORDER)
 * @param reserve (IN)    Additional bytes requested by upper layer to precede user data
 * @param size (IN/OUT)   Number of bytes to prepare (IN),
 *                        number of bytes actually prepared (OUT)
 * @param flags (IN)      Flags for the descriptor (presumably
 *                        MCA_BTL_DES_FLAGS_* values - confirm)
 */
typedef struct mca_btl_base_descriptor_t *(*mca_btl_base_module_prepare_fn_t)(struct mca_btl_base_module_t *btl,
                                                                              struct mca_btl_base_endpoint_t *endpoint,
                                                                              struct opal_convertor_t *convertor,
                                                                              uint8_t order,
                                                                              size_t reserve,
                                                                              size_t *size,
                                                                              uint32_t flags);
832
/**
 * @brief Register a region of memory for use with put/get/atomic operations.
 *
 * @param btl      (IN) BTL module
 * @param endpoint (IN) BTL addressing information (or NULL for all endpoints)
 * @param base     (IN) Pointer to the start of the region
 * @param size     (IN) Size of the region
 * @param flags    (IN) Flags, including access permissions
 *
 * @returns a memory registration handle valid for both local and remote operations
 * @returns NULL if the region could not be registered
 *
 * The region is registered with the hardware so that it can be used with the
 * btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
 * functions. Avoid holding an excessive number of registrations, as they
 * may consume limited system/NIC resources.
 *
 * The caller takes ownership of the memory pointed to by the returned
 * (struct mca_btl_base_registration_handle_t *). The BTL module cannot free
 * or reuse the handle until it is handed back through the
 * mca_btl_base_module_deregister_mem_fn_t function.
 */
typedef struct mca_btl_base_registration_handle_t *(*mca_btl_base_module_register_mem_fn_t)(
    struct mca_btl_base_module_t *btl,
    struct mca_btl_base_endpoint_t *endpoint,
    void *base,
    size_t size,
    uint32_t flags);
858
/**
 * @brief Deregister a memory region.
 *
 * @param btl    (IN) BTL module the region was registered with
 * @param handle (IN) BTL registration handle to deregister
 *
 * The memory region associated with the given handle is deregistered. No
 * RDMA or atomic operation should be performed on the region once it has
 * been deregistered. It is erroneous to pass a memory handle associated
 * with a remote node.
 *
 * The handle must be a value previously returned by the
 * mca_btl_base_module_register_mem_fn_t function. Ownership of the memory
 * pointed to by handle passes back to the BTL module, which is then allowed
 * to free the memory, return it to a freelist, etc.
 */
typedef int (*mca_btl_base_module_deregister_mem_fn_t)(
    struct mca_btl_base_module_t *btl,
    struct mca_btl_base_registration_handle_t *handle);
877
878 /**
879 * Initiate an asynchronous send.
880 * Completion Semantics: the descriptor has been queued for a send operation
881 * the BTL now controls the descriptor until local
882 * completion callback is made on the descriptor
883 *
884 * All BTLs allow multiple concurrent asynchronous send operations on a descriptor
885 *
886 * @param btl (IN) BTL module
887 * @param endpoint (IN) BTL addressing information
888 * @param descriptor (IN) Description of the data to be transfered
889 * @param tag (IN) The tag value used to notify the peer.
890 *
891 * @retval OPAL_SUCCESS The descriptor was successfully queued for a send
892 * @retval OPAL_ERROR The descriptor was NOT successfully queued for a send
893 * @retval OPAL_ERR_UNREACH The endpoint is not reachable
894 */
895 typedef int (*mca_btl_base_module_send_fn_t)(
896 struct mca_btl_base_module_t* btl,
897 struct mca_btl_base_endpoint_t* endpoint,
898 struct mca_btl_base_descriptor_t* descriptor,
899 mca_btl_base_tag_t tag
900 );
901
902 /**
903 * Initiate an immediate blocking send.
904 * Completion Semantics: the BTL will make a best effort
905 * to send the header and "size" bytes from the datatype using the convertor.
906 * The header is guaranteed to be delivered entirely in the first segment.
907 * Should the BTL be unable to deliver the data due to resource constraints
908 * the BTL will return a descriptor (via the OUT param)
909 * of size "payload_size + header_size".
910 *
911 * @param btl (IN) BTL module
912 * @param endpoint (IN) BTL addressing information
913 * @param convertor (IN) Data type convertor
914 * @param header (IN) Pointer to header.
915 * @param header_size (IN) Size of header.
916 * @param payload_size (IN) Size of payload (from convertor).
917 * @param order (IN) The ordering tag (may be MCA_BTL_NO_ORDER)
918 * @param flags (IN) Flags.
919 * @param tag (IN) The tag value used to notify the peer.
920 * @param descriptor (OUT) The descriptor to be returned unable to be sent immediately
921 * (may be NULL).
922 *
923 * @retval OPAL_SUCCESS The send was successfully queued
924 * @retval OPAL_ERROR The send failed
925 * @retval OPAL_ERR_UNREACH The endpoint is not reachable
926 * @retval OPAL_ERR_RESOURCE_BUSY The BTL is busy a descriptor will be returned
927 * (via the OUT param) if descriptors are available
928 */
929
930 typedef int (*mca_btl_base_module_sendi_fn_t)(
931 struct mca_btl_base_module_t* btl,
932 struct mca_btl_base_endpoint_t* endpoint,
933 struct opal_convertor_t* convertor,
934 void* header,
935 size_t header_size,
936 size_t payload_size,
937 uint8_t order,
938 uint32_t flags,
939 mca_btl_base_tag_t tag,
940 mca_btl_base_descriptor_t** descriptor
941 );
942
943 /**
944 * Initiate an asynchronous put.
945 * Completion Semantics: if this function returns a 1 then the operation
946 * is complete. a return of OPAL_SUCCESS indicates
947 * the put operation has been queued with the
948 * network. the local_handle can not be deregistered
949 * until all outstanding operations on that handle
950 * have been completed.
951 *
952 * @param btl (IN) BTL module
953 * @param endpoint (IN) BTL addressing information
954 * @param local_address (IN) Local address to put from (registered)
955 * @param remote_address (IN) Remote address to put to (registered remotely)
956 * @param local_handle (IN) Registration handle for region containing
957 * (local_address, local_address + size)
958 * @param remote_handle (IN) Remote registration handle for region containing
959 * (remote_address, remote_address + size)
960 * @param size (IN) Number of bytes to put
961 * @param flags (IN) Flags for this put operation
962 * @param order (IN) Ordering
963 * @param cbfunc (IN) Function to call on completion (if queued)
964 * @param cbcontext (IN) Context for the callback
965 * @param cbdata (IN) Data for callback
966 *
967 * @retval OPAL_SUCCESS The descriptor was successfully queued for a put
968 * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
969 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
970 * operation. Try again later
971 * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
972 * alignment restrictions.
973 */
974 typedef int (*mca_btl_base_module_put_fn_t) (struct mca_btl_base_module_t *btl,
975 struct mca_btl_base_endpoint_t *endpoint, void *local_address,
976 uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
977 struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
978 int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
979
980 /**
981 * Initiate an asynchronous get.
982 * Completion Semantics: if this function returns a 1 then the operation
983 * is complete. a return of OPAL_SUCCESS indicates
984 * the get operation has been queued with the
985 * network. the local_handle can not be deregistered
986 * until all outstanding operations on that handle
987 * have been completed.
988 *
989 * @param btl (IN) BTL module
990 * @param endpoint (IN) BTL addressing information
991 * @param local_address (IN) Local address to put from (registered)
992 * @param remote_address (IN) Remote address to put to (registered remotely)
993 * @param local_handle (IN) Registration handle for region containing
994 * (local_address, local_address + size)
995 * @param remote_handle (IN) Remote registration handle for region containing
996 * (remote_address, remote_address + size)
997 * @param size (IN) Number of bytes to put
998 * @param flags (IN) Flags for this put operation
999 * @param order (IN) Ordering
1000 * @param cbfunc (IN) Function to call on completion (if queued)
1001 * @param cbcontext (IN) Context for the callback
1002 * @param cbdata (IN) Data for callback
1003 *
1004 * @retval OPAL_SUCCESS The descriptor was successfully queued for a put
1005 * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
1006 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
1007 * operation. Try again later
1008 * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
1009 * alignment restrictions.
1010 */
1011 typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl,
1012 struct mca_btl_base_endpoint_t *endpoint, void *local_address,
1013 uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
1014 struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
1015 int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
1016
1017 /**
1018 * Initiate an asynchronous atomic operation.
1019 * Completion Semantics: if this function returns a 1 then the operation
1020 * is complete. a return of OPAL_SUCCESS indicates
1021 * the atomic operation has been queued with the
1022 * network.
1023 *
1024 * @param btl (IN) BTL module
1025 * @param endpoint (IN) BTL addressing information
1026 * @param remote_address (IN) Remote address to put to (registered remotely)
1027 * @param remote_handle (IN) Remote registration handle for region containing
1028 * (remote_address, remote_address + 8)
1029 * @param op (IN) Operation to perform
1030 * @param operand (IN) Operand for the operation
1031 * @param flags (IN) Flags for this atomic operation
1032 * @param order (IN) Ordering
1033 * @param cbfunc (IN) Function to call on completion (if queued)
1034 * @param cbcontext (IN) Context for the callback
1035 * @param cbdata (IN) Data for callback
1036 *
1037 * @retval OPAL_SUCCESS The operation was successfully queued
1038 * @retval 1 The operation is complete
1039 * @retval OPAL_ERROR The operation was NOT successfully queued
1040 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
1041 * operation. Try again later
1042 * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
1043 * alignment restrictions or the operation {op} is not supported
1044 * by the hardware.
1045 *
1046 * After the operation is complete the remote address specified by {remote_address} and
1047 * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
1048 * The btl will guarantee consistency of atomic operations performed via the btl. Note,
1049 * however, that not all btls will provide consistency between btl atomic operations and
1050 * cpu or other btl atomics.
1051 */
1052 typedef int (*mca_btl_base_module_atomic_op64_fn_t) (struct mca_btl_base_module_t *btl,
1053 struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address,
1054 struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
1055 uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
1056 void *cbcontext, void *cbdata);
1057
1058 /**
1059 * Initiate an asynchronous fetching atomic operation.
1060 * Completion Semantics: if this function returns a 1 then the operation
1061 * is complete. a return of OPAL_SUCCESS indicates
1062 * the atomic operation has been queued with the
1063 * network.
1064 *
1065 * @param btl (IN) BTL module
1066 * @param endpoint (IN) BTL addressing information
1067 * @param local_address (OUT) Local address to store the result in
1068 * @param remote_address (IN) Remote address perfom operation on to (registered remotely)
1069 * @param local_handle (IN) Local registration handle for region containing
1070 * (local_address, local_address + 8)
1071 * @param remote_handle (IN) Remote registration handle for region containing
1072 * (remote_address, remote_address + 8)
1073 * @param op (IN) Operation to perform
1074 * @param operand (IN) Operand for the operation
1075 * @param flags (IN) Flags for this atomic operation
1076 * @param order (IN) Ordering
1077 * @param cbfunc (IN) Function to call on completion (if queued)
1078 * @param cbcontext (IN) Context for the callback
1079 * @param cbdata (IN) Data for callback
1080 *
1081 * @retval OPAL_SUCCESS The operation was successfully queued
1082 * @retval 1 The operation is complete
1083 * @retval OPAL_ERROR The operation was NOT successfully queued
1084 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
1085 * operation. Try again later
1086 * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
1087 * alignment restrictions or the operation {op} is not supported
1088 * by the hardware.
1089 *
1090 * After the operation is complete the remote address specified by {remote_address} and
1091 * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
1092 * {local_address} will be updated with the previous value stored in {remote_address}.
1093 * The btl will guarantee consistency of atomic operations performed via the btl. Note,
1094 * however, that not all btls will provide consistency between btl atomic operations and
1095 * cpu or other btl atomics.
1096 */
1097 typedef int (*mca_btl_base_module_atomic_fop64_fn_t) (struct mca_btl_base_module_t *btl,
1098 struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
1099 struct mca_btl_base_registration_handle_t *local_handle,
1100 struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
1101 uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
1102 void *cbcontext, void *cbdata);
1103
1104 /**
1105 * Initiate an asynchronous compare and swap operation.
1106 * Completion Semantics: if this function returns a 1 then the operation
1107 * is complete. a return of OPAL_SUCCESS indicates
1108 * the atomic operation has been queued with the
1109 * network.
1110 *
1111 * @param btl (IN) BTL module
1112 * @param endpoint (IN) BTL addressing information
1113 * @param local_address (OUT) Local address to store the result in
1114 * @param remote_address (IN) Remote address perfom operation on to (registered remotely)
1115 * @param local_handle (IN) Local registration handle for region containing
1116 * (local_address, local_address + 8)
1117 * @param remote_handle (IN) Remote registration handle for region containing
1118 * (remote_address, remote_address + 8)
1119 * @param compare (IN) Operand for the operation
1120 * @param value (IN) Value to store on success
1121 * @param flags (IN) Flags for this atomic operation
1122 * @param order (IN) Ordering
1123 * @param cbfunc (IN) Function to call on completion (if queued)
1124 * @param cbcontext (IN) Context for the callback
1125 * @param cbdata (IN) Data for callback
1126 *
1127 * @retval OPAL_SUCCESS The operation was successfully queued
1128 * @retval 1 The operation is complete
1129 * @retval OPAL_ERROR The operation was NOT successfully queued
1130 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
1131 * operation. Try again later
1132 * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
1133 * alignment restrictions or the operation {op} is not supported
1134 * by the hardware.
1135 *
1136 * After the operation is complete the remote address specified by {remote_address} and
1137 * {remote_handle} will be updated with {value} if *remote_address == compare.
1138 * {local_address} will be updated with the previous value stored in {remote_address}.
1139 * The btl will guarantee consistency of atomic operations performed via the btl. Note,
1140 * however, that not all btls will provide consistency between btl atomic operations and
1141 * cpu atomics.
1142 */
1143 typedef int (*mca_btl_base_module_atomic_cswap64_fn_t) (struct mca_btl_base_module_t *btl,
1144 struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
1145 struct mca_btl_base_registration_handle_t *local_handle,
1146 struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare,
1147 uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
1148 void *cbcontext, void *cbdata);
1149
/**
 * Dump btl state for diagnostic purposes.
 *
 * @param btl      (IN) BTL module
 * @param endpoint (IN) BTL endpoint
 * @param verbose  (IN) Verbosity level
 */
typedef void (*mca_btl_base_module_dump_fn_t)(struct mca_btl_base_module_t *btl,
                                              struct mca_btl_base_endpoint_t *endpoint,
                                              int verbose);
1163
/**
 * Fault tolerance event notification function.
 *
 * @param state Checkpoint status
 *
 * @return OPAL_SUCCESS or a failure status
 */
typedef int (*mca_btl_base_module_ft_event_fn_t)(
    int state);
1170
/**
 * Flush every outstanding RDMA operation on one endpoint, or on all endpoints.
 *
 * @param btl      (IN) BTL module
 * @param endpoint (IN) Endpoint to flush (NULL == all)
 *
 * The call returns once all outstanding RDMA (put, get, atomic) operations
 * that were started before the flush call have completed. It does NOT
 * guarantee that all BTL callbacks have been completed.
 *
 * The BTL is allowed to ignore the endpoint parameter and flush *all* endpoints.
 */
typedef int (*mca_btl_base_module_flush_fn_t)(
    struct mca_btl_base_module_t *btl,
    struct mca_btl_base_endpoint_t *endpoint);
1184
1185 /**
1186 * BTL module interface functions and attributes.
1187 */
1188 struct mca_btl_base_module_t {
1189
1190 /* BTL common attributes */
1191 mca_btl_base_component_t* btl_component; /**< pointer back to the BTL component structure */
1192 size_t btl_eager_limit; /**< maximum size of first fragment -- eager send */
1193 size_t btl_rndv_eager_limit; /**< the size of a data sent in a first fragment of rendezvous protocol */
1194 size_t btl_max_send_size; /**< maximum send fragment size supported by the BTL */
1195 size_t btl_rdma_pipeline_send_length; /**< amount of bytes that should be send by pipeline protocol */
1196 size_t btl_rdma_pipeline_frag_size; /**< maximum rdma fragment size supported by the BTL */
1197 size_t btl_min_rdma_pipeline_size; /**< minimum packet size for pipeline protocol */
1198 uint32_t btl_exclusivity; /**< indicates this BTL should be used exclusively */
1199 uint32_t btl_latency; /**< relative ranking of latency used to prioritize btls */
1200 uint32_t btl_bandwidth; /**< bandwidth (Mbytes/sec) supported by each endpoint */
1201 uint32_t btl_flags; /**< flags (put/get...) */
1202 uint32_t btl_atomic_flags; /**< atomic operations supported (add, and, xor, etc) */
1203 size_t btl_registration_handle_size; /**< size of the BTLs registration handles */
1204
1205 /* One-sided limitations (0 for no alignment, SIZE_MAX for no limit ) */
1206 size_t btl_get_limit; /**< maximum size supported by the btl_get function */
1207 size_t btl_get_alignment; /**< minimum alignment/size needed by btl_get (power of 2) */
1208 size_t btl_put_limit; /**< maximum size supported by the btl_put function */
1209 size_t btl_put_alignment; /**< minimum alignment/size needed by btl_put (power of 2) */
1210
1211 /* minimum transaction sizes for which registration is required for local memory */
1212 size_t btl_get_local_registration_threshold;
1213 size_t btl_put_local_registration_threshold;
1214
1215 /* BTL function table */
1216 mca_btl_base_module_add_procs_fn_t btl_add_procs;
1217 mca_btl_base_module_del_procs_fn_t btl_del_procs;
1218 mca_btl_base_module_register_fn_t btl_register;
1219 mca_btl_base_module_finalize_fn_t btl_finalize;
1220
1221 mca_btl_base_module_alloc_fn_t btl_alloc;
1222 mca_btl_base_module_free_fn_t btl_free;
1223 mca_btl_base_module_prepare_fn_t btl_prepare_src;
1224 mca_btl_base_module_send_fn_t btl_send;
1225 mca_btl_base_module_sendi_fn_t btl_sendi;
1226 mca_btl_base_module_put_fn_t btl_put;
1227 mca_btl_base_module_get_fn_t btl_get;
1228 mca_btl_base_module_dump_fn_t btl_dump;
1229
1230 /* atomic operations */
1231 mca_btl_base_module_atomic_op64_fn_t btl_atomic_op;
1232 mca_btl_base_module_atomic_fop64_fn_t btl_atomic_fop;
1233 mca_btl_base_module_atomic_cswap64_fn_t btl_atomic_cswap;
1234
1235 /* new memory registration functions */
1236 mca_btl_base_module_register_mem_fn_t btl_register_mem; /**< memory registration function (NULL if not needed) */
1237 mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem; /**< memory deregistration function (NULL if not needed) */
1238
1239 /** the mpool associated with this btl (optional) */
1240 mca_mpool_base_module_t* btl_mpool;
1241 /** register a default error handler */
1242 mca_btl_base_module_register_error_fn_t btl_register_error;
1243 /** fault tolerant even notification */
1244 mca_btl_base_module_ft_event_fn_t btl_ft_event;
1245 #if OPAL_CUDA_GDR_SUPPORT
1246 size_t btl_cuda_eager_limit; /**< switch from eager to RDMA */
1247 size_t btl_cuda_rdma_limit; /**< switch from RDMA to rndv pipeline */
1248 #endif /* OPAL_CUDA_GDR_SUPPORT */
1249 #if OPAL_CUDA_SUPPORT
1250 size_t btl_cuda_max_send_size; /**< set if CUDA max send_size is different from host max send size */
1251 #endif /* OPAL_CUDA_SUPPORT */
1252
1253 mca_btl_base_module_flush_fn_t btl_flush; /**< flush all previous operations on an endpoint */
1254
1255 unsigned char padding[256]; /**< padding to future-proof the btl module */
1256 };
1257 typedef struct mca_btl_base_module_t mca_btl_base_module_t;
1258
/*
 * Version macro for modules of type btl v3.1.0. Expands to the
 * OPAL_MCA_BASE_VERSION_2_1_0 invocation for the "btl" framework with
 * version numbers 3, 1, 0.
 */
#define MCA_BTL_BASE_VERSION_3_1_0 OPAL_MCA_BASE_VERSION_2_1_0("btl", 3, 1, 0)
1264
/*
 * Initializer fragment for a BTL component's version fields: the btl v3.1.0
 * base version, a designated initializer setting .mca_component_name to
 * `name`, and the component version built from the OPAL major/minor/release
 * version macros.
 *
 * `name` is intentionally substituted without parentheses so the argument
 * reaches the designated initializer exactly as the caller wrote it.
 */
#define MCA_BTL_DEFAULT_VERSION(name)                     \
    MCA_BTL_BASE_VERSION_3_1_0,                           \
    .mca_component_name = name,                           \
    MCA_BASE_MAKE_VERSION(component,                      \
                          OPAL_MAJOR_VERSION,             \
                          OPAL_MINOR_VERSION,             \
                          OPAL_RELEASE_VERSION)
1270
/**
 * Convenience macro for detecting the BTL interface version at compile
 * time (310 here, alongside the v3.1.0 version macro in this header).
 */
#define BTL_VERSION 310
1275
1276 END_C_DECLS
1277
1278 #endif /* OPAL_MCA_BTL_H */