/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2006 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2012      Sandia National Laboratories.  All rights reserved.
 * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2017      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
15
/**
 * @file
 *
 * Matching Transport Layer
 *
 * The Matching Transport Layer (MTL) provides device-layer support
 * for transfer of MPI point-to-point messages over devices that
 * support hardware / library message matching.  This layer is used
 * with the MTL PML component to provide lowest latency and highest
 * bandwidth on given architectures.  Features found in other PML
 * interfaces, such as message fragmenting, multi-device support, and
 * NIC failover are not provided by the upper layers.
 *
 * In general, this interface should not be used for transport layer
 * support.  Instead, the BTL interface should be used.  The BTL
 * interface allows for multiplexing between multiple users
 * (point-to-point, one-sided, etc.) and provides many features not
 * found in this interface (RDMA from arbitrary buffers, active
 * messaging, reasonable pinned memory caching, etc.)
 */
36
37 #ifndef OMPI_MTL_H
38 #define OMPI_MTL_H
39
40 #include "ompi_config.h"
41 #include "mpi.h" /* needed for MPI_ANY_TAG */
42 #include "ompi/mca/mca.h"
43 #include "ompi/mca/pml/pml_constants.h" /* for send_mode enum */
44 #include "ompi/request/request.h"
45
46 BEGIN_C_DECLS
47
/*
 * Forward declarations for every structure that this interface only
 * handles through pointers.  Declaring the tags here at file scope keeps
 * the prototypes below from introducing prototype-scoped struct types
 * when an including file has not already made the tags visible.
 */
struct ompi_communicator_t;
struct ompi_message_t;
struct ompi_proc_t;
struct ompi_request_t;
struct ompi_status_public_t;
struct opal_convertor_t;

struct mca_mtl_base_module_t;
52
/**
 * MTL-level request header.
 *
 * Ties a request handed to the MTL to its ompi_request_t and carries a
 * callback that receives the MTL request itself as its only argument.
 */
struct mca_mtl_request_t {
    /** pointer to associated ompi_request_t */
    struct ompi_request_t *ompi_req;
    /** callback taking this request as argument; presumably invoked when
     *  the underlying operation completes -- TODO confirm against callers */
    void (*completion_callback)(struct mca_mtl_request_t *mtl_request);
};
typedef struct mca_mtl_request_t mca_mtl_request_t;
59
60
/**
 * MTL module flags
 *
 * Single-bit values; presumably OR'ed into the module's flags word
 * (TODO confirm).  The CUDA flag only exists when the build enables
 * OPAL_CUDA_SUPPORT.
 */
#define MCA_MTL_BASE_FLAG_REQUIRE_WORLD 0x00000001
#if OPAL_CUDA_SUPPORT
#define MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE 0x00000002
#endif
68
/**
 * Initialization routine for MTL component
 *
 * This function should allocate resources for communication and try to
 * do all local setup.  It should not attempt to contact its peers, as
 * that should be done at add_procs time.  Contact information should
 * be published during this initialization function.  It will be made
 * available during add_procs().
 *
 * @param enable_progress_threads (IN) Progress threads have been
 *                       enabled by the user and the component must be
 *                       capable of making asynchronous progress (either
 *                       with its own thread, with the kernel, or with
 *                       the event library).
 * @param enable_mpi_threads (IN) MPI threads have been enabled by the
 *                       user and the component must be capable of coping
 *                       with threads.
 *
 * @retval NULL     component can not operate on the current machine
 * @retval non-NULL pointer to the MTL module to use
 *
 * \note This comment used to describe a third, (OUT) argument named
 * enable_mpi_thread_multiple through which a component reported
 * MPI_THREAD_MULTIPLE support.  The signature has no such argument, so
 * that description has been dropped.
 */
typedef struct mca_mtl_base_module_t *
(*mca_mtl_base_component_init_fn_t)(bool enable_progress_threads,
                                    bool enable_mpi_threads);
102
103
104 struct mca_mtl_base_component_2_0_0_t {
105 mca_base_component_t mtl_version;
106 mca_base_component_data_t mtl_data;
107 mca_mtl_base_component_init_fn_t mtl_init;
108 };
109 typedef struct mca_mtl_base_component_2_0_0_t mca_mtl_base_component_2_0_0_t;
110 typedef struct mca_mtl_base_component_2_0_0_t mca_mtl_base_component_t;
111
112
/**
 * MCA->MTL Clean up any resources held by MTL module
 *
 * Opposite of module_init.  Called when communication will no longer
 * be necessary.  Usually this is during MPI_FINALIZE, but it can be
 * earlier if the component was not selected to run.  Assuming
 * module_init was called, finalize will always be called before the
 * component_close function is called.
 *
 * @param mtl (IN)   MTL module returned from call to initialize
 *
 * @retval OMPI_SUCCESS cleanup finished successfully
 * @retval other        failure during cleanup
 */
typedef int (*mca_mtl_base_module_finalize_fn_t)(struct mca_mtl_base_module_t *mtl);
129
130
/**
 * PML->MTL notification of change in the process list.
 *
 * The mca_mtl_base_module_add_procs_fn_t() is used by the PML to
 * notify the MTL that new processes are connected to the current
 * process.  Any addressing information exported by the peer via the
 * ompi_modex_send() function should be available during this
 * call via the corresponding ompi_modex_recv() function.  The
 * MTL may utilize this information to determine reachability of each
 * peer process.
 *
 * It is an error for a proc to not be reachable by the given MTL, and
 * an error should be returned if that case is detected.  If an MTL
 * requires per-endpoint data, it must handle storage, either using a
 * static endpoint tag (MTL is the default tag that should generally
 * be used) or a dynamic endpoint tag (although it should be noted
 * that OMPI can be built without dynamic endpoint tag support).
 *
 * @param mtl (IN)      MTL module
 * @param nprocs (IN)   Number of processes
 * @param procs (IN)    Set of processes
 *
 * @retval OMPI_SUCCESS successfully connected to processes
 * @retval other        failure during setup
 */
typedef int (*mca_mtl_base_module_add_procs_fn_t)(
    struct mca_mtl_base_module_t *mtl,
    size_t nprocs,
    struct ompi_proc_t **procs);
160
161
/**
 * Notification of change to the process list.
 *
 * When the process list changes, the PML notifies the MTL of the
 * change, to provide the opportunity to clean up or release any
 * resources associated with the peer.  The MTL is responsible for
 * releasing any memory associated with the endpoint data it may have
 * stored during add_procs().
 *
 * @param mtl (IN)     MTL module
 * @param nprocs (IN)  Number of processes
 * @param procs (IN)   Set of processes
 *
 * @return             Status indicating if cleanup was successful
 */
typedef int (*mca_mtl_base_module_del_procs_fn_t)(
    struct mca_mtl_base_module_t *mtl,
    size_t nprocs,
    struct ompi_proc_t **procs);
182
183
184 /**
185 * Blocking send to peer
186 *
187 * Blocking send (Call should not return until the user buffer may be
188 * used again). Standard MPI semantics must be met by this call, as
189 * mandated in the mode argument. There is one special mode argument,
190 * MCA_PML_BASE_SEND_COMPLETE, which requires local completion before
191 * the function can return. This is an optimization for coillective
192 * routines that can otherwise lead to degenerate performance for
193 * broadcast-based collectives.
194 *
195 * @param comm (IN) Communicator used for operation
196 * @param dest (IN) Destination rank for send (relative to comm)
197 * @param tag (IN) MPI tag used for sending. See note below.
198 * @param convertor (IN) Datatype convertor describing send datatype.
199 * Already prepared for send.
200 * @param mode (IN) Mode for send operation
201 *
202 * @return OMPI_SUCCESS or error value
203 *
204 * \note Open MPI is built around non-blocking operations. This
205 * function is provided for networks where progressing events outside
206 * of point-to-point (for example, collectives, I/O, one-sided) can
207 * occur without a progress function regularily being triggered.
208 *
209 * \note While MPI does not allow users to specify negative tags, they
210 * are used internally in Open MPI to provide a unique channel for
211 * collective operations. Therefore, the MTL can *not* cause an error
212 * if a negative tag is used.
213 */
214 typedef int (*mca_mtl_base_module_send_fn_t)(
215 struct mca_mtl_base_module_t* mtl,
216 struct ompi_communicator_t *comm,
217 int dest,
218 int tag,
219 struct opal_convertor_t *convertor,
220 mca_pml_base_send_mode_t mode);
221
222
223 /**
224 * Non-blocking send to peer
225 *
226 * Non-blocking send to peer. Standard MPI semantics must be met by
227 * this call, as mandated in the mode argument. There is one special
228 * mode argument, MCA_PML_BASE_SEND_COMPLETE, which requires local
229 * completion before the request is marked as complete.
230 *
231 * The PML will handle creation of the request, leaving the number of
232 * bytes requested in the module structure available for the MTL
233 * directly after the ompi_request_t structure. The PML will handle
234 * proper destruction of the request once it can safely be destructed
235 * (it has been completed and freeed by a call to REQUEST_FReE or
236 * TEST/WAIT). The MTL should remove all resources associated with
237 * the request when it is marked as completed.
238 *
239 * @param comm (IN) Communicator used for operation
240 * @param dest (IN) Destination rank for send (relative to comm)
241 * @param tag (IN) MPI tag used for sending. See note below.
242 * @param convertor (IN) Datatype convertor describing send datatype.
243 * Already prepared for send.
244 * @param mode (IN) Mode for send operation (see pml.h)
245 * @param blocking (IN) True if the call originated from a blocking
246 * call, but the PML decided to use a
247 * non-blocking operation, likely for
248 * internal performance decisions This is an
249 * optimization flag and is not needed for
250 * correctness.
251 * @param mtl_request (IN) Pointer to mtl_request. The ompi_req field
252 * will be populated with an initialized
253 * ompi_request_t before calling.
254 *
255 * @return OMPI_SUCCESS or error value
256 *
257 * \note While MPI does not allow users to specify negative tags, they
258 * are used internally in Open MPI to provide a unique channel for
259 * collective operations. Therefore, the MTL can *not* cause an error
260 * if a negative tag is used.
261 */
262 typedef int (*mca_mtl_base_module_isend_fn_t)(
263 struct mca_mtl_base_module_t* mtl,
264 struct ompi_communicator_t *comm,
265 int dest,
266 int tag,
267 struct opal_convertor_t *convertor,
268 mca_pml_base_send_mode_t mode,
269 bool blocking,
270 mca_mtl_request_t *mtl_request);
271
272
/**
 * Non-blocking receive
 *
 * Non-blocking receive function.  Standard MPI semantics for
 * MPI_Irecv must be implemented by this call.
 *
 * The PML will handle creation of the request, leaving the number of
 * bytes requested in the module structure available for the MTL,
 * directly after the ompi_request_t structure.  The PML will handle
 * proper destruction of the request once it can safely be destroyed
 * (it has been completed and freed by a call to REQUEST_FREE or
 * TEST/WAIT).  The MTL should remove all resources associated with
 * the request when it is marked as completed.
 *
 * @param mtl (IN)         MTL module
 * @param comm (IN)        Communicator used for operation
 * @param src (IN)         Source rank for receive (relative to comm)
 * @param tag (IN)         MPI tag used for matching.  See note below.
 * @param convertor (IN)   Datatype convertor describing receive datatype.
 *                         Already prepared for receive.
 * @param mtl_request (IN) Pointer to mtl_request.  The ompi_req field
 *                         will be populated with an initialized
 *                         ompi_request_t before calling.
 *
 * @return                 OMPI_SUCCESS or error value
 *
 * \note While MPI does not allow users to specify negative tags, they
 * are used internally in Open MPI to provide a unique channel for
 * collective operations.  Therefore, the MTL can *not* cause an error
 * if a negative tag is used.  Further, MPI_ANY_TAG should *not* match
 * against negative tags.
 */
typedef int (*mca_mtl_base_module_irecv_fn_t)(
    struct mca_mtl_base_module_t *mtl,
    struct ompi_communicator_t *comm,
    int src,
    int tag,
    struct opal_convertor_t *convertor,
    struct mca_mtl_request_t *mtl_request);
311
312
/**
 * Non-blocking probe
 *
 * Non-blocking probe function.  Standard MPI semantics for MPI_IPROBE
 * must be implemented by this call.
 *
 * @param mtl (IN)     MTL module
 * @param comm (IN)    Communicator used for operation
 * @param src (IN)     Source rank to probe for (relative to comm)
 * @param tag (IN)     MPI tag to probe for.  See note below.
 * @param flag (OUT)   true if message available, false otherwise
 * @param status (OUT) Status structure for information on
 *                     available message
 *
 * @return             Integer status code; presumably OMPI_SUCCESS or an
 *                     error value like the other entry points (TODO confirm)
 *
 * \note While MPI does not allow users to specify negative tags, they
 * are used internally in Open MPI to provide a unique channel for
 * collective operations.  Therefore, the MTL can *not* cause an error
 * if a negative tag is used.  Further, MPI_ANY_TAG should *not* match
 * against negative tags.
 */
typedef int (*mca_mtl_base_module_iprobe_fn_t)(
    struct mca_mtl_base_module_t *mtl,
    struct ompi_communicator_t *comm,
    int src,
    int tag,
    int *flag,
    struct ompi_status_public_t *status);
339
340
/**
 * Non-blocking receive of an already matched message
 *
 * NOTE(review): judging by the name and arguments this is the MTL
 * counterpart of MPI_Imrecv -- confirm against the MTL components.
 *
 * @param mtl          MTL module
 * @param convertor    Datatype convertor; presumably describes the
 *                     receive datatype and is already prepared for
 *                     receive, as for irecv (TODO confirm)
 * @param message      Address of the message handle to receive
 * @param mtl_request  Pointer to mtl_request
 *
 * @return             Integer status code
 */
typedef int (*mca_mtl_base_module_imrecv_fn_t)(
    struct mca_mtl_base_module_t *mtl,
    struct opal_convertor_t *convertor,
    struct ompi_message_t **message,
    struct mca_mtl_request_t *mtl_request);
345
/**
 * Non-blocking matching probe
 *
 * NOTE(review): judging by the name and arguments this is the MTL
 * counterpart of MPI_Improbe -- confirm against the MTL components.
 *
 * @param mtl      MTL module
 * @param comm     Communicator used for operation
 * @param src      Source rank to probe for (presumably relative to comm)
 * @param tag      MPI tag to probe for
 * @param matched  Presumably set to non-zero when a message was matched
 *                 (TODO confirm)
 * @param message  Address receiving the handle of the matched message
 * @param status   Status structure for information on the message
 *
 * @return         Integer status code
 */
typedef int (*mca_mtl_base_module_improbe_fn_t)(
    struct mca_mtl_base_module_t *mtl,
    struct ompi_communicator_t *comm,
    int src,
    int tag,
    int *matched,
    struct ompi_message_t **message,
    struct ompi_status_public_t *status);
353
/**
 * Cancel an existing request
 *
 * Attempt to cancel an existing request.  The (poorly defined)
 * semantics for MPI_CANCEL must be implemented by this call.  This,
 * of course, allows the MTL module to do nothing at all.
 * Implementations of the MTL should make a good faith effort to
 * cancel receive requests that have not been started, as the "post a
 * receive for control messages" paradigm is a common one in loosely
 * coupled MPI applications.
 *
 * @param mtl (IN)         MTL module
 * @param mtl_request (IN) Request that should be cancelled
 * @param flag             Unknown exactly what this does.
 *
 * @return                 Integer status code
 */
typedef int (*mca_mtl_base_module_cancel_fn_t)(
    struct mca_mtl_base_module_t *mtl,
    struct mca_mtl_request_t *mtl_request,
    int flag);
373
374
/**
 * Downcall from PML layer when a new communicator is created.
 *
 * Provides the MTL the opportunity to initialize/cache a data structure
 * on the communicator.
 *
 * @param mtl   MTL module
 * @param comm  Communicator
 *
 * @return      OMPI_SUCCESS or failure status.
 */
typedef int (*mca_mtl_base_module_add_comm_fn_t)(
    struct mca_mtl_base_module_t *mtl,
    struct ompi_communicator_t *comm);
387
388
/**
 * Downcall from PML layer when a communicator is destroyed.
 *
 * Provides the MTL the opportunity to clean up any data structures
 * associated with the communicator.
 *
 * @param mtl   MTL module
 * @param comm  Communicator
 *
 * @return      OMPI_SUCCESS or failure status.
 */
typedef int (*mca_mtl_base_module_del_comm_fn_t)(
    struct mca_mtl_base_module_t *mtl,
    struct ompi_communicator_t *comm);
401
402
403 /**
404 * MTL module interface functions and attributes.
405 */
406 struct mca_mtl_base_module_t {
407 int mtl_max_contextid; /**< maximum allowable contextid */
408 int mtl_max_tag; /**< maximum tag value. note that negative tags must be allowed */
409 size_t mtl_request_size; /**< number of bytes to reserve with request structure */
410
411 uint32_t mtl_flags; /**< flags (put/get...) */
412
413 /* MTL function table */
414 mca_mtl_base_module_add_procs_fn_t mtl_add_procs;
415 mca_mtl_base_module_del_procs_fn_t mtl_del_procs;
416 mca_mtl_base_module_finalize_fn_t mtl_finalize;
417
418 mca_mtl_base_module_send_fn_t mtl_send;
419 mca_mtl_base_module_isend_fn_t mtl_isend;
420 mca_mtl_base_module_irecv_fn_t mtl_irecv;
421 mca_mtl_base_module_iprobe_fn_t mtl_iprobe;
422 mca_mtl_base_module_imrecv_fn_t mtl_imrecv;
423 mca_mtl_base_module_improbe_fn_t mtl_improbe;
424
425 /* Optional MTL functions */
426 mca_mtl_base_module_cancel_fn_t mtl_cancel;
427 mca_mtl_base_module_add_comm_fn_t mtl_add_comm;
428 mca_mtl_base_module_del_comm_fn_t mtl_del_comm;
429 };
430 typedef struct mca_mtl_base_module_t mca_mtl_base_module_t;
431
/*
 * Macro for use in modules that are of type mtl.  Expands to the
 * OMPI MCA base version macro for framework "mtl", version 2.0.0.
 */
#define MCA_MTL_BASE_VERSION_2_0_0 \
    OMPI_MCA_BASE_VERSION_2_1_0("mtl", 2, 0, 0)
437
438 OMPI_DECLSPEC extern mca_mtl_base_module_t *ompi_mtl;
439
/*
 * Macro for doing direct call / call through struct.
 *
 * When MCA_ompi_mtl_DIRECT_CALL is non-zero, OMPI_MTL_CALL(a) pastes
 * together the symbol ompi_mtl_<component>_<a>, where <component> is the
 * expansion of MCA_ompi_mtl_DIRECT_CALL_COMPONENT, and the header named by
 * MCA_ompi_mtl_DIRECT_CALL_HEADER is included.  The extra EXPANDER level
 * forces the component macro to be expanded before token pasting.
 * Otherwise OMPI_MTL_CALL(a) selects member mtl_<a> of *ompi_mtl.
 */
#if MCA_ompi_mtl_DIRECT_CALL

#define OMPI_MTL_CALL_STAMP(a, b) ompi_mtl_ ## a ## _ ## b
#define OMPI_MTL_CALL_EXPANDER(a, b) OMPI_MTL_CALL_STAMP(a, b)
#define OMPI_MTL_CALL(a) OMPI_MTL_CALL_EXPANDER(MCA_ompi_mtl_DIRECT_CALL_COMPONENT, a)

#include MCA_ompi_mtl_DIRECT_CALL_HEADER

#else
#define OMPI_MTL_CALL(a) ompi_mtl->mtl_ ## a
#endif
455
456
457 END_C_DECLS
458 #endif