1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2009 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2006 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
14 * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
15 * reserved.
16 * $COPYRIGHT$
17 *
18 * Additional copyrights may follow
19 *
20 * $HEADER$
21 */
22 /**
23 * @file
24 *
 * BTL Management Layer (BML)
26 *
27 */
28
29 #ifndef MCA_BML_H
30 #define MCA_BML_H
31
32 #include "ompi_config.h"
33 #include "ompi/mca/mca.h"
34 #include "opal/datatype/opal_convertor.h"
35 #include "opal/mca/crs/crs.h"
36 #include "opal/mca/crs/base/base.h"
37 #include "opal/mca/btl/btl.h"
38
39 #include "ompi/mca/bml/base/bml_base_btl.h"
40 #include "ompi/types.h"
41
42 #include "ompi/constants.h"
43
/*
 * Compile-time switch for the reliability-debugging variant of
 * mca_bml_base_send().  With the value 0 the inline implementation
 * defined later in this header is used; a non-zero value would select
 * the out-of-line prototype instead.
 */
#define OPAL_ENABLE_DEBUG_RELIABILITY 0
45
46 /*
47 * BML types
48 */
49
50 struct ompi_proc_t;
51 struct mca_bml_base_module_t;
52 struct mca_bml_base_endpoint_t;
53 struct mca_mpool_base_resources_t;
54
/*
 * Per-BTL information cached by the BML: the BTL module itself, the
 * endpoint used to address the peer through it, and the flags/weight
 * consulted when choosing between BTLs.
 */
typedef struct mca_bml_base_btl_t {
    uint32_t                        btl_flags;     /**< support for put/get? */
    float                           btl_weight;    /**< BTL weight for scheduling */
    struct mca_btl_base_module_t   *btl;           /**< BTL module */
    struct mca_btl_base_endpoint_t *btl_endpoint;  /**< BTL addressing info */
} mca_bml_base_btl_t;
66
67
68
69 /**
70 * A dynamically growable array of mca_bml_base_btl_t instances.
71 * Maintains an index into the array that is used for round-robin
72 * scheduling across contents.
73 */
74 struct mca_bml_base_btl_array_t {
75 opal_object_t super;
76 size_t arr_size; /**< number available */
77 size_t arr_reserve; /**< size of allocated btl_proc array */
78 size_t arr_index; /**< last used index*/
79 mca_bml_base_btl_t* bml_btls; /**< array of bml btl's */
80 };
81 typedef struct mca_bml_base_btl_array_t mca_bml_base_btl_array_t;
82
83 OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bml_base_btl_array_t);
84
85
86 /**
87 * If required, reallocate (grow) the array to the indicate size.
88 *
89 * @param array (IN)
90 * @param size (IN)
91 */
92 static inline size_t mca_bml_base_btl_array_get_size(mca_bml_base_btl_array_t* array)
93 {
94 return array->arr_size;
95 }
96
97 /**
98 * Grow the array if required, and set the size.
99 *
100 * @param array (IN)
101 * @param size (IN)
102 */
103 static inline void mca_bml_base_btl_array_set_size(mca_bml_base_btl_array_t* array, size_t size)
104 {
105 if(array->arr_size > array->arr_reserve)
106 mca_bml_base_btl_array_reserve(array, size);
107 array->arr_size = size;
108 }
109
110 /**
111 * Grow the array size by one and return the item at that index.
112 *
113 * @param array (IN)
114 */
115 static inline mca_bml_base_btl_t* mca_bml_base_btl_array_insert(mca_bml_base_btl_array_t* array)
116 {
117 #if OPAL_ENABLE_DEBUG
118 if(array->arr_size >= array->arr_reserve) {
119 opal_output(0, "mca_bml_base_btl_array_insert: invalid array index %lu >= %lu",
120 (unsigned long)array->arr_size, (unsigned long)array->arr_reserve);
121 return 0;
122 }
123 #endif
124 return &array->bml_btls[array->arr_size++];
125 }
126
127 /**
128 * Remove a btl from a bml_btl
129 *
130 * @param array (IN)
131 * @param btl (IN)
132 */
133 static inline bool mca_bml_base_btl_array_remove( mca_bml_base_btl_array_t* array,
134 struct mca_btl_base_module_t* btl )
135 {
136 size_t i = 0;
137 /* find the btl */
138 for( i = 0; i < array->arr_size; i++ ) {
139 if( array->bml_btls[i].btl == btl ) {
140 /* make sure not to go out of bounds */
141 for( ; i < array->arr_size-1; i++ ) {
142 /* move all btl's back by 1, so the found
143 btl is "removed" */
144 array->bml_btls[i] = array->bml_btls[(i+1)];
145 }
146 array->arr_size--;
147 array->arr_index = 0;
148 return true;
149 }
150 }
151 return false;
152 }
153
154
155 /**
156 * Return an array item at the specified index.
157 *
158 * @param array (IN)
159 * @param item_index (IN)
160 */
161 static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_index(mca_bml_base_btl_array_t* array, size_t item_index)
162 {
163 if (item_index < array->arr_size) {
164 return &array->bml_btls[item_index];
165 }
166
167 return NULL;
168 }
169
170 /**
171 * Return the next LRU index in the array.
172 *
173 * @param array (IN)
174 *
175 * @param index (OUT)
176 */
177 static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_next(mca_bml_base_btl_array_t* array)
178 {
179 #if OPAL_ENABLE_DEBUG
180 if(array->arr_size == 0) {
181 opal_output(0, "mca_bml_base_btl_array_get_next: invalid array size");
182 return 0;
183 }
184 #endif
185 if( 1 == array->arr_size ) {
186 return &array->bml_btls[0]; /* force the return to avoid a jump */
187 } else {
188 size_t current_position = array->arr_index; /* force to always start from zero */
189 if( (current_position + 1) == array->arr_size ) {
190 array->arr_index = 0; /* next time serve from the beginning */
191 } else {
192 array->arr_index = current_position + 1; /* continue */
193 }
194 return &array->bml_btls[current_position];
195 }
196 }
197
198 /**
199 * Locate an element in the array
200 *
201 * @param array (IN)
202 * @param index (IN)
203 */
204 static inline mca_bml_base_btl_t* mca_bml_base_btl_array_find(
205 mca_bml_base_btl_array_t* array, struct mca_btl_base_module_t* btl)
206 {
207 size_t i=0;
208 for(i=0; i<array->arr_size; i++) {
209 if(array->bml_btls[i].btl == btl) {
210 return &array->bml_btls[i];
211 }
212 }
213 return NULL;
214 }
215
216 /**
217 * Structure associated w/ ompi_proc_t that contains the set
218 * of BTLs used to reach a destination
219 */
220 struct mca_bml_base_endpoint_t {
221 opal_list_item_t super; /**< base_endpoint is a list item */
222 struct ompi_proc_t* btl_proc; /**< backpointer to target ompi_proc_t */
223 size_t btl_pipeline_send_length; /**< max of pipeline send_length of available BTLs */
224 size_t btl_send_limit; /**< max of min rdma pipeline for available rmda btls */
225 size_t btl_max_send_size; /**< min of max send size for available send btls */
226 mca_bml_base_btl_array_t btl_eager; /**< array of btls to use for first fragments */
227 mca_bml_base_btl_array_t btl_send; /**< array of btls to use for remaining fragments */
228 mca_bml_base_btl_array_t btl_rdma; /**< array of btls that support (prefer) rdma */
229 size_t btl_rdma_index; /**< index of last used BTL for RDMA */
230 uint32_t btl_flags_or; /**< the bitwise OR of the btl flags */
231 };
232 typedef struct mca_bml_base_endpoint_t mca_bml_base_endpoint_t;
233
234
235 OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bml_base_endpoint_t);
236
237 static inline void mca_bml_base_alloc( mca_bml_base_btl_t* bml_btl,
238 mca_btl_base_descriptor_t** des,
239 uint8_t order, size_t size, uint32_t flags )
240 {
241 mca_btl_base_module_t* btl = bml_btl->btl;
242 *des = btl->btl_alloc(btl, bml_btl->btl_endpoint, order, size, flags);
243 }
244
245 static inline void mca_bml_base_free( mca_bml_base_btl_t* bml_btl,
246 mca_btl_base_descriptor_t* des )
247 {
248 mca_btl_base_module_t* btl = bml_btl->btl;
249
250 btl->btl_free( btl, des );
251 /* The previous function is supposed to release the des object
252 * so we should not touch it anymore.
253 */
254 }
255
256 #if OPAL_ENABLE_DEBUG_RELIABILITY
257
258 int mca_bml_base_send( mca_bml_base_btl_t* bml_btl,
259 mca_btl_base_descriptor_t* des,
260 mca_btl_base_tag_t tag );
261
262
263 #else
264
265 static inline int mca_bml_base_send( mca_bml_base_btl_t* bml_btl,
266 mca_btl_base_descriptor_t* des,
267 mca_btl_base_tag_t tag )
268 {
269 int rc;
270 mca_btl_base_module_t* btl = bml_btl->btl;
271
272 des->des_context = (void*) bml_btl;
273 rc = btl->btl_send(btl, bml_btl->btl_endpoint, des, tag);
274 if (rc == OMPI_ERR_RESOURCE_BUSY)
275 rc = OMPI_SUCCESS;
276
277 return rc;
278 }
279
280 #endif
281
282 static inline int mca_bml_base_send_status( mca_bml_base_btl_t* bml_btl,
283 mca_btl_base_descriptor_t* des,
284 mca_btl_base_tag_t tag )
285 {
286 mca_btl_base_module_t* btl = bml_btl->btl;
287
288 des->des_context = (void*) bml_btl;
289 return btl->btl_send(btl, bml_btl->btl_endpoint, des, tag);
290 }
291
292 static inline int mca_bml_base_sendi( mca_bml_base_btl_t* bml_btl,
293 struct opal_convertor_t* convertor,
294 void* header,
295 size_t header_size,
296 size_t payload_size,
297 uint8_t order,
298 uint32_t flags,
299 mca_btl_base_tag_t tag,
300 mca_btl_base_descriptor_t** descriptor )
301 {
302 mca_btl_base_module_t* btl = bml_btl->btl;
303 return btl->btl_sendi(btl, bml_btl->btl_endpoint,
304 convertor, header, header_size,
305 payload_size, order, flags, tag, descriptor);
306 }
307
308 static inline int mca_bml_base_put( mca_bml_base_btl_t* bml_btl, void *local_address, uint64_t remote_address,
309 struct mca_btl_base_registration_handle_t *local_handle,
310 struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
311 int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbdata)
312 {
313 mca_btl_base_module_t* btl = bml_btl->btl;
314
315 return btl->btl_put( btl, bml_btl->btl_endpoint, local_address, remote_address, local_handle,
316 remote_handle, size, flags, order, cbfunc, (void *) bml_btl, cbdata);
317 }
318
319 static inline int mca_bml_base_get( mca_bml_base_btl_t* bml_btl, void *local_address, uint64_t remote_address,
320 struct mca_btl_base_registration_handle_t *local_handle,
321 struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
322 int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbdata)
323 {
324 mca_btl_base_module_t* btl = bml_btl->btl;
325
326 return btl->btl_get( btl, bml_btl->btl_endpoint, local_address, remote_address, local_handle,
327 remote_handle, size, flags, order, cbfunc, (void *) bml_btl, cbdata);
328 }
329
330
331 static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl,
332 struct opal_convertor_t* conv,
333 uint8_t order,
334 size_t reserve,
335 size_t *size,
336 uint32_t flags,
337 mca_btl_base_descriptor_t** des)
338 {
339 mca_btl_base_module_t* btl = bml_btl->btl;
340
341 *des = btl->btl_prepare_src( btl, bml_btl->btl_endpoint, conv,
342 order, reserve, size, flags );
343 if( OPAL_LIKELY((*des) != NULL) ) {
344 (*des)->des_context = (void*) bml_btl;
345 }
346 }
347
348 static inline void mca_bml_base_register_mem (mca_bml_base_btl_t* bml_btl, void *base,
349 size_t size, uint32_t flags,
350 mca_btl_base_registration_handle_t **handle)
351 {
352 mca_btl_base_module_t* btl = bml_btl->btl;
353
354 *handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base, size, flags);
355 }
356
357 static inline void mca_bml_base_deregister_mem (mca_bml_base_btl_t* bml_btl, mca_btl_base_registration_handle_t *handle)
358 {
359 mca_btl_base_module_t* btl = bml_btl->btl;
360
361 btl->btl_deregister_mem (btl, handle);
362 }
363
364 /*
365 * BML component interface functions and datatype.
366 */
367
/**
 * MCA->BML Initializes the BML component and creates a BML module.
 *
 * @param priority (OUT) Presumably receives the component's selection
 * priority -- NOTE(review): confirm against the BML base selection code.
 *
 * @param enable_progress_threads (IN) Whether this component is
 * allowed to run a hidden/progress thread or not.
 *
 * @param enable_mpi_threads (IN) Whether support for multiple MPI
 * threads is enabled or not (i.e., MPI_THREAD_MULTIPLE), which
 * indicates whether multiple threads may invoke this component
 * simultaneously or not.
 *
 * @return Pointer to a BML module, or NULL if the transport is not
 * available.
 *
 * During component initialization, the BML component should discover
 * the physical devices that are available for the given transport.
 * Any addressing information required by peers to reach the device
 * should be published during this function via the
 * mca_base_modex_send() interface.
 *
 * NOTE(review): the device-discovery wording above reads as if it was
 * inherited from the BTL interface description; verify that it applies
 * to BML components.
 */

typedef struct mca_bml_base_module_t *(*mca_bml_base_component_init_fn_t)(
    int  *priority,
    bool  enable_progress_threads,
    bool  enable_mpi_threads);
399
400 /**
401 * BML component descriptor. Contains component version information
402 * and component open/close/init functions.
403 */
404
405 struct mca_bml_base_component_2_0_0_t {
406 mca_base_component_t bml_version;
407 mca_base_component_data_t bml_data;
408 mca_bml_base_component_init_fn_t bml_init;
409 };
410 typedef struct mca_bml_base_component_2_0_0_t mca_bml_base_component_2_0_0_t;
411 typedef struct mca_bml_base_component_2_0_0_t mca_bml_base_component_t;
412
413
414 /*
415 * BML module interface functions and datatype.
416 */
417
/**
 * MCA->BML Clean up any resources held by the BML module
 * before the module is unloaded.
 *
 * The function takes no arguments.
 *
 * @return Integer status code.
 *
 * Prior to unloading a BML module, the MCA framework will call
 * the BML finalize method of the module. Any resources held by
 * the BML should be released and, if required, the memory
 * corresponding to the BML module freed.
 */
typedef int (*mca_bml_base_module_finalize_fn_t)(void);
431
/**
 * PML->BML notification of change in the process list.
 *
 * @param nprocs (IN) Number of processes
 * @param procs (IN) Set of processes
 * @param reachable (OUT) Bitmask indicating set of peer processes that
 * are reachable by this BML.
 * @return OMPI_SUCCESS or error status on failure.
 *
 * A function of this type is called by the PML to determine the set of
 * BTLs that should be used to reach each process. Any addressing
 * information exported by the peer via the mca_base_modex_send()
 * function should be available during this call via the corresponding
 * mca_base_modex_recv() function. The BML may utilize this information
 * to determine reachability of each peer process.
 *
 * For each process that is reachable by the BML, the bit corresponding
 * to the index into the proc array (nprocs) should be set in the
 * reachable bitmask. The PML provides the BML the option to return a
 * pointer to a data structure defined by the BML that is returned to
 * the BML on subsequent calls to the BML data transfer functions (e.g.
 * bml_send). This may be used by the BML to cache any addressing or
 * connection information (e.g. TCP socket, IP queue pair).
 *
 * \note This function will return OMPI_ERR_UNREACH if one or more
 * processes can not be reached by the currently active BTLs. This is
 * not a fatal error, and the calling layer is free to continue using
 * the BML interface.
 */
typedef int (*mca_bml_base_module_add_procs_fn_t)(
    size_t                 nprocs,
    struct ompi_proc_t   **procs,
    struct opal_bitmap_t  *reachable);
464
/**
 * PML->BML notification of a single process being added.
 *
 * @param proc (IN) Process
 * @return OMPI_SUCCESS or error status on failure.
 *
 * A function of this type is called by the PML to determine the set of
 * BTLs that should be used to reach the process. Any addressing
 * information exported by the peer via the mca_base_modex_send()
 * function should be available during this call via the corresponding
 * mca_base_modex_recv() function. The BML may utilize this information
 * to determine reachability of the peer process.
 *
 * \note This function will return OMPI_ERR_UNREACH if the process can
 * not be reached by a currently active BTL. This is not a fatal error,
 * and the calling layer is free to continue using the BML interface.
 */
typedef int (*mca_bml_base_module_add_proc_fn_t)(
    struct ompi_proc_t *proc);
483
/**
 * Notification of change to the process list.
 *
 * @param nprocs (IN) Number of processes
 * @param procs (IN) Set of processes
 * @return Status indicating if cleanup was successful
 *
 * When the process list changes, the PML notifies the BML of the
 * change, to provide the opportunity to clean up or release any
 * resources associated with the peer.
 */
typedef int (*mca_bml_base_module_del_procs_fn_t)(
    size_t               nprocs,
    struct ompi_proc_t **procs);
499
/**
 * Notification of change to the btl list (BTL added).
 *
 * @param btl (IN) BTL module
 * @return Integer status code
 *
 * On recovery of a btl, add it to the set of forwarding
 * entries used by the BML.
 */
typedef int (*mca_bml_base_module_add_btl_fn_t)(
    struct mca_btl_base_module_t *btl);
510
/**
 * Notification of change to the btl list (BTL removed).
 *
 * @param btl (IN) BTL module
 * @return Status indicating if cleanup was successful
 *
 * On failure of a btl, remove it from the set of forwarding
 * entries used by the BML.
 */
typedef int (*mca_bml_base_module_del_btl_fn_t)(
    struct mca_btl_base_module_t *btl);
521
/**
 * Notification of change to the btl list for one process.
 *
 * @param proc (IN) Process
 * @param btl (IN) BTL module
 * @return Status indicating if cleanup was successful
 *
 * On failure of a btl, remove it from the set of forwarding
 * entries used by the BML.
 *
 * NOTE(review): given the extra process argument this presumably
 * limits the removal to the entries kept for that process -- confirm
 * against the implementing BML component.
 */
typedef int (*mca_bml_base_module_del_proc_btl_fn_t)(
    struct ompi_proc_t           *proc,
    struct mca_btl_base_module_t *btl);
534
535 /**
536 * Register a callback function that is called on receipt
537 * of a fragment.
538 *
539 * @param bml (IN) BML module
540 * @return Status indicating if cleanup was successful
541 *
542 * When the process list changes, the PML notifies the BML of the
543 * change, to provide the opportunity to cleanup or release any
544 * resources associated with the peer.
545 */
546 typedef int (*mca_bml_base_module_register_fn_t)(
547 mca_btl_base_tag_t tag,
548 mca_btl_base_module_recv_cb_fn_t cbfunc,
549 void* cbdata
550 );
551 /**
552 * Register a callback function that is called of error.
553 *
554 * @param bml (IN) BML module
555 * @return Status indicating if cleanup was successful
556 *
557 */
558 typedef int (*mca_bml_base_module_register_error_cb_fn_t)(
559 mca_btl_base_module_error_cb_fn_t cbfunc
560 );
561
/**
 * Fault Tolerance Event Notification Function
 *
 * @param status (IN) Checkpoint Status
 * @return OMPI_SUCCESS or failure status
 */
typedef int (*mca_bml_base_module_ft_event_fn_t)(
    int status);
568
569
570 /**
571 * BML module interface functions and attributes.
572 */
573 struct mca_bml_base_module_t {
574 /* BML common attributes */
575 mca_bml_base_component_t* bml_component; /**< pointer back to the BML component structure */
576
577 /* BML function table */
578 mca_bml_base_module_add_proc_fn_t bml_add_proc;
579 mca_bml_base_module_add_procs_fn_t bml_add_procs;
580 mca_bml_base_module_del_procs_fn_t bml_del_procs;
581 mca_bml_base_module_add_btl_fn_t bml_add_btl;
582 mca_bml_base_module_del_btl_fn_t bml_del_btl;
583 mca_bml_base_module_del_proc_btl_fn_t bml_del_proc_btl;
584 mca_bml_base_module_register_fn_t bml_register;
585 mca_bml_base_module_register_error_cb_fn_t bml_register_error;
586
587 mca_bml_base_module_finalize_fn_t bml_finalize;
588
589 mca_bml_base_module_ft_event_fn_t bml_ft_event;
590 };
591 typedef struct mca_bml_base_module_t mca_bml_base_module_t;
592
/*
 * Version macro for use in components of the "bml" framework
 * (BML interface version 2.0.0).
 */
#define MCA_BML_BASE_VERSION_2_0_0 OMPI_MCA_BASE_VERSION_2_1_0("bml", 2, 0, 0)
598
#endif /* MCA_BML_H */