1 /*
2 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2014 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
7 * reserved.
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2005 The Regents of the University of California.
11 * All rights reserved.
12 * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
13 * Copyright (c) 2015 Research Organization for Information Science
14 * and Technology (RIST). All rights reserved.
15 * $COPYRIGHT$
16 *
17 * Additional copyrights may follow
18 *
19 * $HEADER$
20 */
21 /** @file */
22
23 #ifndef MCA_COLL_SM_EXPORT_H
24 #define MCA_COLL_SM_EXPORT_H
25
26 #include "ompi_config.h"
27
28 #include "mpi.h"
29 #include "ompi/mca/mca.h"
30 #include "opal/datatype/opal_convertor.h"
31 #include "opal/mca/common/sm/common_sm.h"
32 #include "ompi/mca/coll/coll.h"
33
34 BEGIN_C_DECLS
35
/* Attempt to give some sort of progress / fairness if we're blocked
   in an sm collective for a long time: call opal_progress once in a
   great while.  Use a "goto" label for expediency to exit loops.

   SPIN_CONDITION(cond, exit_label) polls "cond" up to
   SPIN_CONDITION_MAX times; if it still has not become true, it
   calls opal_progress() once and starts polling again, repeating
   forever until "cond" holds, then jumps to "exit_label".  The macro
   emits "exit_label:" itself after the loop, so each expansion
   within one function needs a distinct label name, and the caller's
   trailing ";" serves as the (empty) statement the label needs.
   NOTE(review): the local "int i" may shadow an "i" in the caller's
   scope (harmless here, but relevant when reading expansions such as
   PARENT_NOTIFY_CHILDREN, which deliberately uses the caller's "i"). */
#define SPIN_CONDITION_MAX 100000
#define SPIN_CONDITION(cond, exit_label) \
  do { int i; \
       if (cond) goto exit_label; \
       for (i = 0; i < SPIN_CONDITION_MAX; ++i) { \
           if (cond) { goto exit_label; } \
       } \
       opal_progress(); \
  } while (1); \
exit_label:
49
50 /**
51 * Structure to hold the sm coll component. First it holds the
52 * base coll component, and then holds a bunch of
53 * sm-coll-component-specific stuff (e.g., current MCA param
54 * values).
55 */
56 typedef struct mca_coll_sm_component_t {
57 /** Base coll component */
58 mca_coll_base_component_2_0_0_t super;
59
60 /** MCA parameter: Priority of this component */
61 int sm_priority;
62
63 /** MCA parameter: Length of a cache line or page (in bytes) */
64 int sm_control_size;
65
66 /** MCA parameter: Number of "in use" flags in each
67 communicator's area in the data mpool */
68 int sm_comm_num_in_use_flags;
69
70 /** MCA parameter: Number of segments for each communicator in
71 the data mpool */
72 int sm_comm_num_segments;
73
74 /** MCA parameter: Fragment size for data */
75 int sm_fragment_size;
76
77 /** MCA parameter: Degree of tree for tree-based collectives */
78 int sm_tree_degree;
79
80 /** MCA parameter: Number of processes to use in the
81 calculation of the "info" MCA parameter */
82 int sm_info_comm_size;
83
84 /******* end of MCA params ********/
85
86 /** How many fragment segments are protected by a single
87 in-use flags. This is solely so that we can only perform
88 the division once and then just use the value without
89 having to re-calculate. */
90 int sm_segs_per_inuse_flag;
91 } mca_coll_sm_component_t;
92
93 /**
94 * Structure for representing a node in the tree
95 */
96 typedef struct mca_coll_sm_tree_node_t {
97 /** Arbitrary ID number, starting from 0 */
98 int mcstn_id;
99 /** Pointer to parent, or NULL if root */
100 struct mca_coll_sm_tree_node_t *mcstn_parent;
101 /** Number of children, or 0 if a leaf */
102 int mcstn_num_children;
103 /** Pointer to an array of children, or NULL if 0 ==
104 mcstn_num_children */
105 struct mca_coll_sm_tree_node_t **mcstn_children;
106 } mca_coll_sm_tree_node_t;
107
108 /**
109 * Simple structure comprising the "in use" flags. Contains two
110 * members: the number of processes that are currently using this
111 * set of segments and the operation number of the current
112 * operation.
113 */
114 typedef struct mca_coll_sm_in_use_flag_t {
115 /** Number of processes currently using this set of
116 segments */
117 opal_atomic_uint32_t mcsiuf_num_procs_using;
118 /** Must match data->mcb_count */
119 volatile uint32_t mcsiuf_operation_count;
120 } mca_coll_sm_in_use_flag_t;
121
122 /**
123 * Structure containing pointers to various arrays of data in the
124 * per-communicator shmem data segment (one of these indexes a
125 * single segment in the per-communicator shmem data segment).
126 * Nothing is hard-coded because all the array lengths and
127 * displacements of the pointers all depend on how many processes
128 * are in the communicator.
129 */
130 typedef struct mca_coll_sm_data_index_t {
131 /** Pointer to beginning of control data */
132 uint32_t volatile *mcbmi_control;
133 /** Pointer to beginning of message fragment data */
134 char *mcbmi_data;
135 } mca_coll_sm_data_index_t;
136
137 /**
138 * Structure for the sm coll module to hang off the communicator.
139 * Contains communicator-specific information, including pointers
140 * into the per-communicator shmem data data segment for this
141 * comm's sm collective operations area.
142 */
143 typedef struct mca_coll_sm_comm_t {
144 /* Meta data that we get back from the common mmap allocation
145 function */
146 mca_common_sm_module_t *sm_bootstrap_meta;
147
148 /** Pointer to my barrier control pages (odd index pages are
149 "in", even index pages are "out") */
150 uint32_t *mcb_barrier_control_me;
151
152 /** Pointer to my parent's barrier control pages (will be NULL
153 for communicator rank 0; odd index pages are "in", even
154 index pages are "out") */
155 opal_atomic_uint32_t *mcb_barrier_control_parent;
156
157 /** Pointers to my childrens' barrier control pages (they're
158 contiguous in memory, so we only point to the base -- the
159 number of children is in my entry in the mcb_tree); will
160 be NULL if this process has no children (odd index pages
161 are "in", even index pages are "out") */
162 uint32_t *mcb_barrier_control_children;
163
164 /** Number of barriers that we have executed (i.e., which set
165 of barrier buffers to use). */
166 int mcb_barrier_count;
167
168 /** "In use" flags indicating which segments are available */
169 mca_coll_sm_in_use_flag_t *mcb_in_use_flags;
170
171 /** Array of indexes into the per-communicator shmem data
172 segment for control and data fragment passing (containing
173 pointers to each segments control and data areas). */
174 mca_coll_sm_data_index_t *mcb_data_index;
175
176 /** Array of graph nodes representing the tree used for
177 communications */
178 mca_coll_sm_tree_node_t *mcb_tree;
179
180 /** Operation number (i.e., which segment number to use) */
181 uint32_t mcb_operation_count;
182 } mca_coll_sm_comm_t;
183
/** Coll sm module */
typedef struct mca_coll_sm_module_t {
    /** Base module */
    mca_coll_base_module_t super;

    /* Whether this module has been lazily initialized or not yet
       (see ompi_coll_sm_lazy_enable()) */
    bool enabled;

    /* Data that hangs off the communicator */
    mca_coll_sm_comm_t *sm_comm_data;

    /* Underlying reduce function and module -- presumably the
       previously-selected coll module's reduce, saved as a fallback;
       confirm in the module-enable code */
    mca_coll_base_module_reduce_fn_t previous_reduce;
    mca_coll_base_module_t *previous_reduce_module;
} mca_coll_sm_module_t;
OBJ_CLASS_DECLARATION(mca_coll_sm_module_t);
200
201 /**
202 * Global component instance
203 */
204 OMPI_MODULE_DECLSPEC extern mca_coll_sm_component_t mca_coll_sm_component;
205
206 /*
207 * coll module functions
208 */
209 int mca_coll_sm_init_query(bool enable_progress_threads,
210 bool enable_mpi_threads);
211
212 mca_coll_base_module_t *
213 mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority);
214
215 /* Lazily enable a module (since it involves expensive/slow mmap
216 allocation, etc.) */
217 int ompi_coll_sm_lazy_enable(mca_coll_base_module_t *module,
218 struct ompi_communicator_t *comm);
219
220 int mca_coll_sm_allgather_intra(const void *sbuf, int scount,
221 struct ompi_datatype_t *sdtype,
222 void *rbuf, int rcount,
223 struct ompi_datatype_t *rdtype,
224 struct ompi_communicator_t *comm,
225 mca_coll_base_module_t *module);
226
227 int mca_coll_sm_allgatherv_intra(const void *sbuf, int scount,
228 struct ompi_datatype_t *sdtype,
229 void * rbuf, const int *rcounts, const int *disps,
230 struct ompi_datatype_t *rdtype,
231 struct ompi_communicator_t *comm,
232 mca_coll_base_module_t *module);
233 int mca_coll_sm_allreduce_intra(const void *sbuf, void *rbuf, int count,
234 struct ompi_datatype_t *dtype,
235 struct ompi_op_t *op,
236 struct ompi_communicator_t *comm,
237 mca_coll_base_module_t *module);
238 int mca_coll_sm_alltoall_intra(const void *sbuf, int scount,
239 struct ompi_datatype_t *sdtype,
240 void* rbuf, int rcount,
241 struct ompi_datatype_t *rdtype,
242 struct ompi_communicator_t *comm,
243 mca_coll_base_module_t *module);
244 int mca_coll_sm_alltoallv_intra(const void *sbuf, const int *scounts, const int *sdisps,
245 struct ompi_datatype_t *sdtype,
246 void *rbuf, const int *rcounts, const int *rdisps,
247 struct ompi_datatype_t *rdtype,
248 struct ompi_communicator_t *comm,
249 mca_coll_base_module_t *module);
250 int mca_coll_sm_alltoallw_intra(const void *sbuf, const int *scounts, const int *sdisps,
251 struct ompi_datatype_t * const *sdtypes,
252 void *rbuf, const int *rcounts, const int *rdisps,
253 struct ompi_datatype_t * const *rdtypes,
254 struct ompi_communicator_t *comm,
255 mca_coll_base_module_t *module);
256 int mca_coll_sm_barrier_intra(struct ompi_communicator_t *comm,
257 mca_coll_base_module_t *module);
258 int mca_coll_sm_bcast_intra(void *buff, int count,
259 struct ompi_datatype_t *datatype,
260 int root,
261 struct ompi_communicator_t *comm,
262 mca_coll_base_module_t *module);
263 int mca_coll_sm_bcast_log_intra(void *buff, int count,
264 struct ompi_datatype_t *datatype,
265 int root,
266 struct ompi_communicator_t *comm,
267 mca_coll_base_module_t *module);
268 int mca_coll_sm_exscan_intra(const void *sbuf, void *rbuf, int count,
269 struct ompi_datatype_t *dtype,
270 struct ompi_op_t *op,
271 struct ompi_communicator_t *comm,
272 mca_coll_base_module_t *module);
273 int mca_coll_sm_gather_intra(void *sbuf, int scount,
274 struct ompi_datatype_t *sdtype, void *rbuf,
275 int rcount, struct ompi_datatype_t *rdtype,
276 int root, struct ompi_communicator_t *comm,
277 mca_coll_base_module_t *module);
278 int mca_coll_sm_gatherv_intra(void *sbuf, int scount,
279 struct ompi_datatype_t *sdtype, void *rbuf,
280 int *rcounts, int *disps,
281 struct ompi_datatype_t *rdtype, int root,
282 struct ompi_communicator_t *comm,
283 mca_coll_base_module_t *module);
284 int mca_coll_sm_reduce_intra(const void *sbuf, void* rbuf, int count,
285 struct ompi_datatype_t *dtype,
286 struct ompi_op_t *op,
287 int root,
288 struct ompi_communicator_t *comm,
289 mca_coll_base_module_t *module);
290 int mca_coll_sm_reduce_log_intra(const void *sbuf, void* rbuf, int count,
291 struct ompi_datatype_t *dtype,
292 struct ompi_op_t *op,
293 int root,
294 struct ompi_communicator_t *comm,
295 mca_coll_base_module_t *module);
296 int mca_coll_sm_reduce_scatter_intra(const void *sbuf, void *rbuf,
297 int *rcounts,
298 struct ompi_datatype_t *dtype,
299 struct ompi_op_t *op,
300 struct ompi_communicator_t *comm,
301 mca_coll_base_module_t *module);
302 int mca_coll_sm_scan_intra(const void *sbuf, void *rbuf, int count,
303 struct ompi_datatype_t *dtype,
304 struct ompi_op_t *op,
305 struct ompi_communicator_t *comm,
306 mca_coll_base_module_t *module);
307 int mca_coll_sm_scatter_intra(const void *sbuf, int scount,
308 struct ompi_datatype_t *sdtype, void *rbuf,
309 int rcount, struct ompi_datatype_t *rdtype,
310 int root, struct ompi_communicator_t *comm,
311 mca_coll_base_module_t *module);
312 int mca_coll_sm_scatterv_intra(const void *sbuf, const int *scounts, const int *disps,
313 struct ompi_datatype_t *sdtype,
314 void* rbuf, int rcount,
315 struct ompi_datatype_t *rdtype, int root,
316 struct ompi_communicator_t *comm,
317 mca_coll_base_module_t *module);
318
319 int mca_coll_sm_ft_event(int state);
320
321 /**
322 * Global variables used in the macros (essentially constants, so
323 * these are thread safe)
324 */
325 extern uint32_t mca_coll_sm_one;
326
327
328 /**
329 * Macro to setup flag usage
330 */
331 #define FLAG_SETUP(flag_num, flag, data) \
332 (flag) = (mca_coll_sm_in_use_flag_t*) \
333 (((char *) (data)->mcb_in_use_flags) + \
334 ((flag_num) * mca_coll_sm_component.sm_control_size))
335
336 /**
337 * Macro to wait for the in-use flag to become idle (used by the root)
338 */
339 #define FLAG_WAIT_FOR_IDLE(flag, label) \
340 SPIN_CONDITION(0 == (flag)->mcsiuf_num_procs_using, label)
341
342 /**
343 * Macro to wait for a flag to indicate that it's ready for this
344 * operation (used by non-root processes to know when FLAG_SET() has
345 * been called)
346 */
347 #define FLAG_WAIT_FOR_OP(flag, op, label) \
348 SPIN_CONDITION((op) == flag->mcsiuf_operation_count, label)
349
350 /**
351 * Macro to set an in-use flag with relevant data to claim it
352 */
353 #define FLAG_RETAIN(flag, num_procs, op_count) \
354 (flag)->mcsiuf_num_procs_using = (num_procs); \
355 (flag)->mcsiuf_operation_count = (op_count)
356
357 /**
358 * Macro to release an in-use flag from this process
359 */
360 #define FLAG_RELEASE(flag) \
361 opal_atomic_add(&(flag)->mcsiuf_num_procs_using, -1)
362
363 /**
364 * Macro to copy a single segment in from a user buffer to a shared
365 * segment
366 */
367 #define COPY_FRAGMENT_IN(convertor, index, rank, iov, max_data) \
368 (iov).iov_base = \
369 (index)->mcbmi_data + \
370 ((rank) * mca_coll_sm_component.sm_fragment_size); \
371 (iov).iov_len = (max_data); \
372 opal_convertor_pack(&(convertor), &(iov), &mca_coll_sm_one, \
373 &(max_data) )
374
375 /**
376 * Macro to copy a single segment out from a shared segment to a user
377 * buffer
378 */
379 #define COPY_FRAGMENT_OUT(convertor, src_rank, index, iov, max_data) \
380 (iov).iov_base = (((char*) (index)->mcbmi_data) + \
381 ((src_rank) * (mca_coll_sm_component.sm_fragment_size))); \
382 (iov).iov_len = (max_data); \
383 opal_convertor_unpack(&(convertor), &(iov), &mca_coll_sm_one, \
384 &(max_data) )
385
386 /**
387 * Macro to memcpy a fragment between one shared segment and another
388 */
389 #define COPY_FRAGMENT_BETWEEN(src_rank, dest_rank, index, len) \
390 memcpy(((index)->mcbmi_data + \
391 ((dest_rank) * mca_coll_sm_component.sm_fragment_size)), \
392 ((index)->mcbmi_data + \
393 ((src_rank) * \
394 mca_coll_sm_component.sm_fragment_size)), \
395 (len))
396
397 /**
398 * Macro to tell children that a segment is ready (normalize
399 * the child's ID based on the shift used to calculate the "me" node
400 * in the tree). Used in fan out opertations.
401 */
402 #define PARENT_NOTIFY_CHILDREN(children, num_children, index, value) \
403 do { \
404 for (i = 0; i < (num_children); ++i) { \
405 *((size_t*) \
406 (((char*) index->mcbmi_control) + \
407 (mca_coll_sm_component.sm_control_size * \
408 (((children)[i]->mcstn_id + root) % size)))) = (value); \
409 } \
410 } while (0)
411
412 /**
413 * Macro for childen to wait for parent notification (use real rank).
414 * Save the value passed and then reset it when done. Used in fan out
415 * operations.
416 */
417 #define CHILD_WAIT_FOR_NOTIFY(rank, index, value, label) \
418 do { \
419 uint32_t volatile *ptr = ((uint32_t*) \
420 (((char*) index->mcbmi_control) + \
421 ((rank) * mca_coll_sm_component.sm_control_size))); \
422 SPIN_CONDITION(0 != *ptr, label); \
423 (value) = *ptr; \
424 *ptr = 0; \
425 } while (0)
426
427 /**
428 * Macro for children to tell parent that the data is ready in their
429 * segment. Used for fan in operations.
430 */
431 #define CHILD_NOTIFY_PARENT(child_rank, parent_rank, index, value) \
432 ((size_t volatile *) \
433 (((char*) (index)->mcbmi_control) + \
434 (mca_coll_sm_component.sm_control_size * \
435 (parent_rank))))[(child_rank)] = (value)
436
437 /**
438 * Macro for parent to wait for a specific child to tell it that the
439 * data is in the child's segment. Save the value when done. Used
440 * for fan in operations.
441 */
442 #define PARENT_WAIT_FOR_NOTIFY_SPECIFIC(child_rank, parent_rank, index, value, label) \
443 do { \
444 size_t volatile *ptr = ((size_t volatile *) \
445 (((char*) index->mcbmi_control) + \
446 (mca_coll_sm_component.sm_control_size * \
447 (parent_rank)))) + child_rank; \
448 SPIN_CONDITION(0 != *ptr, label); \
449 (value) = *ptr; \
450 *ptr = 0; \
451 } while (0)
452
453 END_C_DECLS
454
455 #endif /* MCA_COLL_SM_EXPORT_H */