1 /*
2 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2014 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
7 * reserved.
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2005 The Regents of the University of California.
11 * All rights reserved.
12 * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
13 * $COPYRIGHT$
14 *
15 * Additional copyrights may follow
16 *
17 * $HEADER$
18 */
19 /** @file */
20
21 #include "ompi_config.h"
22
23 #include "ompi/constants.h"
24 #include "ompi/communicator/communicator.h"
25 #include "ompi/mca/coll/coll.h"
26 #include "opal/sys/atomic.h"
27 #include "coll_sm.h"
28
29 /**
30 * Shared memory barrier.
31 *
32 * Tree-based algorithm for a barrier: a fan in to rank 0 followed by
33 * a fan out using the barrier segments in the shared memory area.
34 *
35 * There are 2 sets of barrier buffers -- since there can only be, at
36 * most, 2 outstanding barriers at any time, there is no need for more
37 * than this. The generalized in-use flags, control, and data
38 * segments are not used.
39 *
40 * The general algorithm is for a given process to wait for its N
41 * children to fan in by monitoring a uint32_t in its barrier "in"
42 * buffer. When this value reaches N (i.e., each of the children have
43 * atomically incremented the value), then the process atomically
44 * increases the uint32_t in its parent's "in" buffer. Then the
45 * process waits for the parent to set a "1" in the process' "out"
46 * buffer. Once this happens, the process writes a "1" in each of its
47 * children's "out" buffers, and returns.
48 *
49 * There's corner cases, of course, such as the root that has no
50 * parent, and the leaves that have no children. But that's the
51 * general idea.
52 */
53 int mca_coll_sm_barrier_intra(struct ompi_communicator_t *comm,
54 mca_coll_base_module_t *module)
55 {
56 int rank, buffer_set;
57 mca_coll_sm_comm_t *data;
58 uint32_t i, num_children;
59 volatile uint32_t *me_in, *me_out, *children = NULL;
60 opal_atomic_uint32_t *parent;
61 int uint_control_size;
62 mca_coll_sm_module_t *sm_module = (mca_coll_sm_module_t*) module;
63
64 /* Lazily enable the module the first time we invoke a collective
65 on it */
66 if (!sm_module->enabled) {
67 int ret;
68 if (OMPI_SUCCESS != (ret = ompi_coll_sm_lazy_enable(module, comm))) {
69 return ret;
70 }
71 }
72
73 uint_control_size =
74 mca_coll_sm_component.sm_control_size / sizeof(uint32_t);
75 data = sm_module->sm_comm_data;
76 rank = ompi_comm_rank(comm);
77 num_children = data->mcb_tree[rank].mcstn_num_children;
78 buffer_set = ((data->mcb_barrier_count++) % 2) * 2;
79 me_in = &data->mcb_barrier_control_me[buffer_set];
80 me_out = (uint32_t*)
81 (((char*) me_in) + mca_coll_sm_component.sm_control_size);
82
83 /* Wait for my children to write to my *in* buffer */
84
85 if (0 != num_children) {
86 /* Get children *out* buffer */
87 children = data->mcb_barrier_control_children + buffer_set +
88 uint_control_size;
89 SPIN_CONDITION(*me_in == num_children, exit_label1);
90 *me_in = 0;
91 }
92
93 /* Send to my parent and wait for a response (don't poll on
94 parent's out buffer -- that would cause a lot of network
95 traffic / contention / faults / etc. Instead, children poll on
96 local memory and therefore only num_children messages are sent
97 across the network [vs. num_children *each* time all the
98 children poll] -- i.e., the memory is only being polled by one
99 process, and it is only changed *once* by an external
100 process) */
101
102 if (0 != rank) {
103 /* Get parent *in* buffer */
104 parent = &data->mcb_barrier_control_parent[buffer_set];
105 opal_atomic_add (parent, 1);
106
107 SPIN_CONDITION(0 != *me_out, exit_label2);
108 *me_out = 0;
109 }
110
111 /* Send to my children */
112
113 for (i = 0; i < num_children; ++i) {
114 children[i * uint_control_size * 4] = 1;
115 }
116
117 /* All done! End state of the control segment:
118
119 me_in: 0
120 me_out: 0
121 */
122
123 return OMPI_SUCCESS;
124 }