1 /* 2 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana 3 * University Research and Technology 4 * Corporation. All rights reserved. 5 * Copyright (c) 2004-2014 The University of Tennessee and The University 6 * of Tennessee Research Foundation. All rights 7 * reserved. 8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 9 * University of Stuttgart. All rights reserved. 10 * Copyright (c) 2004-2005 The Regents of the University of California. 11 * All rights reserved. 12 * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. 13 * $COPYRIGHT$ 14 * 15 * Additional copyrights may follow 16 * 17 * $HEADER$ 18 */ 19 /** @file */ 20 21 #include "ompi_config.h" 22 23 #include "ompi/constants.h" 24 #include "ompi/communicator/communicator.h" 25 #include "ompi/mca/coll/coll.h" 26 #include "opal/sys/atomic.h" 27 #include "coll_sm.h" 28 29 /** 30 * Shared memory barrier. 31 * 32 * Tree-based algorithm for a barrier: a fan in to rank 0 followed by 33 * a fan out using the barrier segments in the shared memory area. 34 * 35 * There are 2 sets of barrier buffers -- since there can only be, at 36 * most, 2 outstanding barriers at any time, there is no need for more 37 * than this. The generalized in-use flags, control, and data 38 * segments are not used. 39 * 40 * The general algorithm is for a given process to wait for its N 41 * children to fan in by monitoring a uint32_t in its barrier "in" 42 * buffer. When this value reaches N (i.e., each of the children have 43 * atomically incremented the value), then the process atomically 44 * increases the uint32_t in its parent's "in" buffer. Then the 45 * process waits for the parent to set a "1" in the process' "out" 46 * buffer. Once this happens, the process writes a "1" in each of its 47 * children's "out" buffers, and returns. 48 * 49 * There's corner cases, of course, such as the root that has no 50 * parent, and the leaves that have no children. But that's the 51 * general idea. 52 */ 53 int mca_coll_sm_barrier_intra(struct ompi_communicator_t *comm, 54 mca_coll_base_module_t *module) 55 { 56 int rank, buffer_set; 57 mca_coll_sm_comm_t *data; 58 uint32_t i, num_children; 59 volatile uint32_t *me_in, *me_out, *children = NULL; 60 opal_atomic_uint32_t *parent; 61 int uint_control_size; 62 mca_coll_sm_module_t *sm_module = (mca_coll_sm_module_t*) module; 63 64 /* Lazily enable the module the first time we invoke a collective 65 on it */ 66 if (!sm_module->enabled) { 67 int ret; 68 if (OMPI_SUCCESS != (ret = ompi_coll_sm_lazy_enable(module, comm))) { 69 return ret; 70 } 71 } 72 73 uint_control_size = 74 mca_coll_sm_component.sm_control_size / sizeof(uint32_t); 75 data = sm_module->sm_comm_data; 76 rank = ompi_comm_rank(comm); 77 num_children = data->mcb_tree[rank].mcstn_num_children; 78 buffer_set = ((data->mcb_barrier_count++) % 2) * 2; 79 me_in = &data->mcb_barrier_control_me[buffer_set]; 80 me_out = (uint32_t*) 81 (((char*) me_in) + mca_coll_sm_component.sm_control_size); 82 83 /* Wait for my children to write to my *in* buffer */ 84 85 if (0 != num_children) { 86 /* Get children *out* buffer */ 87 children = data->mcb_barrier_control_children + buffer_set + 88 uint_control_size; 89 SPIN_CONDITION(*me_in == num_children, exit_label1); 90 *me_in = 0; 91 } 92 93 /* Send to my parent and wait for a response (don't poll on 94 parent's out buffer -- that would cause a lot of network 95 traffic / contention / faults / etc. Instead, children poll on 96 local memory and therefore only num_children messages are sent 97 across the network [vs. num_children *each* time all the 98 children poll] -- i.e., the memory is only being polled by one 99 process, and it is only changed *once* by an external 100 process) */ 101 102 if (0 != rank) { 103 /* Get parent *in* buffer */ 104 parent = &data->mcb_barrier_control_parent[buffer_set]; 105 opal_atomic_add (parent, 1); 106 107 SPIN_CONDITION(0 != *me_out, exit_label2); 108 *me_out = 0; 109 } 110 111 /* Send to my children */ 112 113 for (i = 0; i < num_children; ++i) { 114 children[i * uint_control_size * 4] = 1; 115 } 116 117 /* All done! End state of the control segment: 118 119 me_in: 0 120 me_out: 0 121 */ 122 123 return OMPI_SUCCESS; 124 }