root/ompi/mca/coll/base/coll_base_barrier.c


DEFINITIONS

This source file includes the following definitions:
  1. ompi_coll_base_sendrecv_zero
  2. ompi_coll_base_barrier_intra_doublering
  3. ompi_coll_base_barrier_intra_recursivedoubling
  4. ompi_coll_base_barrier_intra_bruck
  5. ompi_coll_base_barrier_intra_two_procs
  6. ompi_coll_base_barrier_intra_basic_linear
  7. ompi_coll_base_barrier_intra_tree

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2017 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
  14  * Copyright (c) 2013      Los Alamos National Security, LLC. All Rights
  15  *                         reserved.
  16  * Copyright (c) 2015-2016 Research Organization for Information Science
  17  *                         and Technology (RIST). All rights reserved.
  18  * Copyright (c) 2017      IBM Corporation. All rights reserved.
  19  * $COPYRIGHT$
  20  *
  21  * Additional copyrights may follow
  22  *
  23  * $HEADER$
  24  */
  25 
  26 #include "ompi_config.h"
  27 
  28 #include "mpi.h"
  29 #include "opal/util/bit_ops.h"
  30 #include "ompi/constants.h"
  31 #include "ompi/communicator/communicator.h"
  32 #include "ompi/mca/coll/coll.h"
  33 #include "ompi/mca/coll/base/coll_tags.h"
  34 #include "ompi/mca/pml/pml.h"
  35 #include "ompi/mca/coll/base/coll_base_functions.h"
  36 #include "coll_base_topo.h"
  37 #include "coll_base_util.h"
  38 
  39 /**
  40  * A quick version of MPI_Sendrecv implemented for the barrier.
  41  * No actual data is moved across the wire; 0-byte messages are used
  42  * to signal a two-peer synchronization.
  43  */
  44 static inline int
  45 ompi_coll_base_sendrecv_zero( int dest, int stag,
  46                               int source, int rtag,
  47                               MPI_Comm comm )
  48 
  49 {
  50     int rc, line = 0;
  51     ompi_request_t *req = MPI_REQUEST_NULL;
  52     ompi_status_public_t status;
  53 
  54     /* post new irecv */
  55     rc = MCA_PML_CALL(irecv( NULL, 0, MPI_BYTE, source, rtag,
  56                              comm, &req ));
  57     if( MPI_SUCCESS != rc ) { line = __LINE__; goto error_handler; }
  58 
  59     /* send a zero-byte message to the peer */
  60     rc = MCA_PML_CALL(send( NULL, 0, MPI_BYTE, dest, stag,
  61                             MCA_PML_BASE_SEND_STANDARD, comm ));
  62     if( MPI_SUCCESS != rc ) { line = __LINE__; goto error_handler; }
  63 
  64     rc = ompi_request_wait( &req, &status );
  65     if( MPI_SUCCESS != rc ) { line = __LINE__; goto error_handler; }
  66 
  67     return (MPI_SUCCESS);
  68 
  69  error_handler:
  70     if( MPI_REQUEST_NULL != req ) {  /* cancel and complete the receive request */
  71         (void)ompi_request_cancel(req);
  72         (void)ompi_request_wait(&req, &status);
  73     }
  74 
  75     OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
  76                   __FILE__, line, rc));
  77     (void)line;  // silence compiler warning
  78     return rc;
  79 }
  80 
  81 /*
  82  * Barrier is meant to be a synchronous operation. Since some BTLs can mark
  83  * a request as done before it is passed to the NIC, and progress might not
  84  * be made elsewhere, we cannot allow a process to exit the barrier until
  85  * its last [round of] sends has completed.
  86  *
  87  * It is the last round of sends rather than the 'last' individual send,
  88  * because each pair of peers can use different channels/devices/BTLs and
  89  * the receiver of one of these sends might otherwise be forced to wait
  90  * while the sender leaves the collective and makes no progress until its
  91  * next MPI call.
  92  */
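     /*
      * Illustrative sketch (not part of this file's control flow; 'peer'
      * stands for whichever neighbour an algorithm notifies last): the final
      * notification is posted with the synchronous send mode, so it cannot
      * complete before the matching receive has started on the peer, e.g.
      *
      *     err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, peer,
      *                             MCA_COLL_BASE_TAG_BARRIER,
      *                             MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
      *
      * Earlier rounds may use MCA_PML_BASE_SEND_STANDARD; only the last round
      * needs the synchronous guarantee described above.
      */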
  93 
  94 /*
  95  * Simple double-ring version of barrier.
  96  *
  97  * The synchronous guarantee is provided by making the last ring of sends
  98  * synchronous.
  99  */
 100 int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm,
 101                                              mca_coll_base_module_t *module)
 102 {
 103     int rank, size, err = 0, line = 0, left, right;
 104 
 105     size = ompi_comm_size(comm);
 106     if( 1 == size )
 107         return OMPI_SUCCESS;
 108     rank = ompi_comm_rank(comm);
 109 
 110     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank));
 111 
 112     left = ((rank-1+size)%size);
 113     right = ((rank+1)%size);
 114 
 115     if (rank > 0) { /* receive message from the left */
 116         err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
 117                                 MCA_COLL_BASE_TAG_BARRIER, comm,
 118                                 MPI_STATUS_IGNORE));
 119         if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
 120     }
 121 
 122     /* Send message to the right */
 123     err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
 124                             MCA_COLL_BASE_TAG_BARRIER,
 125                             MCA_PML_BASE_SEND_STANDARD, comm));
 126     if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
 127 
 128     /* root needs to receive from the last node */
 129     if (rank == 0) {
 130         err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
 131                                 MCA_COLL_BASE_TAG_BARRIER, comm,
 132                                 MPI_STATUS_IGNORE));
 133         if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
 134     }
 135 
 136     /* Allow nodes to exit */
 137     if (rank > 0) { /* post Receive from left */
 138         err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
 139                                 MCA_COLL_BASE_TAG_BARRIER, comm,
 140                                 MPI_STATUS_IGNORE));
 141         if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
 142     }
 143 
 144     /* Send a synchronous message to the right */
 145     err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
 146                             MCA_COLL_BASE_TAG_BARRIER,
 147                             MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
 148     if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
 149 
 150     /* rank 0 posts a receive from the last node */
 151     if (rank == 0) {
 152         err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
 153                                 MCA_COLL_BASE_TAG_BARRIER, comm,
 154                                 MPI_STATUS_IGNORE));
 155         if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
 156     }
 157 
 158     return MPI_SUCCESS;
 159 
 160  err_hndl:
 161     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
 162                  __FILE__, line, err, rank));
 163     (void)line;  // silence compiler warning
 164     return err;
 165 }
 166 
 167 
 168 /*
 169  * To make the barrier synchronous, this algorithm uses synchronous sends and sendrecvs
 170  */
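     /*
      * Illustrative example (assuming 6 processes, derived from the code
      * below): adjsize = 4.  Ranks 4 and 5 post a zero-byte sendrecv with
      * ranks 0 and 1, whose receives complete in the extra first step.
      * Ranks 0-3 then perform the recursive-doubling exchanges (partners
      * rank^1, then rank^2), and finally ranks 0 and 1 release ranks 4 and 5
      * with a synchronous zero-byte send that completes the pending sendrecv.
      */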
 171 
 172 int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
 173                                                     mca_coll_base_module_t *module)
 174 {
 175     int rank, size, adjsize, err, line, mask, remote;
 176 
 177     size = ompi_comm_size(comm);
 178     if( 1 == size )
 179         return OMPI_SUCCESS;
 180     rank = ompi_comm_rank(comm);
 181     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
 182                  "ompi_coll_base_barrier_intra_recursivedoubling rank %d",
 183                  rank));
 184 
 185     /* compute the largest power of two not larger than size */
 186     adjsize = opal_next_poweroftwo(size);
 187     adjsize >>= 1;
 188 
 189     /* if size is not an exact power of two, perform an extra step */
 190     if (adjsize != size) {
 191         if (rank >= adjsize) {
 192             /* exchange a zero-byte message with the corresponding lower-ranked process */
 193             remote = rank - adjsize;
 194             err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
 195                                                remote, MCA_COLL_BASE_TAG_BARRIER,
 196                                                comm);
 197             if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
 198 
 199         } else if (rank < (size - adjsize)) {
 200 
 201             /* receive a message from the corresponding higher-ranked process */
 202             err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
 203                                     MCA_COLL_BASE_TAG_BARRIER, comm,
 204                                     MPI_STATUS_IGNORE));
 205 
 206             if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
 207         }
 208     }
 209 
 210     /* recursive-doubling exchange: at step k, pair with (rank XOR 2^k) */
 211     if ( rank < adjsize ) {
 212         mask = 0x1;
 213         while ( mask < adjsize ) {
 214             remote = rank ^ mask;
 215             mask <<= 1;
 216             if (remote >= adjsize) continue;
 217 
 218             /* exchange zero-byte messages with the remote process */
 219             err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
 220                                                remote, MCA_COLL_BASE_TAG_BARRIER,
 221                                                comm);
 222             if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
 223         }
 224     }
 225 
 226     /* non-power of 2 case */
 227     if (adjsize != size) {
 228         if (rank < (size - adjsize)) {
 229             /* send release message to the corresponding higher-ranked process */
 230             remote = rank + adjsize;
 231             err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote,
 232                                     MCA_COLL_BASE_TAG_BARRIER,
 233                                     MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
 234 
 235             if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
 236         }
 237     }
 238 
 239     return MPI_SUCCESS;
 240 
 241  err_hndl:
 242     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
 243                  __FILE__, line, err, rank));
 244     (void)line;  // silence compiler warning
 245     return err;
 246 }
 247 
 248 
 249 /*
 250  * To make the barrier synchronous, this algorithm uses synchronous sends and sendrecvs
 251  */
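     /*
      * Illustrative sketch of the dissemination pattern (derived from the
      * loop below, assuming 5 processes): three rounds with distance = 1, 2,
      * 4, in which each rank sends to (rank + distance) % size and receives
      * from (rank - distance + size) % size.
      */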
 252 
 253 int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
 254                                         mca_coll_base_module_t *module)
 255 {
 256     int rank, size, distance, to, from, err, line = 0;
 257 
 258     size = ompi_comm_size(comm);
 259     if( 1 == size )
 260         return MPI_SUCCESS;
 261     rank = ompi_comm_rank(comm);
 262     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
 263                  "ompi_coll_base_barrier_intra_bruck rank %d", rank));
 264 
 265     /* exchange data with rank-2^k and rank+2^k */
 266     for (distance = 1; distance < size; distance <<= 1) {
 267         from = (rank + size - distance) % size;
 268         to   = (rank + distance) % size;
 269 
 270         /* send to (rank + distance) and receive from (rank - distance) */
 271         err = ompi_coll_base_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER,
 272                                            from, MCA_COLL_BASE_TAG_BARRIER,
 273                                            comm);
 274         if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
 275     }
 276 
 277     return MPI_SUCCESS;
 278 
 279  err_hndl:
 280     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
 281                  __FILE__, line, err, rank));
 282     (void)line;  // silence compiler warning
 283     return err;
 284 }
 285 
 286 
 287 /*
 288  * To make the barrier synchronous, this algorithm uses synchronous sends and sendrecvs
 289  */
 290 /* special case for two processes */
 291 int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm,
 292                                             mca_coll_base_module_t *module)
 293 {
 294     int remote, size, err;
 295 
 296     size = ompi_comm_size(comm);
 297     if( 1 == size )
 298         return MPI_SUCCESS;
 299     if( 2 != size ) {
 300         return MPI_ERR_UNSUPPORTED_OPERATION;
 301     }
 302 
 303     remote = ompi_comm_rank(comm);
 304     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
 305                  "ompi_coll_base_barrier_intra_two_procs rank %d", remote));
 306 
 307     remote = (remote + 1) & 0x1;
 308 
 309     err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
 310                                        remote, MCA_COLL_BASE_TAG_BARRIER,
 311                                        comm);
 312     return (err);
 313 }
 314 
 315 
 316 /*
 317  * Linear functions are copied from the BASIC coll module.
 318  * They do not segment the message and are simple implementations,
 319  * but for small numbers of nodes and/or small data sizes they
 320  * are just as fast as the base/tree-based segmenting operations,
 321  * and as such may be selected by the decision functions.
 322  * These are copied into this module because of the way we select modules
 323  * in V1; in V2 we will handle this differently and will not
 324  * have to duplicate code.
 325  * GEF Oct05 after asking Jeff.
 326  */
 327 
 328 /* copied function (with appropriate renaming) starts here */
 329 
 330 int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
 331                                               mca_coll_base_module_t *module)
 332 {
 333     int i, err, rank, size, line;
 334     ompi_request_t** requests = NULL;
 335 
 336     size = ompi_comm_size(comm);
 337     if( 1 == size )
 338         return MPI_SUCCESS;
 339     rank = ompi_comm_rank(comm);
 340 
 341     /* All non-root ranks send and then receive a zero-length message. */
 342     if (rank > 0) {
 343         err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
 344                                  MCA_COLL_BASE_TAG_BARRIER,
 345                                  MCA_PML_BASE_SEND_STANDARD, comm));
 346         if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 347 
 348         err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0,
 349                                  MCA_COLL_BASE_TAG_BARRIER,
 350                                  comm, MPI_STATUS_IGNORE));
 351         if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 352     }
 353 
 354     /* The root collects and broadcasts the messages. */
 355 
 356     else {
 357         requests = ompi_coll_base_comm_get_reqs(module->base_data, size);
 358         if( NULL == requests ) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; }
 359 
 360         for (i = 1; i < size; ++i) {
 361             err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
 362                                      MCA_COLL_BASE_TAG_BARRIER, comm,
 363                                      &(requests[i])));
 364             if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 365         }
 366         err = ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );
 367         if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 368         requests = NULL;  /* we're done; the requests array is clean */
 369 
 370         for (i = 1; i < size; ++i) {
 371             err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, i,
 372                                     MCA_COLL_BASE_TAG_BARRIER,
 373                                     MCA_PML_BASE_SEND_STANDARD, comm));
 374             if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 375         }
 376     }
 377 
 378     /* All done */
 379     return MPI_SUCCESS;
 380  err_hndl:
 381     if( NULL != requests ) {
 382         /* find a real error code */
 383         if (MPI_ERR_IN_STATUS == err) {
 384             for( i = 0; i < size; i++ ) {
 385                 if (MPI_REQUEST_NULL == requests[i]) continue;
 386                 if (MPI_ERR_PENDING == requests[i]->req_status.MPI_ERROR) continue;
 387                 err = requests[i]->req_status.MPI_ERROR;
 388                 break;
 389             }
 390         }
 391         ompi_coll_base_free_reqs(requests, size);
 392     }
 393     OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
 394                   __FILE__, line, err, rank) );
 395     (void)line;  // silence compiler warning
 396     return err;
 397 }
 398 /* copied function (with appropriate renaming) ends here */
 399 
 400 /*
 401  * Another recursive doubling type algorithm, but in this case
 402  * we go up the tree and back down the tree.
 403  */
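     /*
      * Illustrative example (assuming 8 processes, derived from the loops
      * below).  Fan-in towards rank 0:
      *   jump=1:  1->0, 3->2, 5->4, 7->6
      *   jump=2:  2->0, 6->4
      *   jump=4:  4->0
      * Fan-out back down the same tree:
      *   jump=4:  0->4
      *   jump=2:  0->2, 4->6
      *   jump=1:  0->1, 2->3, 4->5, 6->7
      */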
 404 int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm,
 405                                        mca_coll_base_module_t *module)
 406 {
 407     int rank, size, depth, err, jump, partner;
 408 
 409     size = ompi_comm_size(comm);
 410     if( 1 == size )
 411         return MPI_SUCCESS;
 412     rank = ompi_comm_rank(comm);
 413     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
 414                  "ompi_coll_base_barrier_intra_tree %d",
 415                  rank));
 416 
 417     /* Find the smallest power of two not smaller than the communicator size. */
 418     depth = opal_next_poweroftwo_inclusive(size);
 419 
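         /*
          * Fan-in phase: at each level only ranks that are multiples of
          * 'jump' take part; the member of each pair whose 'jump' bit is set
          * sends a zero-byte message down to its partner, funnelling towards
          * rank 0.
          */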
 420     for (jump=1; jump<depth; jump<<=1) {
 421         partner = rank ^ jump;
 422         if (!(partner & (jump-1)) && partner < size) {
 423             if (partner > rank) {
 424                 err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
 425                                          MCA_COLL_BASE_TAG_BARRIER, comm,
 426                                          MPI_STATUS_IGNORE));
 427                 if (MPI_SUCCESS != err)
 428                     return err;
 429             } else if (partner < rank) {
 430                 err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner,
 431                                          MCA_COLL_BASE_TAG_BARRIER,
 432                                          MCA_PML_BASE_SEND_STANDARD, comm));
 433                 if (MPI_SUCCESS != err)
 434                     return err;
 435             }
 436         }
 437     }
 438 
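         /*
          * Fan-out phase: walk back down the same tree; once released, each
          * parent releases its higher-ranked partners with a zero-byte send.
          */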
 439     depth >>= 1;
 440     for (jump = depth; jump>0; jump>>=1) {
 441         partner = rank ^ jump;
 442         if (!(partner & (jump-1)) && partner < size) {
 443             if (partner > rank) {
 444                 err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner,
 445                                          MCA_COLL_BASE_TAG_BARRIER,
 446                                          MCA_PML_BASE_SEND_STANDARD, comm));
 447                 if (MPI_SUCCESS != err)
 448                     return err;
 449             } else if (partner < rank) {
 450                 err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
 451                                          MCA_COLL_BASE_TAG_BARRIER, comm,
 452                                          MPI_STATUS_IGNORE));
 453                 if (MPI_SUCCESS != err)
 454                     return err;
 455             }
 456         }
 457     }
 458 
 459     return MPI_SUCCESS;
 460 }
