/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2017 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      University of Houston. All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC. All Rights
 *                         reserved.
 * Copyright (c) 2014-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "mpi.h"
#include "opal/util/bit_ops.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"

/*
 * ompi_coll_base_allgather_intra_bruck
 *
 * Function:     allgather using O(log(N)) steps.
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Variation of the all-to-all algorithm described by Bruck et al. in
 *               "Efficient Algorithms for All-to-all Communications
 *               in Multiport Message-Passing Systems".
 * Memory requirements:  non-zero ranks require a shift buffer to perform the
 *               final step of the algorithm.
 *
 * Example on 6 nodes:
 *   Initialization: everyone has its own buffer at location 0 in rbuf.
 *                   This means that if the user specified MPI_IN_PLACE for sendbuf,
 *                   we must copy our block from recvbuf to the beginning!
 *    #     0      1      2      3      4      5
 *         [0]    [1]    [2]    [3]    [4]    [5]
 *   Step 0: send message to (rank - 2^0), receive message from (rank + 2^0)
 *    #     0      1      2      3      4      5
 *         [0]    [1]    [2]    [3]    [4]    [5]
 *         [1]    [2]    [3]    [4]    [5]    [0]
 *   Step 1: send message to (rank - 2^1), receive message from (rank + 2^1);
 *           the message contains all blocks from location 0 to 2^1 * block size
 *    #     0      1      2      3      4      5
 *         [0]    [1]    [2]    [3]    [4]    [5]
 *         [1]    [2]    [3]    [4]    [5]    [0]
 *         [2]    [3]    [4]    [5]    [0]    [1]
 *         [3]    [4]    [5]    [0]    [1]    [2]
 *   Step 2: send message to (rank - 2^2), receive message from (rank + 2^2);
 *           the message size is "all remaining blocks"
 *    #     0      1      2      3      4      5
 *         [0]    [1]    [2]    [3]    [4]    [5]
 *         [1]    [2]    [3]    [4]    [5]    [0]
 *         [2]    [3]    [4]    [5]    [0]    [1]
 *         [3]    [4]    [5]    [0]    [1]    [2]
 *         [4]    [5]    [0]    [1]    [2]    [3]
 *         [5]    [0]    [1]    [2]    [3]    [4]
 *   Finalization: Do a local shift to get the data into the correct place
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */
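/*
 * Minimal calling sketch (illustrative only, not part of the implementation):
 * the coll framework normally selects and invokes this routine behind
 * MPI_Allgather, but it can also be called directly given an already set-up
 * communicator and its base module, e.g.
 *
 *     int block[4];
 *     int *gathered = malloc(4 * sizeof(int) * ompi_comm_size(comm));
 *     err = ompi_coll_base_allgather_intra_bruck(block, 4, MPI_INT,
 *                                                gathered, 4, MPI_INT,
 *                                                comm, module);
 */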
int ompi_coll_base_allgather_intra_bruck(const void *sbuf, int scount,
                                         struct ompi_datatype_t *sdtype,
                                         void* rbuf, int rcount,
                                         struct ompi_datatype_t *rdtype,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module)
{
    int line = -1, rank, size, sendto, recvfrom, distance, blockcount, err = 0;
    ptrdiff_t rlb, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:allgather_intra_bruck rank %d", rank));

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy the send buffer to block 0 of
         the receive buffer, else
       - if rank r != 0, copy the r-th block from the receive buffer to block 0.
    */
    tmprecv = (char*) rbuf;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    } else if (0 != rank) {  /* non-root with MPI_IN_PLACE */
        tmpsend = ((char*)rbuf) + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
        err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmprecv, tmpsend);
        if (err < 0) { line = __LINE__; goto err_hndl; }
    }

    /* Communication step:
       At every step i, rank r:
       - doubles the distance,
       - sends a message that starts at the beginning of rbuf and has size
         (blockcount * rcount) to rank (r - distance),
       - receives a message of size (blockcount * rcount) from rank (r + distance)
         at location (rbuf + distance * rcount * rext),
       - blockcount doubles until the last step, when only the remaining data is
         exchanged.
    */
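    /*
     * Worked trace of the loop below for the 6-node example above, seen from
     * rank 0 (illustrative only):
     *   distance 1: sendto = 5, recvfrom = 1, blockcount = 1
     *   distance 2: sendto = 4, recvfrom = 2, blockcount = 2
     *   distance 4: sendto = 2, recvfrom = 4, blockcount = 2 (remaining blocks)
     * so rank 0 receives 1 + 2 + 2 = 5 = size - 1 blocks in ceil(log2(6)) = 3 steps.
     */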
    blockcount = 1;
    tmpsend = (char*) rbuf;
    for (distance = 1; distance < size; distance<<=1) {

        recvfrom = (rank + distance) % size;
        sendto = (rank - distance + size) % size;

        tmprecv = tmpsend + (ptrdiff_t)distance * (ptrdiff_t)rcount * rext;

        if (distance <= (size >> 1)) {
            blockcount = distance;
        } else {
            blockcount = size - distance;
        }

        /* Sendreceive */
        err = ompi_coll_base_sendrecv(tmpsend, blockcount * rcount, rdtype,
                                      sendto, MCA_COLL_BASE_TAG_ALLGATHER,
                                      tmprecv, blockcount * rcount, rdtype,
                                      recvfrom, MCA_COLL_BASE_TAG_ALLGATHER,
                                      comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    }

    /* Finalization step:
       On all nodes except 0, data needs to be shifted locally:
       - create a temporary shift buffer,
         see the discussion in coll_basic_reduce.c about the size and beginning
         of the temporary buffer,
       - copy blocks [0 .. (size - rank - 1)] from rbuf to the shift buffer,
       - move blocks [(size - rank) .. (size - 1)] from rbuf to the beginning of rbuf,
       - copy the blocks from the shift buffer back to rbuf, starting at block [rank].
    */
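    /*
     * Worked example of the shift for size = 6, rank = 2, where rbuf currently
     * holds [2][3][4][5][0][1] (illustrative only):
     *   1. copy blocks 0..3 (the data of ranks 2..5) into the shift buffer,
     *   2. move blocks 4..5 (the data of ranks 0..1) to the beginning of rbuf,
     *   3. copy the shift buffer back starting at block 2,
     * leaving rbuf = [0][1][2][3][4][5].
     */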
    if (0 != rank) {
        char *free_buf = NULL, *shift_buf = NULL;
        ptrdiff_t span, gap = 0;

        span = opal_datatype_span(&rdtype->super, (int64_t)(size - rank) * rcount, &gap);

        free_buf = (char*)calloc(span, sizeof(char));
        if (NULL == free_buf) {
            line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl;
        }
        shift_buf = free_buf - gap;

        /* 1. copy blocks [0 .. (size - rank - 1)] from rbuf to the shift buffer */
        err = ompi_datatype_copy_content_same_ddt(rdtype, ((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount),
                                                  shift_buf, rbuf);
        if (err < 0) { line = __LINE__; goto err_hndl; }

        /* 2. move blocks [(size - rank) .. (size - 1)] from rbuf to the beginning of rbuf */
        tmpsend = (char*) rbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount * rext;
        err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rank * (ptrdiff_t)rcount,
                                                  rbuf, tmpsend);
        if (err < 0) { line = __LINE__; goto err_hndl; }

        /* 3. copy the blocks from the shift buffer back to rbuf, starting at block [rank] */
        tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
        err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount,
                                                  tmprecv, shift_buf);
        if (err < 0) { line = __LINE__; goto err_hndl; }

        free(free_buf);
    }

    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line; // silence compiler warning
    return err;
}

/*
 * ompi_coll_base_allgather_intra_recursivedoubling
 *
 * Function:     allgather using O(log(N)) steps.
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Recursive doubling algorithm for the MPI_Allgather implementation.
 *               This algorithm is used in MPICH-2 for small and medium-sized
 *               messages on a power-of-two number of processes.
 *
 * Limitation:   The current implementation only works on a power-of-two number
 *               of processes.
 *               If this algorithm is invoked on a non-power-of-two number of
 *               processes, the Bruck algorithm is invoked instead.
 *
 * Memory requirements:
 *               No additional memory requirements beyond the user-supplied buffers.
 *
 * Example on 4 nodes:
 *   Initialization: everyone has its own buffer at location rank in rbuf
 *    #     0      1      2      3
 *         [0]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]
 *         [ ]    [ ]    [ ]    [3]
 *   Step 0: exchange data with (rank ^ 2^0)
 *    #     0      1      2      3
 *         [0]    [0]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]
 *         [ ]    [ ]    [3]    [3]
 *   Step 1: exchange data with (rank ^ 2^1) (if you can)
 *    #     0      1      2      3
 *         [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]
 *
 * TODO: Modify the algorithm so it works with any number of nodes.
 *       We could use the same approach as MPICH-2:
 *       - using a recursive-halving algorithm, determine at the end of each
 *         step whether there are nodes that did not exchange their data in
 *         that step, and send them the appropriate messages.
 */
int
ompi_coll_base_allgather_intra_recursivedoubling(const void *sbuf, int scount,
                                                 struct ompi_datatype_t *sdtype,
                                                 void* rbuf, int rcount,
                                                 struct ompi_datatype_t *rdtype,
                                                 struct ompi_communicator_t *comm,
                                                 mca_coll_base_module_t *module)
{
    int line = -1, rank, size, pow2size, err;
    int remote, distance, sendblocklocation;
    ptrdiff_t rlb, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    pow2size = opal_next_poweroftwo (size);
    pow2size >>=1;

    /* The current implementation only handles a power-of-two number of processes.
       If the function was called on a non-power-of-two number of processes,
       print a warning and call the Bruck allgather algorithm with the same
       parameters.
    */
    if (pow2size != size) {
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "coll:base:allgather_intra_recursivedoubling WARNING: non-pow-2 size %d, switching to bruck algorithm",
                     size));

        return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
                                                    rbuf, rcount, rdtype,
                                                    comm, module);
    }

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:allgather_intra_recursivedoubling rank %d, size %d",
                 rank, size));

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy the send buffer to
         block rank of the receive buffer.
    */
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    }

    /* Communication step:
       At every step i, rank r:
       - exchanges a message with rank remote = (r ^ 2^i).
    */
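    /*
     * Worked trace for size = 8, rank = 5 (illustrative only):
     *   distance 1: remote = 4, send 1 block at offset 5, receive 1 block at offset 4
     *   distance 2: remote = 7, send 2 blocks at offset 4, receive 2 blocks at offset 6
     *   distance 4: remote = 1, send 4 blocks at offset 4, receive 4 blocks at offset 0
     * After log2(8) = 3 steps all blocks 0..7 are in place; sendblocklocation
     * tracks the lowest block this rank currently holds (5 -> 4 -> 4 -> 0).
     */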
    sendblocklocation = rank;
    for (distance = 0x1; distance < size; distance<<=1) {
        remote = rank ^ distance;

        if (rank < remote) {
            tmpsend = (char*)rbuf + (ptrdiff_t)sendblocklocation * (ptrdiff_t)rcount * rext;
            tmprecv = (char*)rbuf + (ptrdiff_t)(sendblocklocation + distance) * (ptrdiff_t)rcount * rext;
        } else {
            tmpsend = (char*)rbuf + (ptrdiff_t)sendblocklocation * (ptrdiff_t)rcount * rext;
            tmprecv = (char*)rbuf + (ptrdiff_t)(sendblocklocation - distance) * (ptrdiff_t)rcount * rext;
            sendblocklocation -= distance;
        }

        /* Sendreceive */
        err = ompi_coll_base_sendrecv(tmpsend, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype,
                                      remote, MCA_COLL_BASE_TAG_ALLGATHER,
                                      tmprecv, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype,
                                      remote, MCA_COLL_BASE_TAG_ALLGATHER,
                                      comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    }

    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line; // silence compiler warning
    return err;
}


/*
 * ompi_coll_base_allgather_intra_ring
 *
 * Function:     allgather using O(N) steps.
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Ring algorithm for allgather.
 *               At every step i, rank r receives a message from rank (r - 1)
 *               containing data from rank (r - i - 1) and sends a message to
 *               rank (r + 1) containing data from rank (r - i), with wraparound.
 * Memory requirements:
 *               No additional memory requirements.
 *
 */
int ompi_coll_base_allgather_intra_ring(const void *sbuf, int scount,
                                        struct ompi_datatype_t *sdtype,
                                        void* rbuf, int rcount,
                                        struct ompi_datatype_t *rdtype,
                                        struct ompi_communicator_t *comm,
                                        mca_coll_base_module_t *module)
{
    int line = -1, rank, size, err, sendto, recvfrom, i, recvdatafrom, senddatafrom;
    ptrdiff_t rlb, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:allgather_intra_ring rank %d", rank));

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy the send buffer to the
         appropriate block of the receive buffer.
    */
    tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* Communication step:
       At every step i = 0 .. (P - 2), rank r:
       - receives a message from rank [(r - 1 + size) % size] containing data
         from rank [(r - i - 1 + size) % size],
       - sends a message to rank [(r + 1) % size] containing data from rank
         [(r - i + size) % size].
    */
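    /*
     * Worked trace for size = 6, rank = 2 (illustrative only): sendto = 3,
     * recvfrom = 1, and over the 5 steps the (send, receive) block indices are
     * (2,1), (1,0), (0,5), (5,4), (4,3), i.e. each step forwards the block
     * that arrived in the previous step.
     */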
    sendto = (rank + 1) % size;
    recvfrom = (rank - 1 + size) % size;

    for (i = 0; i < size - 1; i++) {
        recvdatafrom = (rank - i - 1 + size) % size;
        senddatafrom = (rank - i + size) % size;

        tmprecv = (char*)rbuf + (ptrdiff_t)recvdatafrom * (ptrdiff_t)rcount * rext;
        tmpsend = (char*)rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)rcount * rext;

        /* Sendreceive */
        err = ompi_coll_base_sendrecv(tmpsend, rcount, rdtype, sendto,
                                      MCA_COLL_BASE_TAG_ALLGATHER,
                                      tmprecv, rcount, rdtype, recvfrom,
                                      MCA_COLL_BASE_TAG_ALLGATHER,
                                      comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    }

    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line; // silence compiler warning
    return err;
}

/*
 * ompi_coll_base_allgather_intra_neighborexchange
 *
 * Function:     allgather using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *               Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region
 *               (HPCASIA'05), 2005.
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only on an even number of processes.
 *               For an odd number of processes we switch to the ring algorithm.
 *
 * Example on 6 nodes:
 *  Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */
int
ompi_coll_base_allgather_intra_neighborexchange(const void *sbuf, int scount,
                                                struct ompi_datatype_t *sdtype,
                                                void* rbuf, int rcount,
                                                struct ompi_datatype_t *rdtype,
                                                struct ompi_communicator_t *comm,
                                                mca_coll_base_module_t *module)
{
    int line = -1, rank, size, i, even_rank, err;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
    ptrdiff_t rlb, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    if (size % 2) {
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "coll:base:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
                     size));
        return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
                                                   rbuf, rcount, rdtype,
                                                   comm, module);
    }

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:allgather_intra_neighborexchange rank %d", rank));

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy the send buffer to the
         appropriate block of the receive buffer.
    */
    tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* Determine neighbors, the order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }

    /* Communication loop:
       - The first step is special: exchange a single block with neighbor[0].
       - Remaining steps:
         update recv_data_from according to the offset, and
         exchange two blocks with the appropriate neighbor.
         The send location becomes the previous receive location.
    */
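    /*
     * Worked trace for size = 6, rank = 1 (an odd rank; illustrative only):
     *   step 0: exchange block 1 for block 0 with neighbor[0] = 0
     *   step 1: send blocks 0-1 to / receive blocks 2-3 from neighbor[1] = 2
     *   step 2: send blocks 2-3 to / receive blocks 4-5 from neighbor[0] = 0
     * which matches column 1 of the step diagrams in the header comment above.
     */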
    tmprecv = (char*)rbuf + (ptrdiff_t)neighbor[0] * (ptrdiff_t)rcount * rext;
    tmpsend = (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
    /* Sendreceive */
    err = ompi_coll_base_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
                                  MCA_COLL_BASE_TAG_ALLGATHER,
                                  tmprecv, rcount, rdtype, neighbor[0],
                                  MCA_COLL_BASE_TAG_ALLGATHER,
                                  comm, MPI_STATUS_IGNORE, rank);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Determine the initial sending location */
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }

    for (i = 1; i < (size / 2); i++) {
        const int i_parity = i % 2;
        recv_data_from[i_parity] =
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

        tmprecv = (char*)rbuf + (ptrdiff_t)recv_data_from[i_parity] * (ptrdiff_t)rcount * rext;
        tmpsend = (char*)rbuf + (ptrdiff_t)send_data_from * (ptrdiff_t)rcount * rext;

        /* Sendreceive */
        err = ompi_coll_base_sendrecv(tmpsend, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype,
                                      neighbor[i_parity],
                                      MCA_COLL_BASE_TAG_ALLGATHER,
                                      tmprecv, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype,
                                      neighbor[i_parity],
                                      MCA_COLL_BASE_TAG_ALLGATHER,
                                      comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        send_data_from = recv_data_from[i_parity];
    }

    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line; // silence compiler warning
    return err;
}

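/*
 * ompi_coll_base_allgather_intra_two_procs
 *
 * Function:     allgather for exactly two processes (a single exchange).
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS, MPI_ERR_UNSUPPORTED_OPERATION if the communicator
 *               size is not 2, or another error code.
 *
 * Description:  Each rank exchanges its block with the other rank
 *               (remote = rank ^ 1) and then copies its own block into place,
 *               unless MPI_IN_PLACE was used and the block is already there.
 */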
int ompi_coll_base_allgather_intra_two_procs(const void *sbuf, int scount,
                                             struct ompi_datatype_t *sdtype,
                                             void* rbuf, int rcount,
                                             struct ompi_datatype_t *rdtype,
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    int line = -1, err, rank, remote;
    char *tmpsend = NULL, *tmprecv = NULL;
    ptrdiff_t rext, lb;

    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_allgather_intra_two_procs rank %d", rank));

    if (2 != ompi_comm_size(comm)) {
        return MPI_ERR_UNSUPPORTED_OPERATION;
    }

    err = ompi_datatype_get_extent (rdtype, &lb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Exchange data:
       - compute the source and destination,
       - send and receive the data.
    */
    remote = rank ^ 0x1;

    tmpsend = (char*)sbuf;
    if (MPI_IN_PLACE == sbuf) {
        tmpsend = (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
        scount = rcount;
        sdtype = rdtype;
    }
    tmprecv = (char*)rbuf + (ptrdiff_t)remote * (ptrdiff_t)rcount * rext;

    err = ompi_coll_base_sendrecv(tmpsend, scount, sdtype, remote,
                                  MCA_COLL_BASE_TAG_ALLGATHER,
                                  tmprecv, rcount, rdtype, remote,
                                  MCA_COLL_BASE_TAG_ALLGATHER,
                                  comm, MPI_STATUS_IGNORE, rank);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Place your data in the correct location if necessary */
    if (MPI_IN_PLACE != sbuf) {
        err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
                                   (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext, rcount, rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line; // silence compiler warning
    return err;
}


/*
 * Linear functions are copied from the BASIC coll module. They do not
 * segment the message and are simple implementations, but for some small
 * numbers of nodes and/or small data sizes they are just as fast as the
 * base/tree-based segmenting operations, and as such may be selected by the
 * decision functions. These are copied into this module due to the way we
 * select modules in V1; i.e. in V2 we will handle this differently and so
 * will not have to duplicate code.
 * JPG following the examples from other coll_base implementations. Dec06.
 */

/* copied function (with appropriate renaming) starts here */

/*
 * allgather_intra_basic_linear
 *
 * Function:     - allgather using other MPI collectives
 * Accepts:      - same as MPI_Allgather()
 * Returns:      - MPI_SUCCESS or error code
 */
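/*
 * Conceptually (a sketch at the MPI level, illustrative only) this routine
 * is equivalent to:
 *
 *     MPI_Gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, 0, comm);
 *     MPI_Bcast(rbuf, rcount * size, rdtype, 0, comm);
 *
 * except that it goes through the communicator's selected gather/bcast
 * modules and guards the broadcast count against int overflow.
 */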
int
ompi_coll_base_allgather_intra_basic_linear(const void *sbuf, int scount,
                                            struct ompi_datatype_t *sdtype,
                                            void *rbuf,
                                            int rcount,
                                            struct ompi_datatype_t *rdtype,
                                            struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module)
{
    int err;
    ptrdiff_t lb, extent;

    /* Handle MPI_IN_PLACE (see the explanation in reduce.c for how to
       allocate the temp buffer) -- note that rank 0 can use IN_PLACE
       natively, and we can just alias the right position in rbuf
       as sbuf and avoid using a temporary buffer if gather is
       implemented correctly */
    if (MPI_IN_PLACE == sbuf && 0 != ompi_comm_rank(comm)) {
        ompi_datatype_get_extent(rdtype, &lb, &extent);
        sbuf = ((char*) rbuf) + (ompi_comm_rank(comm) * extent * rcount);
        sdtype = rdtype;
        scount = rcount;
    }

    /* Gather and broadcast. */

    err = comm->c_coll->coll_gather(sbuf, scount, sdtype,
                                    rbuf, rcount, rdtype,
                                    0, comm, comm->c_coll->coll_gather_module);
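    /* If the total broadcast element count (rcount * size) does not fit in an
       int, broadcast rcount elements of a contiguous datatype spanning size
       blocks instead. For example (illustrative numbers only), rcount = 1 << 20
       MPI_INT elements on 4096 ranks gives 1 << 32 total elements, which
       exceeds INT_MAX. */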
    if (MPI_SUCCESS == err) {
        size_t length = (ptrdiff_t)rcount * ompi_comm_size(comm);
        if( length < (size_t)INT_MAX ) {
            err = comm->c_coll->coll_bcast(rbuf, (ptrdiff_t)rcount * ompi_comm_size(comm), rdtype,
                                           0, comm, comm->c_coll->coll_bcast_module);
        } else {
            ompi_datatype_t* temptype;
            ompi_datatype_create_contiguous(ompi_comm_size(comm), rdtype, &temptype);
            ompi_datatype_commit(&temptype);
            err = comm->c_coll->coll_bcast(rbuf, rcount, temptype,
                                           0, comm, comm->c_coll->coll_bcast_module);
            ompi_datatype_destroy(&temptype);
        }
    }

    /* All done */

    return err;
}

/* copied function (with appropriate renaming) ends here */