ompi/mca/coll/tuned/coll_tuned_decision

/* [<][>][^][v][top][bottom][index][help] */
This source file includes following definitions.
ompi_coll_tuned_allreduce_intra_dec_fixed
ompi_coll_tuned_alltoall_intra_dec_fixed
ompi_coll_tuned_alltoallv_intra_dec_fixed
ompi_coll_tuned_barrier_intra_dec_fixed
ompi_coll_tuned_bcast_intra_dec_fixed
ompi_coll_tuned_reduce_intra_dec_fixed
ompi_coll_tuned_reduce_scatter_intra_dec_fixed
ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed
ompi_coll_tuned_allgather_intra_dec_fixed
ompi_coll_tuned_allgatherv_intra_dec_fixed
ompi_coll_tuned_gather_intra_dec_fixed
ompi_coll_tuned_scatter_intra_dec_fixed
   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2015 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
  14  * Copyright (c) 2013      Los Alamos National Security, LLC. All rights
  15  *                         reserved.
  16  * Copyright (c) 2015-2018 Research Organization for Information Science
  17  *                         and Technology (RIST). All rights reserved.
  18  * $COPYRIGHT$
  19  *
  20  * Additional copyrights may follow
  21  *
  22  * $HEADER$
  23  */
  24 
  25 #include "ompi_config.h"
  26 
  27 #include "mpi.h"
  28 #include "opal/util/bit_ops.h"
  29 #include "ompi/datatype/ompi_datatype.h"
  30 #include "ompi/communicator/communicator.h"
  31 #include "ompi/mca/coll/coll.h"
  32 #include "ompi/mca/coll/base/coll_tags.h"
  33 #include "ompi/op/op.h"
  34 #include "coll_tuned.h"
  35 
  36 /*
  37  *  allreduce_intra
  38  *
  39  *  Function:   - allreduce using other MPI collectives
  40  *  Accepts:    - same as MPI_Allreduce()
  41  *  Returns:    - MPI_SUCCESS or error code
  42  */
  43 int
  44 ompi_coll_tuned_allreduce_intra_dec_fixed(const void *sbuf, void *rbuf, int count,
  45                                           struct ompi_datatype_t *dtype,
  46                                           struct ompi_op_t *op,
  47                                           struct ompi_communicator_t *comm,
  48                                           mca_coll_base_module_t *module)
  49 {
  50     size_t dsize, block_dsize;
  51     int comm_size = ompi_comm_size(comm);
  52     const size_t intermediate_message = 10000;
  53     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allreduce_intra_dec_fixed"));
  54 
  55     /**
  56      * Decision function based on MX results from the Grig cluster at UTK.
  57      *
  58      * Currently, linear, recursive doubling, and nonoverlapping algorithms
  59      * can handle both commutative and non-commutative operations.
  60      * Ring algorithm does not support non-commutative operations.
  61      */
  62     ompi_datatype_type_size(dtype, &dsize);
  63     block_dsize = dsize * (ptrdiff_t)count;
  64 
  65     if (block_dsize < intermediate_message) {
  66         return (ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf,
  67                                                                  count, dtype,
  68                                                                  op, comm, module));
  69     }
  70 
  71     if( ompi_op_is_commute(op) && (count > comm_size) ) {
  72         const size_t segment_size = 1 << 20; /* 1 MB */
  73         if (((size_t)comm_size * (size_t)segment_size >= block_dsize)) {
  74             return (ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype,
  75                                                         op, comm, module));
  76         } else {
  77             return (ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf,
  78                                                                   count, dtype,
  79                                                                   op, comm, module,
  80                                                                   segment_size));
  81         }
  82     }
  83 
  84     return (ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count,
  85                                                           dtype, op, comm, module));
  86 }
  87 
  88 /*
  89  *      alltoall_intra_dec
  90  *
  91  *      Function:       - seletects alltoall algorithm to use
  92  *      Accepts:        - same arguments as MPI_Alltoall()
  93  *      Returns:        - MPI_SUCCESS or error code
  94  */
  95 
  96 int ompi_coll_tuned_alltoall_intra_dec_fixed(const void *sbuf, int scount,
  97                                              struct ompi_datatype_t *sdtype,
  98                                              void* rbuf, int rcount,
  99                                              struct ompi_datatype_t *rdtype,
 100                                              struct ompi_communicator_t *comm,
 101                                              mca_coll_base_module_t *module)
 102 {
 103     int communicator_size;
 104     size_t dsize, block_dsize;
 105 #if 0
 106     size_t total_dsize;
 107 #endif
 108 
 109     communicator_size = ompi_comm_size(comm);
 110 
 111     /* special case */
 112     if (communicator_size==2) {
 113         return ompi_coll_base_alltoall_intra_two_procs(sbuf, scount, sdtype,
 114                                                        rbuf, rcount, rdtype,
 115                                                        comm, module);
 116     }
 117 
 118     /* Decision function based on measurement on Grig cluster at
 119        the University of Tennessee (2GB MX) up to 64 nodes.
 120        Has better performance for messages of intermediate sizes than the old one */
 121     /* determine block size */
 122     if (MPI_IN_PLACE != sbuf) {
 123         ompi_datatype_type_size(sdtype, &dsize);
 124     } else {
 125         ompi_datatype_type_size(rdtype, &dsize);
 126     }
 127     block_dsize = dsize * (ptrdiff_t)scount;
 128 
 129     if ((block_dsize < (size_t) ompi_coll_tuned_alltoall_small_msg)
 130                                               && (communicator_size > 12)) {
 131         return ompi_coll_base_alltoall_intra_bruck(sbuf, scount, sdtype,
 132                                                    rbuf, rcount, rdtype,
 133                                                    comm, module);
 134 
 135     } else if (block_dsize < (size_t) ompi_coll_tuned_alltoall_intermediate_msg) {
 136         return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype,
 137                                                           rbuf, rcount, rdtype,
 138                                                           comm, module);
 139     }
 140 
 141     return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype,
 142                                                   rbuf, rcount, rdtype,
 143                                                   comm, module);
 144 
 145 #if 0
 146     /* previous decision */
 147 
 148     /* else we need data size for decision function */
 149     ompi_datatype_type_size(sdtype, &dsize);
 150     total_dsize = dsize * scount * communicator_size;   /* needed for decision */
 151 
 152     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_fixed rank %d com_size %d msg_length %ld",
 153                  ompi_comm_rank(comm), communicator_size, total_dsize));
 154 
 155     if (communicator_size >= 12 && total_dsize <= 768) {
 156         return ompi_coll_base_alltoall_intra_bruck(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
 157     }
 158     if (total_dsize <= 131072) {
 159         return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
 160     }
 161     return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
 162 #endif
 163 }
 164 
 165 /*
 166  *      Function:       - selects alltoallv algorithm to use
 167  *      Accepts:        - same arguments as MPI_Alltoallv()
 168  *      Returns:        - MPI_SUCCESS or error code
 169  */
 170 int ompi_coll_tuned_alltoallv_intra_dec_fixed(const void *sbuf, const int *scounts, const int *sdisps,
 171                                               struct ompi_datatype_t *sdtype,
 172                                               void *rbuf, const int *rcounts, const int *rdisps,
 173                                               struct ompi_datatype_t *rdtype,
 174                                               struct ompi_communicator_t *comm,
 175                                               mca_coll_base_module_t *module)
 176 {
 177     /* For starters, just keep the original algorithm. */
 178     return ompi_coll_base_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
 179                                                    rbuf, rcounts, rdisps,rdtype,
 180                                                    comm, module);
 181 }
 182 
 183 
 184 /*
 185  *      barrier_intra_dec
 186  *
 187  *      Function:       - seletects barrier algorithm to use
 188  *      Accepts:        - same arguments as MPI_Barrier()
 189  *      Returns:        - MPI_SUCCESS or error code (passed from the barrier implementation)
 190  */
 191 int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm,
 192                                             mca_coll_base_module_t *module)
 193 {
 194     int communicator_size = ompi_comm_size(comm);
 195 
 196     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_barrier_intra_dec_fixed com_size %d",
 197                  communicator_size));
 198 
 199     if( 2 == communicator_size )
 200         return ompi_coll_base_barrier_intra_two_procs(comm, module);
 201     /**
 202      * Basic optimisation. If we have a power of 2 number of nodes
 203      * the use the recursive doubling algorithm, otherwise
 204      * bruck is the one we want.
 205      */
 206     {
 207         bool has_one = false;
 208         for( ; communicator_size > 0; communicator_size >>= 1 ) {
 209             if( communicator_size & 0x1 ) {
 210                 if( has_one )
 211                     return ompi_coll_base_barrier_intra_bruck(comm, module);
 212                 has_one = true;
 213             }
 214         }
 215     }
 216     return ompi_coll_base_barrier_intra_recursivedoubling(comm, module);
 217 }
 218 
 219 
 220 /*
 221  *      bcast_intra_dec
 222  *
 223  *      Function:       - seletects broadcast algorithm to use
 224  *      Accepts:        - same arguments as MPI_Bcast()
 225  *      Returns:        - MPI_SUCCESS or error code (passed from the bcast implementation)
 226  */
 227 int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
 228                                           struct ompi_datatype_t *datatype, int root,
 229                                           struct ompi_communicator_t *comm,
 230                                           mca_coll_base_module_t *module)
 231 {
 232     /* Decision function based on MX results for
 233        messages up to 36MB and communicator sizes up to 64 nodes */
 234     const size_t small_message_size = 2048;
 235     const size_t intermediate_message_size = 370728;
 236     const double a_p16  = 3.2118e-6; /* [1 / byte] */
 237     const double b_p16  = 8.7936;
 238     const double a_p64  = 2.3679e-6; /* [1 / byte] */
 239     const double b_p64  = 1.1787;
 240     const double a_p128 = 1.6134e-6; /* [1 / byte] */
 241     const double b_p128 = 2.1102;
 242 
 243     int communicator_size;
 244     int segsize = 0;
 245     size_t message_size, dsize;
 246 
 247     communicator_size = ompi_comm_size(comm);
 248 
 249     /* else we need data size for decision function */
 250     ompi_datatype_type_size(datatype, &dsize);
 251     message_size = dsize * (unsigned long)count;   /* needed for decision */
 252 
 253     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_bcast_intra_dec_fixed"
 254                  " root %d rank %d com_size %d msg_length %lu",
 255                  root, ompi_comm_rank(comm), communicator_size, (unsigned long)message_size));
 256 
 257     /* Handle messages of small and intermediate size, and
 258        single-element broadcasts */
 259     if ((message_size < small_message_size) || (count <= 1)) {
 260         /* Binomial without segmentation */
 261         segsize = 0;
 262         return  ompi_coll_base_bcast_intra_binomial(buff, count, datatype,
 263                                                     root, comm, module,
 264                                                     segsize);
 265 
 266     } else if (message_size < intermediate_message_size) {
 267         /* SplittedBinary with 1KB segments */
 268         segsize = 1024;
 269         return ompi_coll_base_bcast_intra_split_bintree(buff, count, datatype,
 270                                                         root, comm, module,
 271                                                         segsize);
 272 
 273     }
 274     /* Handle large message sizes */
 275     else if (communicator_size < (a_p128 * message_size + b_p128)) {
 276         /* Pipeline with 128KB segments */
 277         segsize = 1024  << 7;
 278         return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
 279                                                    root, comm, module,
 280                                                    segsize);
 281 
 282     } else if (communicator_size < 13) {
 283         /* Split Binary with 8KB segments */
 284         segsize = 1024 << 3;
 285         return ompi_coll_base_bcast_intra_split_bintree(buff, count, datatype,
 286                                                         root, comm, module,
 287                                                         segsize);
 288 
 289     } else if (communicator_size < (a_p64 * message_size + b_p64)) {
 290         /* Pipeline with 64KB segments */
 291         segsize = 1024 << 6;
 292         return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
 293                                                    root, comm, module,
 294                                                    segsize);
 295 
 296     } else if (communicator_size < (a_p16 * message_size + b_p16)) {
 297         /* Pipeline with 16KB segments */
 298         segsize = 1024 << 4;
 299         return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
 300                                                    root, comm, module,
 301                                                    segsize);
 302 
 303     }
 304 
 305     /* Pipeline with 8KB segments */
 306     segsize = 1024 << 3;
 307     return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
 308                                                root, comm, module,
 309                                                segsize);
 310 #if 0
 311     /* this is based on gige measurements */
 312 
 313     if (communicator_size  < 4) {
 314         return ompi_coll_base_bcast_intra_basic_linear(buff, count, datatype, root, comm, module);
 315     }
 316     if (communicator_size == 4) {
 317         if (message_size < 524288) segsize = 0;
 318         else segsize = 16384;
 319         return ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
 320     }
 321     if (communicator_size <= 8 && message_size < 4096) {
 322         return ompi_coll_base_bcast_intra_basic_linear(buff, count, datatype, root, comm, module);
 323     }
 324     if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
 325         segsize = 16384;
 326         return  ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
 327     }
 328     if (message_size >= 524288) {
 329         segsize = 16384;
 330         return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype, root, comm, module, segsize);
 331     }
 332     segsize = 0;
 333     /* once tested can swap this back in */
 334     /* return ompi_coll_base_bcast_intra_bmtree(buff, count, datatype, root, comm, segsize); */
 335     return ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
 336 #endif  /* 0 */
 337 }
 338 
 339 /*
 340  *      reduce_intra_dec
 341  *
 342  *      Function:       - seletects reduce algorithm to use
 343  *      Accepts:        - same arguments as MPI_reduce()
 344  *      Returns:        - MPI_SUCCESS or error code (passed from the reduce implementation)
 345  *
 346  */
 347 int ompi_coll_tuned_reduce_intra_dec_fixed( const void *sendbuf, void *recvbuf,
 348                                             int count, struct ompi_datatype_t* datatype,
 349                                             struct ompi_op_t* op, int root,
 350                                             struct ompi_communicator_t* comm,
 351                                             mca_coll_base_module_t *module)
 352 {
 353     int communicator_size, segsize = 0;
 354     size_t message_size, dsize;
 355     const double a1 =  0.6016 / 1024.0; /* [1/B] */
 356     const double b1 =  1.3496;
 357     const double a2 =  0.0410 / 1024.0; /* [1/B] */
 358     const double b2 =  9.7128;
 359     const double a3 =  0.0422 / 1024.0; /* [1/B] */
 360     const double b3 =  1.1614;
 361     const double a4 =  0.0033 / 1024.0; /* [1/B] */
 362     const double b4 =  1.6761;
 363 
 364     const int max_requests = 0; /* no limit on # of outstanding requests */
 365 
 366     communicator_size = ompi_comm_size(comm);
 367 
 368     /* need data size for decision function */
 369     ompi_datatype_type_size(datatype, &dsize);
 370     message_size = dsize * (ptrdiff_t)count;   /* needed for decision */
 371 
 372     /**
 373      * If the operation is non commutative we currently have choice of linear
 374      * or in-order binary tree algorithm.
 375      */
 376     if( !ompi_op_is_commute(op) ) {
 377         if ((communicator_size < 12) && (message_size < 2048)) {
 378             return ompi_coll_base_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
 379         }
 380         return ompi_coll_base_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
 381                                                              0, max_requests);
 382     }
 383 
 384     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed "
 385                  "root %d rank %d com_size %d msg_length %lu",
 386                  root, ompi_comm_rank(comm), communicator_size, (unsigned long)message_size));
 387 
 388     if ((communicator_size < 8) && (message_size < 512)){
 389         /* Linear_0K */
 390         return ompi_coll_base_reduce_intra_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm, module);
 391     } else if (((communicator_size < 8) && (message_size < 20480)) ||
 392                (message_size < 2048) || (count <= 1)) {
 393         /* Binomial_0K */
 394         segsize = 0;
 395         return ompi_coll_base_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
 396                                                      segsize, max_requests);
 397     } else if (communicator_size > (a1 * message_size + b1)) {
 398         /* Binomial_1K */
 399         segsize = 1024;
 400         return ompi_coll_base_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
 401                                                      segsize, max_requests);
 402     } else if (communicator_size > (a2 * message_size + b2)) {
 403         /* Pipeline_1K */
 404         segsize = 1024;
 405         return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
 406                                                     segsize, max_requests);
 407     } else if (communicator_size > (a3 * message_size + b3)) {
 408         /* Binary_32K */
 409         segsize = 32*1024;
 410         return ompi_coll_base_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
 411                                                     comm, module, segsize, max_requests);
 412     }
 413     if (communicator_size > (a4 * message_size + b4)) {
 414         /* Pipeline_32K */
 415         segsize = 32*1024;
 416     } else {
 417         /* Pipeline_64K */
 418         segsize = 64*1024;
 419     }
 420     return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
 421                                                 segsize, max_requests);
 422 
 423 #if 0
 424     /* for small messages use linear algorithm */
 425     if (message_size <= 4096) {
 426         segsize = 0;
 427         fanout = communicator_size - 1;
 428         /* when linear implemented or taken from basic put here, right now using chain as a linear system */
 429         /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
 430         return ompi_coll_base_reduce_intra_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm, module);
 431     }
 432     if (message_size < 524288) {
 433         if (message_size <= 65536 ) {
 434             segsize = 32768;
 435             fanout = 8;
 436         } else {
 437             segsize = 1024;
 438             fanout = communicator_size/2;
 439         }
 440         /* later swap this for a binary tree */
 441         /*         fanout = 2; */
 442         return ompi_coll_base_reduce_intra_chain(sendbuf, recvbuf, count, datatype, op, root, comm, module,
 443                                                  segsize, fanout, max_requests);
 444     }
 445     segsize = 1024;
 446     return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
 447                                                 segsize, max_requests);
 448 #endif  /* 0 */
 449 }
 450 
 451 /*
 452  *      reduce_scatter_intra_dec
 453  *
 454  *      Function:       - seletects reduce_scatter algorithm to use
 455  *      Accepts:        - same arguments as MPI_Reduce_scatter()
 456  *      Returns:        - MPI_SUCCESS or error code (passed from
 457  *                        the reduce scatter implementation)
 458  */
 459 int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( const void *sbuf, void *rbuf,
 460                                                     const int *rcounts,
 461                                                     struct ompi_datatype_t *dtype,
 462                                                     struct ompi_op_t *op,
 463                                                     struct ompi_communicator_t *comm,
 464                                                     mca_coll_base_module_t *module)
 465 {
 466     int comm_size, i, pow2;
 467     size_t total_message_size, dsize;
 468     const double a = 0.0012;
 469     const double b = 8.0;
 470     const size_t small_message_size = 12 * 1024;
 471     const size_t large_message_size = 256 * 1024;
 472 
 473     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_scatter_intra_dec_fixed"));
 474 
 475     comm_size = ompi_comm_size(comm);
 476     /* We need data size for decision function */
 477     ompi_datatype_type_size(dtype, &dsize);
 478     total_message_size = 0;
 479     for (i = 0; i < comm_size; i++) {
 480         total_message_size += rcounts[i];
 481     }
 482 
 483     if( !ompi_op_is_commute(op) ) {
 484         return ompi_coll_base_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
 485                                                                   dtype, op,
 486                                                                   comm, module);
 487     }
 488 
 489     total_message_size *= dsize;
 490 
 491     /* compute the nearest power of 2 */
 492     pow2 = opal_next_poweroftwo_inclusive (comm_size);
 493 
 494     if ((total_message_size <= small_message_size) ||
 495         ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
 496         (comm_size >= a * total_message_size + b)) {
 497         return
 498             ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
 499                                                                        dtype, op,
 500                                                                        comm, module);
 501     }
 502     return ompi_coll_base_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
 503                                                      dtype, op,
 504                                                      comm, module);
 505 }
 506 
 507 /*
 508  *      reduce_scatter_block_intra_dec
 509  *
 510  *      Function:       - seletects reduce_scatter_block algorithm to use
 511  *      Accepts:        - same arguments as MPI_Reduce_scatter_block()
 512  *      Returns:        - MPI_SUCCESS or error code (passed from
 513  *                        the reduce scatter implementation)
 514  */
 515 int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(const void *sbuf, void *rbuf,
 516                                                          int rcount,
 517                                                          struct ompi_datatype_t *dtype,
 518                                                          struct ompi_op_t *op,
 519                                                          struct ompi_communicator_t *comm,
 520                                                          mca_coll_base_module_t *module)
 521 {
 522     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed"));
 523     return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount,
 524                                                             dtype, op, comm, module);
 525 }
 526 
 527 /*
 528  *      allgather_intra_dec
 529  *
 530  *      Function:       - seletects allgather algorithm to use
 531  *      Accepts:        - same arguments as MPI_Allgather()
 532  *      Returns:        - MPI_SUCCESS or error code, passed from corresponding
 533  *                        internal allgather function.
 534  */
 535 
 536 int ompi_coll_tuned_allgather_intra_dec_fixed(const void *sbuf, int scount,
 537                                               struct ompi_datatype_t *sdtype,
 538                                               void* rbuf, int rcount,
 539                                               struct ompi_datatype_t *rdtype,
 540                                               struct ompi_communicator_t *comm,
 541                                               mca_coll_base_module_t *module)
 542 {
 543     int communicator_size, pow2_size;
 544     size_t dsize, total_dsize;
 545 
 546     communicator_size = ompi_comm_size(comm);
 547 
 548     /* Special case for 2 processes */
 549     if (communicator_size == 2) {
 550         return ompi_coll_base_allgather_intra_two_procs(sbuf, scount, sdtype,
 551                                                         rbuf, rcount, rdtype,
 552                                                         comm, module);
 553     }
 554 
 555     /* Determine complete data size */
 556     if (MPI_IN_PLACE != sbuf) {
 557         ompi_datatype_type_size(sdtype, &dsize);
 558     } else {
 559         ompi_datatype_type_size(rdtype, &dsize);
 560     }
 561     total_dsize = dsize * (ptrdiff_t)scount * (ptrdiff_t)communicator_size;
 562 
 563     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allgather_intra_dec_fixed"
 564                  " rank %d com_size %d msg_length %lu",
 565                  ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
 566 
 567     pow2_size = opal_next_poweroftwo_inclusive (communicator_size);
 568 
 569     /* Decision based on MX 2Gb results from Grig cluster at
 570        The University of Tennesse, Knoxville
 571        - if total message size is less than 50KB use either bruck or
 572        recursive doubling for non-power of two and power of two nodes,
 573        respectively.
 574        - else use ring and neighbor exchange algorithms for odd and even
 575        number of nodes, respectively.
 576     */
 577     if (total_dsize < 50000) {
 578         if (pow2_size == communicator_size) {
 579             return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
 580                                                                     rbuf, rcount, rdtype,
 581                                                                     comm, module);
 582         } else {
 583             return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
 584                                                         rbuf, rcount, rdtype,
 585                                                         comm, module);
 586         }
 587     } else {
 588         if (communicator_size % 2) {
 589             return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
 590                                                        rbuf, rcount, rdtype,
 591                                                        comm, module);
 592         } else {
 593             return  ompi_coll_base_allgather_intra_neighborexchange(sbuf, scount, sdtype,
 594                                                                     rbuf, rcount, rdtype,
 595                                                                     comm, module);
 596         }
 597     }
 598 
 599 #if defined(USE_MPICH2_DECISION)
 600     /* Decision as in MPICH-2
 601        presented in Thakur et.al. "Optimization of Collective Communication
 602        Operations in MPICH", International Journal of High Performance Computing
 603        Applications, Vol. 19, No. 1, 49-66 (2005)
 604        - for power-of-two processes and small and medium size messages
 605        (up to 512KB) use recursive doubling
 606        - for non-power-of-two processes and small messages (80KB) use bruck,
 607        - for everything else use ring.
 608     */
 609     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
 610         return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
 611                                                                 rbuf, rcount, rdtype,
 612                                                                 comm, module);
 613     } else if (total_dsize <= 81920) {
 614         return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
 615                                                     rbuf, rcount, rdtype,
 616                                                     comm, module);
 617     }
 618     return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
 619                                                rbuf, rcount, rdtype,
 620                                                comm, module);
 621 #endif  /* defined(USE_MPICH2_DECISION) */
 622 }
 623 
 624 /*
 625  *      allgatherv_intra_dec
 626  *
 627  *      Function:       - seletects allgatherv algorithm to use
 628  *      Accepts:        - same arguments as MPI_Allgatherv()
 629  *      Returns:        - MPI_SUCCESS or error code, passed from corresponding
 630  *                        internal allgatherv function.
 631  */
 632 
 633 int ompi_coll_tuned_allgatherv_intra_dec_fixed(const void *sbuf, int scount,
 634                                                struct ompi_datatype_t *sdtype,
 635                                                void* rbuf, const int *rcounts,
 636                                                const int *rdispls,
 637                                                struct ompi_datatype_t *rdtype,
 638                                                struct ompi_communicator_t *comm,
 639                                                mca_coll_base_module_t *module)
 640 {
 641     int i;
 642     int communicator_size;
 643     size_t dsize, total_dsize;
 644 
 645     communicator_size = ompi_comm_size(comm);
 646 
 647     /* Special case for 2 processes */
 648     if (communicator_size == 2) {
 649         return ompi_coll_base_allgatherv_intra_two_procs(sbuf, scount, sdtype,
 650                                                          rbuf, rcounts, rdispls, rdtype,
 651                                                          comm, module);
 652     }
 653 
 654     /* Determine complete data size */
 655     if (MPI_IN_PLACE != sbuf) {
 656         ompi_datatype_type_size(sdtype, &dsize);
 657     } else {
 658         ompi_datatype_type_size(rdtype, &dsize);
 659     }
 660 
 661     total_dsize = 0;
 662     for (i = 0; i < communicator_size; i++) {
 663         total_dsize += dsize * (ptrdiff_t)rcounts[i];
 664     }
 665 
 666     OPAL_OUTPUT((ompi_coll_tuned_stream,
 667                  "ompi_coll_tuned_allgatherv_intra_dec_fixed"
 668                  " rank %d com_size %d msg_length %lu",
 669                  ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
 670 
 671     /* Decision based on allgather decision.   */
 672     if (total_dsize < 50000) {
 673         return ompi_coll_base_allgatherv_intra_bruck(sbuf, scount, sdtype,
 674                                                      rbuf, rcounts, rdispls, rdtype,
 675                                                      comm, module);
 676     } else {
 677         if (communicator_size % 2) {
 678             return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype,
 679                                                         rbuf, rcounts, rdispls, rdtype,
 680                                                         comm, module);
 681         } else {
 682             return  ompi_coll_base_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
 683                                                                      rbuf, rcounts, rdispls, rdtype,
 684                                                                      comm, module);
 685         }
 686     }
 687 }
 688 
 689 /*
 690  *      gather_intra_dec
 691  *
 692  *      Function:       - seletects gather algorithm to use
 693  *      Accepts:        - same arguments as MPI_Gather()
 694  *      Returns:        - MPI_SUCCESS or error code, passed from corresponding
 695  *                        internal allgather function.
 696  */
 697 
 698 int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, int scount,
 699                                            struct ompi_datatype_t *sdtype,
 700                                            void* rbuf, int rcount,
 701                                            struct ompi_datatype_t *rdtype,
 702                                            int root,
 703                                            struct ompi_communicator_t *comm,
 704                                            mca_coll_base_module_t *module)
 705 {
 706     const int large_segment_size = 32768;
 707     const int small_segment_size = 1024;
 708 
 709     const size_t large_block_size = 92160;
 710     const size_t intermediate_block_size = 6000;
 711     const size_t small_block_size = 1024;
 712 
 713     const int large_communicator_size = 60;
 714     const int small_communicator_size = 10;
 715 
 716     int communicator_size, rank;
 717     size_t dsize, block_size;
 718 
 719     OPAL_OUTPUT((ompi_coll_tuned_stream,
 720                  "ompi_coll_tuned_gather_intra_dec_fixed"));
 721 
 722     communicator_size = ompi_comm_size(comm);
 723     rank = ompi_comm_rank(comm);
 724 
 725     /* Determine block size */
 726     if (rank == root) {
 727         ompi_datatype_type_size(rdtype, &dsize);
 728         block_size = dsize * (ptrdiff_t)rcount;
 729     } else {
 730         ompi_datatype_type_size(sdtype, &dsize);
 731         block_size = dsize * (ptrdiff_t)scount;
 732     }
 733 
 734     if (block_size > large_block_size) {
 735         return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
 736                                                        rbuf, rcount, rdtype,
 737                                                        root, comm, module,
 738                                                        large_segment_size);
 739 
 740     } else if (block_size > intermediate_block_size) {
 741         return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
 742                                                        rbuf, rcount, rdtype,
 743                                                        root, comm, module,
 744                                                        small_segment_size);
 745 
 746     } else if ((communicator_size > large_communicator_size) ||
 747                ((communicator_size > small_communicator_size) &&
 748                 (block_size < small_block_size))) {
 749         return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdtype,
 750                                                     rbuf, rcount, rdtype,
 751                                                     root, comm, module);
 752     }
 753     /* Otherwise, use basic linear */
 754     return ompi_coll_base_gather_intra_basic_linear(sbuf, scount, sdtype,
 755                                                     rbuf, rcount, rdtype,
 756                                                     root, comm, module);
 757 }
 758 
 759 /*
 760  *      scatter_intra_dec
 761  *
 762  *      Function:       - seletects scatter algorithm to use
 763  *      Accepts:        - same arguments as MPI_Scatter()
 764  *      Returns:        - MPI_SUCCESS or error code, passed from corresponding
 765  *                        internal allgather function.
 766  */
 767 
 768 int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount,
 769                                             struct ompi_datatype_t *sdtype,
 770                                             void* rbuf, int rcount,
 771                                             struct ompi_datatype_t *rdtype,
 772                                             int root, struct ompi_communicator_t *comm,
 773                                             mca_coll_base_module_t *module)
 774 {
 775     const size_t small_block_size = 300;
 776     const int small_comm_size = 10;
 777     int communicator_size, rank;
 778     size_t dsize, block_size;
 779 
 780     OPAL_OUTPUT((ompi_coll_tuned_stream,
 781                  "ompi_coll_tuned_scatter_intra_dec_fixed"));
 782 
 783     communicator_size = ompi_comm_size(comm);
 784     rank = ompi_comm_rank(comm);
 785     /* Determine block size */
 786     if (root == rank) {
 787         ompi_datatype_type_size(sdtype, &dsize);
 788         block_size = dsize * (ptrdiff_t)scount;
 789     } else {
 790         ompi_datatype_type_size(rdtype, &dsize);
 791         block_size = dsize * (ptrdiff_t)rcount;
 792     }
 793 
 794     if ((communicator_size > small_comm_size) &&
 795         (block_size < small_block_size)) {
 796         return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
 797                                                      rbuf, rcount, rdtype,
 798                                                      root, comm, module);
 799     }
 800     return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
 801                                                      rbuf, rcount, rdtype,
 802                                                      root, comm, module);
 803 }
/* [<][>][^][v][top][bottom][index][help] */
root/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c

DEFINITIONS