This source file includes the following definitions:
- ompi_coll_tuned_allreduce_intra_dec_fixed
- ompi_coll_tuned_alltoall_intra_dec_fixed
- ompi_coll_tuned_alltoallv_intra_dec_fixed
- ompi_coll_tuned_barrier_intra_dec_fixed
- ompi_coll_tuned_bcast_intra_dec_fixed
- ompi_coll_tuned_reduce_intra_dec_fixed
- ompi_coll_tuned_reduce_scatter_intra_dec_fixed
- ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed
- ompi_coll_tuned_allgather_intra_dec_fixed
- ompi_coll_tuned_allgatherv_intra_dec_fixed
- ompi_coll_tuned_gather_intra_dec_fixed
- ompi_coll_tuned_scatter_intra_dec_fixed
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 #include "ompi_config.h"
  26 
  27 #include "mpi.h"
  28 #include "opal/util/bit_ops.h"
  29 #include "ompi/datatype/ompi_datatype.h"
  30 #include "ompi/communicator/communicator.h"
  31 #include "ompi/mca/coll/coll.h"
  32 #include "ompi/mca/coll/base/coll_tags.h"
  33 #include "ompi/op/op.h"
  34 #include "coll_tuned.h"
  35 
  36 
  37 
  38 
  39 
  40 
  41 
  42 
  43 int
  44 ompi_coll_tuned_allreduce_intra_dec_fixed(const void *sbuf, void *rbuf, int count,
  45                                           struct ompi_datatype_t *dtype,
  46                                           struct ompi_op_t *op,
  47                                           struct ompi_communicator_t *comm,
  48                                           mca_coll_base_module_t *module)
  49 {
  50     size_t dsize, block_dsize;
  51     int comm_size = ompi_comm_size(comm);
  52     const size_t intermediate_message = 10000;
  53     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allreduce_intra_dec_fixed"));
  54 
  55     
  56 
  57 
  58 
  59 
  60 
  61 
  62     ompi_datatype_type_size(dtype, &dsize);
  63     block_dsize = dsize * (ptrdiff_t)count;
  64 
  65     if (block_dsize < intermediate_message) {
  66         return (ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf,
  67                                                                  count, dtype,
  68                                                                  op, comm, module));
  69     }
  70 
  71     if( ompi_op_is_commute(op) && (count > comm_size) ) {
  72         const size_t segment_size = 1 << 20; 
  73         if (((size_t)comm_size * (size_t)segment_size >= block_dsize)) {
  74             return (ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype,
  75                                                         op, comm, module));
  76         } else {
  77             return (ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf,
  78                                                                   count, dtype,
  79                                                                   op, comm, module,
  80                                                                   segment_size));
  81         }
  82     }
  83 
  84     return (ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count,
  85                                                           dtype, op, comm, module));
  86 }
  87 
  88 
  89 
  90 
  91 
  92 
  93 
  94 
  95 
  96 int ompi_coll_tuned_alltoall_intra_dec_fixed(const void *sbuf, int scount,
  97                                              struct ompi_datatype_t *sdtype,
  98                                              void* rbuf, int rcount,
  99                                              struct ompi_datatype_t *rdtype,
 100                                              struct ompi_communicator_t *comm,
 101                                              mca_coll_base_module_t *module)
 102 {
 103     int communicator_size;
 104     size_t dsize, block_dsize;
 105 #if 0
 106     size_t total_dsize;
 107 #endif
 108 
 109     communicator_size = ompi_comm_size(comm);
 110 
 111     
 112     if (communicator_size==2) {
 113         return ompi_coll_base_alltoall_intra_two_procs(sbuf, scount, sdtype,
 114                                                        rbuf, rcount, rdtype,
 115                                                        comm, module);
 116     }
 117 
 118     
 119 
 120 
 121     
 122     if (MPI_IN_PLACE != sbuf) {
 123         ompi_datatype_type_size(sdtype, &dsize);
 124     } else {
 125         ompi_datatype_type_size(rdtype, &dsize);
 126     }
 127     block_dsize = dsize * (ptrdiff_t)scount;
 128 
 129     if ((block_dsize < (size_t) ompi_coll_tuned_alltoall_small_msg)
 130                                               && (communicator_size > 12)) {
 131         return ompi_coll_base_alltoall_intra_bruck(sbuf, scount, sdtype,
 132                                                    rbuf, rcount, rdtype,
 133                                                    comm, module);
 134 
 135     } else if (block_dsize < (size_t) ompi_coll_tuned_alltoall_intermediate_msg) {
 136         return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype,
 137                                                           rbuf, rcount, rdtype,
 138                                                           comm, module);
 139     }
 140 
 141     return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype,
 142                                                   rbuf, rcount, rdtype,
 143                                                   comm, module);
 144 
 145 #if 0
 146     
 147 
 148     
 149     ompi_datatype_type_size(sdtype, &dsize);
 150     total_dsize = dsize * scount * communicator_size;   
 151 
 152     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_fixed rank %d com_size %d msg_length %ld",
 153                  ompi_comm_rank(comm), communicator_size, total_dsize));
 154 
 155     if (communicator_size >= 12 && total_dsize <= 768) {
 156         return ompi_coll_base_alltoall_intra_bruck(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
 157     }
 158     if (total_dsize <= 131072) {
 159         return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
 160     }
 161     return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
 162 #endif
 163 }
 164 
 165 
 166 
 167 
 168 
 169 
 170 int ompi_coll_tuned_alltoallv_intra_dec_fixed(const void *sbuf, const int *scounts, const int *sdisps,
 171                                               struct ompi_datatype_t *sdtype,
 172                                               void *rbuf, const int *rcounts, const int *rdisps,
 173                                               struct ompi_datatype_t *rdtype,
 174                                               struct ompi_communicator_t *comm,
 175                                               mca_coll_base_module_t *module)
 176 {
 177     
 178     return ompi_coll_base_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
 179                                                    rbuf, rcounts, rdisps,rdtype,
 180                                                    comm, module);
 181 }
 182 
 183 
 184 
 185 
 186 
 187 
 188 
 189 
 190 
 191 int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm,
 192                                             mca_coll_base_module_t *module)
 193 {
 194     int communicator_size = ompi_comm_size(comm);
 195 
 196     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_barrier_intra_dec_fixed com_size %d",
 197                  communicator_size));
 198 
 199     if( 2 == communicator_size )
 200         return ompi_coll_base_barrier_intra_two_procs(comm, module);
 201     
 202 
 203 
 204 
 205 
 206     {
 207         bool has_one = false;
 208         for( ; communicator_size > 0; communicator_size >>= 1 ) {
 209             if( communicator_size & 0x1 ) {
 210                 if( has_one )
 211                     return ompi_coll_base_barrier_intra_bruck(comm, module);
 212                 has_one = true;
 213             }
 214         }
 215     }
 216     return ompi_coll_base_barrier_intra_recursivedoubling(comm, module);
 217 }
 218 
 219 
 220 
 221 
 222 
 223 
 224 
 225 
 226 
 227 int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
 228                                           struct ompi_datatype_t *datatype, int root,
 229                                           struct ompi_communicator_t *comm,
 230                                           mca_coll_base_module_t *module)
 231 {
 232     
 233 
 234     const size_t small_message_size = 2048;
 235     const size_t intermediate_message_size = 370728;
 236     const double a_p16  = 3.2118e-6; 
 237     const double b_p16  = 8.7936;
 238     const double a_p64  = 2.3679e-6; 
 239     const double b_p64  = 1.1787;
 240     const double a_p128 = 1.6134e-6; 
 241     const double b_p128 = 2.1102;
 242 
 243     int communicator_size;
 244     int segsize = 0;
 245     size_t message_size, dsize;
 246 
 247     communicator_size = ompi_comm_size(comm);
 248 
 249     
 250     ompi_datatype_type_size(datatype, &dsize);
 251     message_size = dsize * (unsigned long)count;   
 252 
 253     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_bcast_intra_dec_fixed"
 254                  " root %d rank %d com_size %d msg_length %lu",
 255                  root, ompi_comm_rank(comm), communicator_size, (unsigned long)message_size));
 256 
 257     
 258 
 259     if ((message_size < small_message_size) || (count <= 1)) {
 260         
 261         segsize = 0;
 262         return  ompi_coll_base_bcast_intra_binomial(buff, count, datatype,
 263                                                     root, comm, module,
 264                                                     segsize);
 265 
 266     } else if (message_size < intermediate_message_size) {
 267         
 268         segsize = 1024;
 269         return ompi_coll_base_bcast_intra_split_bintree(buff, count, datatype,
 270                                                         root, comm, module,
 271                                                         segsize);
 272 
 273     }
 274     
 275     else if (communicator_size < (a_p128 * message_size + b_p128)) {
 276         
 277         segsize = 1024  << 7;
 278         return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
 279                                                    root, comm, module,
 280                                                    segsize);
 281 
 282     } else if (communicator_size < 13) {
 283         
 284         segsize = 1024 << 3;
 285         return ompi_coll_base_bcast_intra_split_bintree(buff, count, datatype,
 286                                                         root, comm, module,
 287                                                         segsize);
 288 
 289     } else if (communicator_size < (a_p64 * message_size + b_p64)) {
 290         
 291         segsize = 1024 << 6;
 292         return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
 293                                                    root, comm, module,
 294                                                    segsize);
 295 
 296     } else if (communicator_size < (a_p16 * message_size + b_p16)) {
 297         
 298         segsize = 1024 << 4;
 299         return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
 300                                                    root, comm, module,
 301                                                    segsize);
 302 
 303     }
 304 
 305     
 306     segsize = 1024 << 3;
 307     return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
 308                                                root, comm, module,
 309                                                segsize);
 310 #if 0
 311     
 312 
 313     if (communicator_size  < 4) {
 314         return ompi_coll_base_bcast_intra_basic_linear(buff, count, datatype, root, comm, module);
 315     }
 316     if (communicator_size == 4) {
 317         if (message_size < 524288) segsize = 0;
 318         else segsize = 16384;
 319         return ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
 320     }
 321     if (communicator_size <= 8 && message_size < 4096) {
 322         return ompi_coll_base_bcast_intra_basic_linear(buff, count, datatype, root, comm, module);
 323     }
 324     if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
 325         segsize = 16384;
 326         return  ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
 327     }
 328     if (message_size >= 524288) {
 329         segsize = 16384;
 330         return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype, root, comm, module, segsize);
 331     }
 332     segsize = 0;
 333     
 334     
 335     return ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
 336 #endif  
 337 }
 338 
 339 
 340 
 341 
 342 
 343 
 344 
 345 
 346 
 347 int ompi_coll_tuned_reduce_intra_dec_fixed( const void *sendbuf, void *recvbuf,
 348                                             int count, struct ompi_datatype_t* datatype,
 349                                             struct ompi_op_t* op, int root,
 350                                             struct ompi_communicator_t* comm,
 351                                             mca_coll_base_module_t *module)
 352 {
 353     int communicator_size, segsize = 0;
 354     size_t message_size, dsize;
 355     const double a1 =  0.6016 / 1024.0; 
 356     const double b1 =  1.3496;
 357     const double a2 =  0.0410 / 1024.0; 
 358     const double b2 =  9.7128;
 359     const double a3 =  0.0422 / 1024.0; 
 360     const double b3 =  1.1614;
 361     const double a4 =  0.0033 / 1024.0; 
 362     const double b4 =  1.6761;
 363 
 364     const int max_requests = 0; 
 365 
 366     communicator_size = ompi_comm_size(comm);
 367 
 368     
 369     ompi_datatype_type_size(datatype, &dsize);
 370     message_size = dsize * (ptrdiff_t)count;   
 371 
 372     
 373 
 374 
 375 
 376     if( !ompi_op_is_commute(op) ) {
 377         if ((communicator_size < 12) && (message_size < 2048)) {
 378             return ompi_coll_base_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
 379         }
 380         return ompi_coll_base_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
 381                                                              0, max_requests);
 382     }
 383 
 384     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed "
 385                  "root %d rank %d com_size %d msg_length %lu",
 386                  root, ompi_comm_rank(comm), communicator_size, (unsigned long)message_size));
 387 
 388     if ((communicator_size < 8) && (message_size < 512)){
 389         
 390         return ompi_coll_base_reduce_intra_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm, module);
 391     } else if (((communicator_size < 8) && (message_size < 20480)) ||
 392                (message_size < 2048) || (count <= 1)) {
 393         
 394         segsize = 0;
 395         return ompi_coll_base_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
 396                                                      segsize, max_requests);
 397     } else if (communicator_size > (a1 * message_size + b1)) {
 398         
 399         segsize = 1024;
 400         return ompi_coll_base_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
 401                                                      segsize, max_requests);
 402     } else if (communicator_size > (a2 * message_size + b2)) {
 403         
 404         segsize = 1024;
 405         return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
 406                                                     segsize, max_requests);
 407     } else if (communicator_size > (a3 * message_size + b3)) {
 408         
 409         segsize = 32*1024;
 410         return ompi_coll_base_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
 411                                                     comm, module, segsize, max_requests);
 412     }
 413     if (communicator_size > (a4 * message_size + b4)) {
 414         
 415         segsize = 32*1024;
 416     } else {
 417         
 418         segsize = 64*1024;
 419     }
 420     return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
 421                                                 segsize, max_requests);
 422 
 423 #if 0
 424     
 425     if (message_size <= 4096) {
 426         segsize = 0;
 427         fanout = communicator_size - 1;
 428         
 429         
 430         return ompi_coll_base_reduce_intra_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm, module);
 431     }
 432     if (message_size < 524288) {
 433         if (message_size <= 65536 ) {
 434             segsize = 32768;
 435             fanout = 8;
 436         } else {
 437             segsize = 1024;
 438             fanout = communicator_size/2;
 439         }
 440         
 441         
 442         return ompi_coll_base_reduce_intra_chain(sendbuf, recvbuf, count, datatype, op, root, comm, module,
 443                                                  segsize, fanout, max_requests);
 444     }
 445     segsize = 1024;
 446     return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
 447                                                 segsize, max_requests);
 448 #endif  
 449 }
 450 
 451 
 452 
 453 
 454 
 455 
 456 
 457 
 458 
 459 int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( const void *sbuf, void *rbuf,
 460                                                     const int *rcounts,
 461                                                     struct ompi_datatype_t *dtype,
 462                                                     struct ompi_op_t *op,
 463                                                     struct ompi_communicator_t *comm,
 464                                                     mca_coll_base_module_t *module)
 465 {
 466     int comm_size, i, pow2;
 467     size_t total_message_size, dsize;
 468     const double a = 0.0012;
 469     const double b = 8.0;
 470     const size_t small_message_size = 12 * 1024;
 471     const size_t large_message_size = 256 * 1024;
 472 
 473     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_scatter_intra_dec_fixed"));
 474 
 475     comm_size = ompi_comm_size(comm);
 476     
 477     ompi_datatype_type_size(dtype, &dsize);
 478     total_message_size = 0;
 479     for (i = 0; i < comm_size; i++) {
 480         total_message_size += rcounts[i];
 481     }
 482 
 483     if( !ompi_op_is_commute(op) ) {
 484         return ompi_coll_base_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
 485                                                                   dtype, op,
 486                                                                   comm, module);
 487     }
 488 
 489     total_message_size *= dsize;
 490 
 491     
 492     pow2 = opal_next_poweroftwo_inclusive (comm_size);
 493 
 494     if ((total_message_size <= small_message_size) ||
 495         ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
 496         (comm_size >= a * total_message_size + b)) {
 497         return
 498             ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
 499                                                                        dtype, op,
 500                                                                        comm, module);
 501     }
 502     return ompi_coll_base_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
 503                                                      dtype, op,
 504                                                      comm, module);
 505 }
 506 
 507 
 508 
 509 
 510 
 511 
 512 
 513 
 514 
 515 int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(const void *sbuf, void *rbuf,
 516                                                          int rcount,
 517                                                          struct ompi_datatype_t *dtype,
 518                                                          struct ompi_op_t *op,
 519                                                          struct ompi_communicator_t *comm,
 520                                                          mca_coll_base_module_t *module)
 521 {
 522     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed"));
 523     return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount,
 524                                                             dtype, op, comm, module);
 525 }
 526 
 527 
 528 
 529 
 530 
 531 
 532 
 533 
 534 
 535 
 536 int ompi_coll_tuned_allgather_intra_dec_fixed(const void *sbuf, int scount,
 537                                               struct ompi_datatype_t *sdtype,
 538                                               void* rbuf, int rcount,
 539                                               struct ompi_datatype_t *rdtype,
 540                                               struct ompi_communicator_t *comm,
 541                                               mca_coll_base_module_t *module)
 542 {
 543     int communicator_size, pow2_size;
 544     size_t dsize, total_dsize;
 545 
 546     communicator_size = ompi_comm_size(comm);
 547 
 548     
 549     if (communicator_size == 2) {
 550         return ompi_coll_base_allgather_intra_two_procs(sbuf, scount, sdtype,
 551                                                         rbuf, rcount, rdtype,
 552                                                         comm, module);
 553     }
 554 
 555     
 556     if (MPI_IN_PLACE != sbuf) {
 557         ompi_datatype_type_size(sdtype, &dsize);
 558     } else {
 559         ompi_datatype_type_size(rdtype, &dsize);
 560     }
 561     total_dsize = dsize * (ptrdiff_t)scount * (ptrdiff_t)communicator_size;
 562 
 563     OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allgather_intra_dec_fixed"
 564                  " rank %d com_size %d msg_length %lu",
 565                  ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
 566 
 567     pow2_size = opal_next_poweroftwo_inclusive (communicator_size);
 568 
 569     
 570 
 571 
 572 
 573 
 574 
 575 
 576 
 577     if (total_dsize < 50000) {
 578         if (pow2_size == communicator_size) {
 579             return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
 580                                                                     rbuf, rcount, rdtype,
 581                                                                     comm, module);
 582         } else {
 583             return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
 584                                                         rbuf, rcount, rdtype,
 585                                                         comm, module);
 586         }
 587     } else {
 588         if (communicator_size % 2) {
 589             return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
 590                                                        rbuf, rcount, rdtype,
 591                                                        comm, module);
 592         } else {
 593             return  ompi_coll_base_allgather_intra_neighborexchange(sbuf, scount, sdtype,
 594                                                                     rbuf, rcount, rdtype,
 595                                                                     comm, module);
 596         }
 597     }
 598 
 599 #if defined(USE_MPICH2_DECISION)
 600     
 601 
 602 
 603 
 604 
 605 
 606 
 607 
 608 
 609     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
 610         return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
 611                                                                 rbuf, rcount, rdtype,
 612                                                                 comm, module);
 613     } else if (total_dsize <= 81920) {
 614         return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
 615                                                     rbuf, rcount, rdtype,
 616                                                     comm, module);
 617     }
 618     return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
 619                                                rbuf, rcount, rdtype,
 620                                                comm, module);
 621 #endif  
 622 }
 623 
 624 
 625 
 626 
 627 
 628 
 629 
 630 
 631 
 632 
 633 int ompi_coll_tuned_allgatherv_intra_dec_fixed(const void *sbuf, int scount,
 634                                                struct ompi_datatype_t *sdtype,
 635                                                void* rbuf, const int *rcounts,
 636                                                const int *rdispls,
 637                                                struct ompi_datatype_t *rdtype,
 638                                                struct ompi_communicator_t *comm,
 639                                                mca_coll_base_module_t *module)
 640 {
 641     int i;
 642     int communicator_size;
 643     size_t dsize, total_dsize;
 644 
 645     communicator_size = ompi_comm_size(comm);
 646 
 647     
 648     if (communicator_size == 2) {
 649         return ompi_coll_base_allgatherv_intra_two_procs(sbuf, scount, sdtype,
 650                                                          rbuf, rcounts, rdispls, rdtype,
 651                                                          comm, module);
 652     }
 653 
 654     
 655     if (MPI_IN_PLACE != sbuf) {
 656         ompi_datatype_type_size(sdtype, &dsize);
 657     } else {
 658         ompi_datatype_type_size(rdtype, &dsize);
 659     }
 660 
 661     total_dsize = 0;
 662     for (i = 0; i < communicator_size; i++) {
 663         total_dsize += dsize * (ptrdiff_t)rcounts[i];
 664     }
 665 
 666     OPAL_OUTPUT((ompi_coll_tuned_stream,
 667                  "ompi_coll_tuned_allgatherv_intra_dec_fixed"
 668                  " rank %d com_size %d msg_length %lu",
 669                  ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
 670 
 671     
 672     if (total_dsize < 50000) {
 673         return ompi_coll_base_allgatherv_intra_bruck(sbuf, scount, sdtype,
 674                                                      rbuf, rcounts, rdispls, rdtype,
 675                                                      comm, module);
 676     } else {
 677         if (communicator_size % 2) {
 678             return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype,
 679                                                         rbuf, rcounts, rdispls, rdtype,
 680                                                         comm, module);
 681         } else {
 682             return  ompi_coll_base_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
 683                                                                      rbuf, rcounts, rdispls, rdtype,
 684                                                                      comm, module);
 685         }
 686     }
 687 }
 688 
 689 
 690 
 691 
 692 
 693 
 694 
 695 
 696 
 697 
 698 int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, int scount,
 699                                            struct ompi_datatype_t *sdtype,
 700                                            void* rbuf, int rcount,
 701                                            struct ompi_datatype_t *rdtype,
 702                                            int root,
 703                                            struct ompi_communicator_t *comm,
 704                                            mca_coll_base_module_t *module)
 705 {
 706     const int large_segment_size = 32768;
 707     const int small_segment_size = 1024;
 708 
 709     const size_t large_block_size = 92160;
 710     const size_t intermediate_block_size = 6000;
 711     const size_t small_block_size = 1024;
 712 
 713     const int large_communicator_size = 60;
 714     const int small_communicator_size = 10;
 715 
 716     int communicator_size, rank;
 717     size_t dsize, block_size;
 718 
 719     OPAL_OUTPUT((ompi_coll_tuned_stream,
 720                  "ompi_coll_tuned_gather_intra_dec_fixed"));
 721 
 722     communicator_size = ompi_comm_size(comm);
 723     rank = ompi_comm_rank(comm);
 724 
 725     
 726     if (rank == root) {
 727         ompi_datatype_type_size(rdtype, &dsize);
 728         block_size = dsize * (ptrdiff_t)rcount;
 729     } else {
 730         ompi_datatype_type_size(sdtype, &dsize);
 731         block_size = dsize * (ptrdiff_t)scount;
 732     }
 733 
 734     if (block_size > large_block_size) {
 735         return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
 736                                                        rbuf, rcount, rdtype,
 737                                                        root, comm, module,
 738                                                        large_segment_size);
 739 
 740     } else if (block_size > intermediate_block_size) {
 741         return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
 742                                                        rbuf, rcount, rdtype,
 743                                                        root, comm, module,
 744                                                        small_segment_size);
 745 
 746     } else if ((communicator_size > large_communicator_size) ||
 747                ((communicator_size > small_communicator_size) &&
 748                 (block_size < small_block_size))) {
 749         return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdtype,
 750                                                     rbuf, rcount, rdtype,
 751                                                     root, comm, module);
 752     }
 753     
 754     return ompi_coll_base_gather_intra_basic_linear(sbuf, scount, sdtype,
 755                                                     rbuf, rcount, rdtype,
 756                                                     root, comm, module);
 757 }
 758 
 759 
 760 
 761 
 762 
 763 
 764 
 765 
 766 
 767 
 768 int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount,
 769                                             struct ompi_datatype_t *sdtype,
 770                                             void* rbuf, int rcount,
 771                                             struct ompi_datatype_t *rdtype,
 772                                             int root, struct ompi_communicator_t *comm,
 773                                             mca_coll_base_module_t *module)
 774 {
 775     const size_t small_block_size = 300;
 776     const int small_comm_size = 10;
 777     int communicator_size, rank;
 778     size_t dsize, block_size;
 779 
 780     OPAL_OUTPUT((ompi_coll_tuned_stream,
 781                  "ompi_coll_tuned_scatter_intra_dec_fixed"));
 782 
 783     communicator_size = ompi_comm_size(comm);
 784     rank = ompi_comm_rank(comm);
 785     
 786     if (root == rank) {
 787         ompi_datatype_type_size(sdtype, &dsize);
 788         block_size = dsize * (ptrdiff_t)scount;
 789     } else {
 790         ompi_datatype_type_size(rdtype, &dsize);
 791         block_size = dsize * (ptrdiff_t)rcount;
 792     }
 793 
 794     if ((communicator_size > small_comm_size) &&
 795         (block_size < small_block_size)) {
 796         return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
 797                                                      rbuf, rcount, rdtype,
 798                                                      root, comm, module);
 799     }
 800     return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
 801                                                      rbuf, rcount, rdtype,
 802                                                      root, comm, module);
 803 }