root/ompi/mca/coll/libnbc/nbc_ireduce.c


DEFINITIONS

This source file includes the following definitions.
  1. NBC_Reduce_args_compare
  2. nbc_reduce_init
  3. ompi_coll_libnbc_ireduce
  4. nbc_reduce_inter_init
  5. ompi_coll_libnbc_ireduce_inter
  6. red_sched_binomial
  7. red_sched_chain
  8. red_sched_linear
  9. red_sched_redscat_gather
  10. ompi_coll_libnbc_reduce_init
  11. ompi_coll_libnbc_reduce_inter_init

   1 /* -*- Mode: C; c-basic-offset:2 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2006      The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2006      The Technical University of Chemnitz. All
   7  *                         rights reserved.
   8  * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
   9  *                         reserved.
  10  * Copyright (c) 2014-2018 Research Organization for Information Science
  11  *                         and Technology (RIST).  All rights reserved.
  12  * Copyright (c) 2017      IBM Corporation.  All rights reserved.
  13  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
  14  * $COPYRIGHT$
  15  *
  16  * Additional copyrights may follow
  17  *
  18  * Author(s): Torsten Hoefler <htor@cs.indiana.edu>
  19  *
  20  */
  21 
  22 #include "ompi_config.h"
  23 #include "opal/align.h"
  24 #include "opal/util/bit_ops.h"
  25 #include "ompi/op/op.h"
  26 
  27 #include "nbc_internal.h"
  28 
  29 static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, char tmpredbuf, int count, MPI_Datatype datatype,
  30                                       MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf);
  31 static inline int red_sched_chain (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
  32                                    MPI_Op op, int ext, size_t size, NBC_Schedule *schedule, void *tmpbuf, int fragsize);
  33 
  34 static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, void *tmpbuf, int count, MPI_Datatype datatype,
  35                                     MPI_Op op, NBC_Schedule *schedule);
  36 static inline int red_sched_redscat_gather(
  37     int rank, int comm_size, int root, const void *sbuf, void *rbuf,
  38     char tmpredbuf, int count, MPI_Datatype datatype, MPI_Op op, char inplace,
  39     NBC_Schedule *schedule, void *tmp_buf, struct ompi_communicator_t *comm);
  40 
  41 #ifdef NBC_CACHE_SCHEDULE
  42 /* tree comparison function for schedule cache */
  43 int NBC_Reduce_args_compare(NBC_Reduce_args *a, NBC_Reduce_args *b, void *param) {
  44   if ((a->sendbuf == b->sendbuf) &&
  45       (a->recvbuf == b->recvbuf) &&
  46       (a->count == b->count) &&
  47       (a->datatype == b->datatype) &&
  48       (a->op == b->op) &&
  49       (a->root == b->root)) {
  50     return 0;
  51   }
  52 
  53   if (a->sendbuf < b->sendbuf) {
  54     return -1;
  55   }
  56 
  57   return 1;
  58 }
  59 #endif
  60 
  61 /* the non-blocking reduce */
  62 static int nbc_reduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype,
  63                            MPI_Op op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request,
  64                            struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
  65   int rank, p, res, segsize;
  66   size_t size;
  67   MPI_Aint ext;
  68   NBC_Schedule *schedule;
  69   char *redbuf=NULL, inplace;
  70   void *tmpbuf;
  71   char tmpredbuf = 0;
  72   enum { NBC_RED_BINOMIAL, NBC_RED_CHAIN, NBC_RED_REDSCAT_GATHER} alg;
  73   ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
  74   ptrdiff_t span, gap;
  75 
  76   NBC_IN_PLACE(sendbuf, recvbuf, inplace);
  77 
  78   rank = ompi_comm_rank (comm);
  79   p = ompi_comm_size (comm);
  80 
  81   res = ompi_datatype_type_extent(datatype, &ext);
  82   if (MPI_SUCCESS != res) {
  83     NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
  84     return res;
  85   }
  86 
  87   res = ompi_datatype_type_size(datatype, &size);
  88   if (MPI_SUCCESS != res) {
  89     NBC_Error("MPI Error in ompi_datatype_type_size() (%i)", res);
  90     return res;
  91   }
  92 
  93   /* only one node -> copy data */
  94   if (1 == p && (!persistent || inplace)) {
  95     if (!inplace) {
  96       res = NBC_Copy (sendbuf, count, datatype, recvbuf, count, datatype, comm);
  97       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
  98         return res;
  99       }
 100     }
 101     return nbc_get_noop_request(persistent, request);
 102   }
 103 
 104   span = opal_datatype_span(&datatype->super, count, &gap);
 105 
 106   /* algorithm selection */
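  /*
   * Default heuristic (libnbc_ireduce_algorithm == 0), as implemented below:
   * a commutative op with p > 2 and count >= flp2(p) selects the Rabenseifner
   * reduce-scatter/gather schedule; otherwise p > 4, messages below 64 KiB or
   * a non-commutative op select the binomial tree, and the remaining cases
   * use the fragmented chain.  MCA values 1-3 force chain, binomial or (when
   * its preconditions hold) reduce-scatter/gather, falling back to chain.
   */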
 107   int nprocs_pof2 = opal_next_poweroftwo(p) >> 1;
 108   if (libnbc_ireduce_algorithm == 0) {
 109     if (ompi_op_is_commute(op) && p > 2 && count >= nprocs_pof2) {
 110       alg = NBC_RED_REDSCAT_GATHER;
 111     } else if (p > 4 || size * count < 65536 || !ompi_op_is_commute(op)) {
 112       alg = NBC_RED_BINOMIAL;
 113     } else {
 114       alg = NBC_RED_CHAIN;
 115     }
 116   } else {
 117     if (libnbc_ireduce_algorithm == 1) {
 118       alg = NBC_RED_CHAIN;
 119     } else if (libnbc_ireduce_algorithm == 2) {
 120       alg = NBC_RED_BINOMIAL;
 121     } else if (libnbc_ireduce_algorithm == 3 && ompi_op_is_commute(op) && p > 2 && count >= nprocs_pof2) {
 122       alg = NBC_RED_REDSCAT_GATHER;
 123     } else {
 124       alg = NBC_RED_CHAIN;
 125     }
 126   }
 127 
 128   /* allocate temporary buffers */
 129   if (alg == NBC_RED_REDSCAT_GATHER || alg == NBC_RED_BINOMIAL) {
 130     if (rank == root) {
 131       /* root reduces in receive buffer */
 132       tmpbuf = malloc(span);
 133       redbuf = recvbuf;
 134     } else {
 135       /* recvbuf may not be valid on non-root nodes */
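      /* When tmpredbuf is set, redbuf holds an offset into the schedule's
       * tmpbuf rather than a real pointer (the boolean flag passed along with
       * it to the NBC_Sched_* calls selects tmpbuf-relative addressing), so
       * the reduction result is accumulated inside tmpbuf at an aligned,
       * gap-corrected position. */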
 136       ptrdiff_t span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t);
 137       tmpbuf = malloc(span_align + span);
 138       redbuf = (char *)span_align - gap;
 139       tmpredbuf = 1;
 140     }
 141   } else {
 142     tmpbuf = malloc (span);
 143     segsize = 16384/2;
 144   }
 145 
 146   if (OPAL_UNLIKELY(NULL == tmpbuf)) {
 147     return OMPI_ERR_OUT_OF_RESOURCE;
 148   }
 149 
 150 #ifdef NBC_CACHE_SCHEDULE
 151   NBC_Reduce_args *args, *found, search;
 152 
 153   /* search schedule in communicator specific tree */
 154   search.sendbuf = sendbuf;
 155   search.recvbuf = recvbuf;
 156   search.count = count;
 157   search.datatype = datatype;
 158   search.op = op;
 159   search.root = root;
 160   found = (NBC_Reduce_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_REDUCE], &search);
 161   if (NULL == found) {
 162 #endif
 163     schedule = OBJ_NEW(NBC_Schedule);
 164     if (OPAL_UNLIKELY(NULL == schedule)) {
 165       free(tmpbuf);
 166       return OMPI_ERR_OUT_OF_RESOURCE;
 167     }
 168 
 169     if (p == 1) {
 170       res = NBC_Sched_copy ((void *)sendbuf, false, count, datatype,
 171                             recvbuf, false, count, datatype, schedule, false);
 172     } else {
 173       switch(alg) {
 174         case NBC_RED_BINOMIAL:
 175           res = red_sched_binomial(rank, p, root, sendbuf, redbuf, tmpredbuf, count, datatype, op, inplace, schedule, tmpbuf);
 176           break;
 177         case NBC_RED_CHAIN:
 178           res = red_sched_chain(rank, p, root, sendbuf, recvbuf, count, datatype, op, ext, size, schedule, tmpbuf, segsize);
 179           break;
 180         case NBC_RED_REDSCAT_GATHER:
 181           res = red_sched_redscat_gather(rank, p, root, sendbuf, redbuf, tmpredbuf, count, datatype, op, inplace, schedule, tmpbuf, comm);
 182           break;
 183       }
 184     }
 185 
 186     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 187       OBJ_RELEASE(schedule);
 188       free(tmpbuf);
 189       return res;
 190     }
 191 
 192     res = NBC_Sched_commit(schedule);
 193     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 194       OBJ_RELEASE(schedule);
 195       free(tmpbuf);
 196       return res;
 197     }
 198 #ifdef NBC_CACHE_SCHEDULE
 199     /* save schedule to tree */
  200     args = (NBC_Reduce_args *) malloc (sizeof (*args));
 201     if (NULL != args) {
 202       args->sendbuf = sendbuf;
 203       args->recvbuf = recvbuf;
 204       args->count = count;
 205       args->datatype = datatype;
 206       args->op = op;
 207       args->root = root;
 208       args->schedule = schedule;
 209       res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_REDUCE], args, args, 0);
 210       if (0 == res) {
 211         OBJ_RETAIN(schedule);
 212 
 213         /* increase number of elements for Reduce */
 214         if (++libnbc_module->NBC_Dict_size[NBC_REDUCE] > NBC_SCHED_DICT_UPPER) {
 215           NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_REDUCE],
 216                                    &libnbc_module->NBC_Dict_size[NBC_REDUCE]);
 217         }
 218       } else {
 219         NBC_Error("error in dict_insert() (%i)", res);
 220         free (args);
 221       }
 222     }
 223   } else {
 224     /* found schedule */
 225     schedule = found->schedule;
 226     OBJ_RETAIN(schedule);
 227   }
 228 #endif
 229 
 230   res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf);
 231   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 232     OBJ_RELEASE(schedule);
 233     free(tmpbuf);
 234     return res;
 235   }
 236 
 237   return OMPI_SUCCESS;
 238 }
 239 
 240 int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype,
 241                              MPI_Op op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request,
 242                              struct mca_coll_base_module_2_3_0_t *module) {
 243     int res = nbc_reduce_init(sendbuf, recvbuf, count, datatype, op, root,
 244                               comm, request, module, false);
  245     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 246         return res;
 247     }
 248     res = NBC_Start(*(ompi_coll_libnbc_request_t **)request);
 249     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 250         NBC_Return_handle (*(ompi_coll_libnbc_request_t **)request);
 251         *request = &ompi_request_null.request;
 252         return res;
 253     }
 254 
 255     return OMPI_SUCCESS;
 256 }
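/*
 * Caller-side illustration (not part of this file): an application reaches
 * this entry point through the MPI layer once the libnbc component has been
 * selected for the communicator, e.g.
 *
 *   MPI_Request req;
 *   MPI_Ireduce(sbuf, rbuf, n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &req);
 *   ... overlap local computation with the reduction ...
 *   MPI_Wait(&req, MPI_STATUS_IGNORE);
 */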
 257 
 258 static int nbc_reduce_inter_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype,
 259                                  MPI_Op op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request,
 260                                  struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
 261   int rank, res, rsize;
 262   NBC_Schedule *schedule;
 263   ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
 264   ptrdiff_t span, gap;
 265   void *tmpbuf;
 266 
 267   rank = ompi_comm_rank (comm);
 268   rsize = ompi_comm_remote_size (comm);
 269 
 270   span = opal_datatype_span(&datatype->super, count, &gap);
 271   tmpbuf = malloc (span);
 272   if (OPAL_UNLIKELY(NULL == tmpbuf)) {
 273     return OMPI_ERR_OUT_OF_RESOURCE;
 274   }
 275 
 276   schedule = OBJ_NEW(NBC_Schedule);
 277   if (OPAL_UNLIKELY(NULL == schedule)) {
 278     free(tmpbuf);
 279     return OMPI_ERR_OUT_OF_RESOURCE;
 280   }
 281 
 282   res = red_sched_linear (rank, rsize, root, sendbuf, recvbuf, (void *)(-gap), count, datatype, op, schedule);
 283   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 284     OBJ_RELEASE(schedule);
 285     free(tmpbuf);
  286     return res;
 287   }
 288 
 289   res = NBC_Sched_commit(schedule);
 290   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 291     OBJ_RELEASE(schedule);
 292     free(tmpbuf);
 293     return res;
 294   }
 295 
 296   res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf);
 297   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 298     OBJ_RELEASE(schedule);
 299     free(tmpbuf);
  300     return res;
 301   }
 302 
 303   return OMPI_SUCCESS;
 304 }
 305 
 306 int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype,
 307                                    MPI_Op op, int root, struct ompi_communicator_t *comm, ompi_request_t ** request,
 308                                    struct mca_coll_base_module_2_3_0_t *module) {
 309     int res = nbc_reduce_inter_init(sendbuf, recvbuf, count, datatype, op, root,
 310                                     comm, request, module, false);
  311     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 312         return res;
 313     }
 314     res = NBC_Start(*(ompi_coll_libnbc_request_t **)request);
 315     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 316         NBC_Return_handle (*(ompi_coll_libnbc_request_t **)request);
 317         *request = &ompi_request_null.request;
 318         return res;
 319     }
 320 
 321     return OMPI_SUCCESS;
 322 }
 323 
 324 
 325 /* binomial reduce
  326  * if op is not commutative, reduce on rank 0, and then send the result to the root rank
 327  *
 328  * working principle:
 329  * - each node gets a virtual rank vrank
  330  * - the 'root' node gets vrank 0
 331  * - node 0 gets the vrank of the 'root'
 332  * - all other ranks stay identical (they do not matter)
 333  *
 334  * Algorithm:
 335  * pairwise exchange
 336  * round r:
 337  *  grp = rank % 2^r
 338  *  if grp == 0: receive from rank + 2^(r-1) if it exists and reduce value
  339  *  if grp != 0: send to rank - 2^(r-1) and exit function
 340  *
 341  * do this for R=log_2(p) rounds
 342  *
 343  */
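/* Worked example (p = 4, root = 2, commutative op): RANK2VRANK below swaps
 * ranks 0 and 2, giving vranks {0->2, 1->1, 2->0, 3->3}.  Round 1: vrank 1
 * sends to vrank 0 and vrank 3 sends to vrank 2; round 2: vrank 2 sends its
 * partial result to vrank 0, i.e. the real root (rank 2) now holds the full
 * reduction in redbuf. */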
 344 #define RANK2VRANK(rank, vrank, root) \
 345 { \
 346   vrank = rank; \
 347   if (rank == 0) vrank = root; \
 348   if (rank == root) vrank = 0; \
 349 }
 350 #define VRANK2RANK(rank, vrank, root) \
 351 { \
 352   rank = vrank; \
 353   if (vrank == 0) rank = root; \
 354   if (vrank == root) rank = 0; \
 355 }
 356 static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, char tmpredbuf, int count, MPI_Datatype datatype,
 357                                       MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf) {
 358   int vroot, vrank, vpeer, peer, res, maxr;
 359   char *rbuf, *lbuf, *buf;
 360   int tmprbuf, tmplbuf;
 361   ptrdiff_t gap;
 362   (void)opal_datatype_span(&datatype->super, count, &gap);
 363 
 364   if (ompi_op_is_commute(op)) {
 365     vroot = root;
 366   } else {
 367     vroot = 0;
 368   }
 369   RANK2VRANK(rank, vrank, vroot);
 370   maxr = (int)ceil((log((double)p)/LOG2));
 371 
 372   if (rank != root) {
 373     inplace = 0;
 374   }
 375 
 376   /* ensure the result ends up in redbuf on vrank 0 */
 377   if (0 == (maxr%2)) {
 378     rbuf = (void *)(-gap);
 379     tmprbuf = true;
 380     lbuf = redbuf;
 381     tmplbuf = tmpredbuf;
 382   } else {
 383     lbuf = (void *)(-gap);
 384     tmplbuf = true;
 385     rbuf = redbuf;
 386     tmprbuf = tmpredbuf;
 387     if (inplace) {
 388         res = NBC_Sched_copy(rbuf, false, count, datatype,
 389                              ((char *)tmpbuf)-gap, false, count, datatype,
 390                              schedule, true);
 391         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 392           return res;
 393         }
 394     }
 395   }
 396 
 397   for (int r = 1, firstred = 1 ; r <= maxr ; ++r) {
 398     if ((vrank % (1 << r)) == 0) {
 399       /* we have to receive this round */
 400       vpeer = vrank + (1 << (r - 1));
 401       VRANK2RANK(peer, vpeer, vroot)
 402       if (peer < p) {
 403         int tbuf;
 404         /* we have to wait until we have the data */
 405         res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, peer, schedule, true);
 406         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 407           return res;
 408         }
 409 
 410         /* perform the reduce in my local buffer */
 411         /* this cannot be done until tmpbuf is unused :-( so barrier after the op */
 412         if (firstred && !inplace) {
  413           /* perform the reduce with the sendbuf */
 414           res = NBC_Sched_op (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule, true);
 415           firstred = 0;
 416         } else {
 417           /* perform the reduce in my local buffer */
 418           res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true);
 419         }
 420 
 421         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 422           return res;
 423         }
 424         /* swap left and right buffers */
 425         buf = rbuf; rbuf = lbuf ; lbuf = buf;
 426         tbuf = tmprbuf; tmprbuf = tmplbuf; tmplbuf = tbuf;
 427       }
 428     } else {
 429       /* we have to send this round */
 430       vpeer = vrank - (1 << (r - 1));
 431       VRANK2RANK(peer, vpeer, vroot)
 432       if (firstred && !inplace) {
 433         /* we have to use the sendbuf in the first round .. */
 434         res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false);
 435       } else {
 436         /* and the redbuf in all remaining rounds */
 437         res = NBC_Sched_send (lbuf, tmplbuf, count, datatype, peer, schedule, false);
 438       }
 439 
 440       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 441         return res;
 442       }
 443 
 444       /* leave the game */
 445       break;
 446     }
 447   }
  448   /* send to root if vroot != root */
 449   if (vroot != root) {
 450     if (0 == rank) {
 451       res = NBC_Sched_send (redbuf, tmpredbuf, count, datatype, root, schedule, false);
 452     } else if (root == rank) {
 453       res = NBC_Sched_recv (redbuf, tmpredbuf, count, datatype, vroot, schedule, false);
 454     }
 455   }
 456 
 457   return OMPI_SUCCESS;
 458 }
 459 
 460 /* chain send ... */
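/* pipelined chain: data flows from vrank p-1 towards the root (vrank 0).
 * For each fragment, every vrank except p-1 receives a partial result from
 * vrank+1 and reduces its own sendbuf contribution into it; every vrank
 * except 0 then forwards its fragment to vrank-1, while the root reduces
 * directly into recvbuf.  Splitting the message into fragsize-byte pieces
 * lets successive fragments overlap along the chain. */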
 461 static inline int red_sched_chain (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
 462                                    MPI_Op op, int ext, size_t size, NBC_Schedule *schedule, void *tmpbuf, int fragsize) {
 463   int res, vrank, rpeer, speer, numfrag, fragcount, thiscount;
 464   long offset;
 465 
 466   RANK2VRANK(rank, vrank, root);
 467   VRANK2RANK(rpeer, vrank+1, root);
 468   VRANK2RANK(speer, vrank-1, root);
 469 
 470   if (0 == count) {
 471     return OMPI_SUCCESS;
 472   }
 473 
 474   numfrag = count * size / fragsize;
 475   if ((count * size) % fragsize != 0) {
 476     numfrag++;
 477   }
 478 
 479   fragcount = count / numfrag;
 480 
 481   for (int fragnum = 0 ; fragnum < numfrag ; ++fragnum) {
 482     offset = fragnum * fragcount * ext;
 483     thiscount = fragcount;
 484     if(fragnum == numfrag - 1) {
 485       /* last fragment may not be full */
 486       thiscount = count - fragcount * fragnum;
 487     }
 488 
 489     /* last node does not recv */
 490     if (vrank != p-1) {
 491       if (vrank == 0 && sendbuf != recvbuf) {
 492           res = NBC_Sched_recv ((char *)recvbuf+offset, false, thiscount, datatype, rpeer, schedule, true);
 493         } else {
 494           res = NBC_Sched_recv ((char *)offset, true, thiscount, datatype, rpeer, schedule, true);
 495         }
 496       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 497         return res;
 498       }
 499 
  500       /* root reduces into recvbuf */
 501       if(vrank == 0) {
 502         if (sendbuf != recvbuf) {
 503             res = NBC_Sched_op ((char *) sendbuf + offset, false, (char *) recvbuf + offset, false,
 504                                  thiscount, datatype, op, schedule, true);
 505         } else {
 506             res = NBC_Sched_op ((char *)offset, true, (char *) recvbuf + offset, false,
 507                                  thiscount, datatype, op, schedule, true);
 508         }
 509       } else {
 510         res = NBC_Sched_op ((char *) sendbuf + offset, false, (char *) offset, true, thiscount,
 511                              datatype, op, schedule, true);
 512       }
 513 
 514       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 515         return res;
 516       }
 517     }
 518 
 519     /* root does not send */
 520     if (vrank != 0) {
 521       /* rank p-1 has to send out of sendbuffer :) */
 522       /* the barrier here seems awkward but isn't!!!! */
 523       if (vrank == p-1) {
 524         res = NBC_Sched_send ((char *) sendbuf + offset, false, thiscount, datatype, speer, schedule, true);
 525       } else {
 526         res = NBC_Sched_send ((char *) offset, true, thiscount, datatype, speer, schedule, true);
 527       }
 528 
 529       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 530         return res;
 531       }
 532     }
 533   }
 534 
 535   return OMPI_SUCCESS;
 536 }
 537 
 538 /* simple linear algorithm for intercommunicators */
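/* The root receives the contribution of every remote rank in turn and folds
 * it into a running partial result; the working buffers alternate between
 * recvbuf and tmpbuf each iteration, and the initial assignment is chosen
 * from the parity of rsize so that the final result lands in recvbuf. */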
 539 static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, void *tmpbuf, int count, MPI_Datatype datatype,
 540                                     MPI_Op op, NBC_Schedule *schedule) {
 541   int res;
 542   char *rbuf, *lbuf, *buf;
 543   int tmprbuf, tmplbuf;
 544 
 545   if (0 == count) {
 546     return OMPI_SUCCESS;
 547   }
 548 
 549   if (MPI_ROOT == root) {
 550     /* ensure the result ends up in recvbuf */
 551     if (0 == (rsize%2)) {
 552       lbuf = tmpbuf;
 553       tmplbuf = true;
 554       rbuf = recvbuf;
 555       tmprbuf = false;
 556     } else {
 557       rbuf = tmpbuf;
 558       tmprbuf = true;
 559       lbuf = recvbuf;
 560       tmplbuf = false;
 561     }
 562 
 563     res = NBC_Sched_recv (lbuf, tmplbuf, count, datatype, 0, schedule, false);
 564     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 565       return res;
 566     }
 567 
 568     for (int peer = 1 ; peer < rsize ; ++peer) {
 569       res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, peer, schedule, true);
 570       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 571         return res;
 572       }
 573 
 574       res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true);
 575       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 576         return res;
 577       }
 578       /* swap left and right buffers */
 579       buf = rbuf; rbuf = lbuf ; lbuf = buf;
 580       tmprbuf ^= 1; tmplbuf ^= 1;
 581     }
 582   } else if (MPI_PROC_NULL != root) {
 583     res = NBC_Sched_send (sendbuf, false, count, datatype, root, schedule, true);
 584     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 585       return res;
 586     }
 587   }
 588 
 589   return OMPI_SUCCESS;
 590 }
 591 
 592 /*
 593  * red_sched_redscat_gather:
 594  *
 595  * Description: an implementation of Rabenseifner's Reduce algorithm [1, 2].
 596  *   [1] Rajeev Thakur, Rolf Rabenseifner and William Gropp.
 597  *       Optimization of Collective Communication Operations in MPICH //
 598  *       The Int. Journal of High Performance Computing Applications. Vol 19,
 599  *       Issue 1, pp. 49--66.
 600  *   [2] http://www.hlrs.de/mpi/myreduce.html.
 601  *
 602  * This algorithm is a combination of a reduce-scatter implemented with
  603  * recursive vector halving and recursive distance doubling, followed
  604  * by a binomial tree gather.
 605  *
 606  * Step 1. If the number of processes is not a power of two, reduce it to
 607  * the nearest lower power of two (p' = 2^{\floor{\log_2 p}})
 608  * by removing r = p - p' extra processes as follows. In the first 2r processes
 609  * (ranks 0 to 2r - 1), all the even ranks send the second half of the input
 610  * vector to their right neighbor (rank + 1), and all the odd ranks send
 611  * the first half of the input vector to their left neighbor (rank - 1).
 612  * The even ranks compute the reduction on the first half of the vector and
 613  * the odd ranks compute the reduction on the second half. The odd ranks then
 614  * send the result to their left neighbors (the even ranks). As a result,
 615  * the even ranks among the first 2r processes now contain the reduction with
 616  * the input vector on their right neighbors (the odd ranks). These odd ranks
 617  * do not participate in the rest of the algorithm, which leaves behind
 618  * a power-of-two number of processes. The first r even-ranked processes and
 619  * the last p - 2r processes are now renumbered from 0 to p' - 1.
 620  *
 621  * Step 2. The remaining processes now perform a reduce-scatter by using
 622  * recursive vector halving and recursive distance doubling. The even-ranked
 623  * processes send the second half of their buffer to rank + 1 and the odd-ranked
 624  * processes send the first half of their buffer to rank - 1. All processes
 625  * then compute the reduction between the local buffer and the received buffer.
 626  * In the next log_2(p') - 1 steps, the buffers are recursively halved, and the
 627  * distance is doubled. At the end, each of the p' processes has 1 / p' of the
 628  * total reduction result.
 629  *
 630  * Step 3. A binomial tree gather is performed by using recursive vector
 631  * doubling and distance halving. In the non-power-of-two case, if the root
 632  * happens to be one of those odd-ranked processes that would normally
 633  * be removed in the first step, then the role of this process and process 0
 634  * are interchanged.
 635  *
 636  * Limitations:
 637  *   count >= 2^{\floor{\log_2 p}}
 638  *   commutative operations only
 639  *   intra-communicators only
 640  *
 641  * Memory requirements (per process):
 642  *   rank != root: 2 * count * typesize + 4 * \log_2(p) * sizeof(int) = O(count)
 643  *   rank == root: count * typesize + 4 * \log_2(p) * sizeof(int) = O(count)
 644  *
 645  * Schedule length (rounds): O(\log(p))
  646  * Recommendation: root = 0; otherwise additional steps are required
  647  *                 at the root process.
 648  */
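/*
 * Worked example (comm_size = 6, root = 0): nsteps = 2, nprocs_pof2 = 4,
 * nprocs_rem = 2.  Ranks 0-3 run the pre-step, the odd ranks 1 and 3 drop
 * out, and the survivors get vranks {0->0, 2->1, 4->2, 5->3}.  The
 * reduce-scatter then leaves each of the four survivors with roughly count/4
 * elements of the result, which the binomial gather collects at the root.
 */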
 649 static inline int red_sched_redscat_gather(
 650     int rank, int comm_size, int root, const void *sbuf, void *rbuf,
 651     char tmpredbuf, int count, MPI_Datatype datatype, MPI_Op op, char inplace,
 652     NBC_Schedule *schedule, void *tmp_buf, struct ompi_communicator_t *comm)
 653 {
 654     int res = OMPI_SUCCESS;
 655     int *rindex = NULL, *rcount = NULL, *sindex = NULL, *scount = NULL;
 656 
 657     /* Find nearest power-of-two less than or equal to comm_size */
 658     int nsteps = opal_hibit(comm_size, comm->c_cube_dim + 1);   /* ilog2(comm_size) */
 659     if (nsteps < 1) {
 660         /* This case never happens (for comm_size < 2 other algorithms are used) */
 661         return OMPI_ERR_NOT_SUPPORTED;
 662     }
 663     int nprocs_pof2 = 1 << nsteps;                              /* flp2(comm_size) */
 664 
 665     ptrdiff_t lb, extent;
 666     ompi_datatype_get_extent(datatype, &lb, &extent);
 667 
 668     if ((rank != root) || !inplace) {
 669         res = NBC_Sched_copy((char *)sbuf, false, count, datatype,
 670                              rbuf, tmpredbuf, count, datatype, schedule, true);
 671         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 672     }
 673 
 674     /*
 675      * Step 1. Reduce the number of processes to the nearest lower power of two
 676      * p' = 2^{\floor{\log_2 p}} by removing r = p - p' processes.
 677      * 1. In the first 2r processes (ranks 0 to 2r - 1), all the even ranks send
 678      *    the second half of the input vector to their right neighbor (rank + 1)
 679      *    and all the odd ranks send the first half of the input vector to their
 680      *    left neighbor (rank - 1).
 681      * 2. All 2r processes compute the reduction on their half.
 682      * 3. The odd ranks then send the result to their left neighbors
 683      *    (the even ranks).
 684      *
 685      * The even ranks (0 to 2r - 1) now contain the reduction with the input
 686      * vector on their right neighbors (the odd ranks). The first r even
  687      * processes and the last p - 2r processes are renumbered from
 688      * 0 to 2^{\floor{\log_2 p}} - 1. These odd ranks do not participate in the
 689      * rest of the algorithm.
 690      */
 691 
 692     int vrank, step, wsize;
 693     int nprocs_rem = comm_size - nprocs_pof2;
 694 
 695     if (rank < 2 * nprocs_rem) {
 696         int count_lhalf = count / 2;
 697         int count_rhalf = count - count_lhalf;
 698 
 699         if (rank % 2 != 0) {
 700             /*
 701              * Odd process -- exchange with rank - 1
 702              * Send the left half of the input vector to the left neighbor,
 703              * Recv the right half of the input vector from the left neighbor
 704              */
 705             res = NBC_Sched_send(rbuf, tmpredbuf, count_lhalf, datatype, rank - 1,
 706                                  schedule, false);
 707             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 708 
 709             res = NBC_Sched_recv((char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
 710                                  false, count_rhalf, datatype, rank - 1, schedule, true);
 711             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 712 
 713             res = NBC_Sched_op((char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
 714                                false, (char *)rbuf + (ptrdiff_t)count_lhalf * extent,
 715                                tmpredbuf, count_rhalf, datatype, op, schedule, true);
 716             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 717 
 718             /* Send the right half to the left neighbor */
 719             res = NBC_Sched_send((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
 720                                  tmpredbuf, count_rhalf, datatype, rank - 1, schedule, true);
 721             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 722 
 723             /* This process does not participate in recursive doubling phase */
 724             vrank = -1;
 725 
 726         } else {
 727             /*
 728              * Even process -- exchange with rank + 1
 729              * Send the right half of the input vector to the right neighbor,
 730              * Recv the left half of the input vector from the right neighbor
 731              */
 732             res = NBC_Sched_send((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
 733                                  tmpredbuf, count_rhalf, datatype, rank + 1, schedule, false);
 734             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 735 
 736             res = NBC_Sched_recv((char *)tmp_buf, false, count_lhalf, datatype, rank + 1,
 737                                  schedule, true);
 738             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 739 
 740             res = NBC_Sched_op(tmp_buf, false, rbuf, tmpredbuf, count_lhalf,
 741                                datatype, op, schedule, true);
 742             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 743 
 744             /* Recv the right half from the right neighbor */
 745             res = NBC_Sched_recv((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
 746                                  tmpredbuf, count_rhalf, datatype, rank + 1, schedule, true);
 747             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 748 
 749             vrank = rank / 2;
 750         }
 751     } else { /* rank >= 2 * nprocs_rem */
 752         vrank = rank - nprocs_rem;
 753     }
 754 
 755     /*
 756      * Step 2. Reduce-scatter implemented with recursive vector halving and
 757      * recursive distance doubling. We have p' = 2^{\floor{\log_2 p}}
 758      * power-of-two number of processes with new ranks (vrank) and result in rbuf.
 759      *
 760      * The even-ranked processes send the right half of their buffer to rank + 1
 761      * and the odd-ranked processes send the left half of their buffer to
 762      * rank - 1. All processes then compute the reduction between the local
 763      * buffer and the received buffer. In the next \log_2(p') - 1 steps, the
 764      * buffers are recursively halved, and the distance is doubled. At the end,
 765      * each of the p' processes has 1 / p' of the total reduction result.
 766      */
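    /*
     * Window bookkeeping example: with count = 8 and nprocs_pof2 = 4, step 0
     * splits the 8-element window into two halves of 4 (one kept, one sent)
     * and step 1 splits the kept half into two halves of 2, so each of the
     * four participants ends up owning a 2-element slice of the result.
     */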
 767 
 768     rindex = malloc(sizeof(*rindex) * nsteps);    /* O(\log_2(p)) */
 769     sindex = malloc(sizeof(*sindex) * nsteps);
 770     rcount = malloc(sizeof(*rcount) * nsteps);
 771     scount = malloc(sizeof(*scount) * nsteps);
 772     if (NULL == rindex || NULL == sindex || NULL == rcount || NULL == scount) {
 773         res = OMPI_ERR_OUT_OF_RESOURCE;
 774         goto cleanup_and_return;
 775     }
 776 
 777     if (vrank != -1) {
 778         step = 0;
 779         wsize = count;
 780         sindex[0] = rindex[0] = 0;
 781 
 782         for (int mask = 1; mask < nprocs_pof2; mask <<= 1) {
 783             /*
  784              * On each iteration: rindex[step] = sindex[step] -- beginning of the
  785              * current window. The length of the current window is stored in wsize.
 786              */
 787             int vdest = vrank ^ mask;
 788             /* Translate vdest virtual rank to real rank */
 789             int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem;
 790 
 791             if (rank < dest) {
 792                 /*
 793                  * Recv into the left half of the current window, send the right
 794                  * half of the window to the peer (perform reduce on the left
 795                  * half of the current window)
 796                  */
 797                 rcount[step] = wsize / 2;
 798                 scount[step] = wsize - rcount[step];
 799                 sindex[step] = rindex[step] + rcount[step];
 800             } else {
 801                 /*
 802                  * Recv into the right half of the current window, send the left
 803                  * half of the window to the peer (perform reduce on the right
 804                  * half of the current window)
 805                  */
 806                 scount[step] = wsize / 2;
 807                 rcount[step] = wsize - scount[step];
 808                 rindex[step] = sindex[step] + scount[step];
 809             }
 810 
 811             /* Send part of data from the rbuf, recv into the tmp_buf */
 812             res = NBC_Sched_send((char *)rbuf + (ptrdiff_t)sindex[step] * extent,
 813                                  tmpredbuf, scount[step], datatype, dest, schedule, false);
 814             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 815             res = NBC_Sched_recv((char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
 816                                  false, rcount[step], datatype, dest, schedule, true);
 817             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 818 
 819             /* Local reduce: rbuf[] = tmp_buf[] <op> rbuf[] */
 820             res = NBC_Sched_op((char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
 821                                false, (char *)rbuf + (ptrdiff_t)rindex[step] * extent,
 822                                tmpredbuf, rcount[step], datatype, op, schedule, true);
 823             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 824 
 825             /* Move the current window to the received message */
 826             if (step + 1 < nsteps) {
 827                 rindex[step + 1] = rindex[step];
 828                 sindex[step + 1] = rindex[step];
 829                 wsize = rcount[step];
 830                 step++;
 831             }
 832         }
 833     }
 834     /*
 835      * Assertion: each process has 1 / p' of the total reduction result:
 836      * rcount[nsteps - 1] elements in the rbuf[rindex[nsteps - 1], ...].
 837      */
 838 
 839     /*
 840      * Setup the root process for gather operation.
 841      * Case 1: root < 2r and root is odd -- root process was excluded on step 1
 842      *         Recv data from process 0, vroot = 0, vrank = 0
 843      * Case 2: root < 2r and root is even: vroot = root / 2
 844      * Case 3: root >= 2r: vroot = root - r
 845      */
 846     int vroot = 0;
 847     if (root < 2 * nprocs_rem) {
 848         if (root % 2 != 0) {
 849             vroot = 0;
 850             if (rank == root) {
 851                 /*
 852                  * Case 1: root < 2r and root is odd -- root process was
  853                  * excluded on step 1 (vrank == -1).
  854                  * Recv the data from process 0.
 855                  */
 856                 rindex[0] = 0;
 857                 step = 0, wsize = count;
 858                 for (int mask = 1; mask < nprocs_pof2; mask *= 2) {
 859                     rcount[step] = wsize / 2;
 860                     scount[step] = wsize - rcount[step];
 861                     rindex[step] = 0;
 862                     sindex[step] = rcount[step];
 863                     step++;
 864                     wsize /= 2;
 865                 }
 866 
 867                 res = NBC_Sched_recv(rbuf, tmpredbuf, rcount[nsteps - 1], datatype,
 868                                      0, schedule, true);
 869                 if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 870                 vrank = 0;
 871 
 872             } else if (vrank == 0) {
  873                 /* Send data to the root */
 874                 res = NBC_Sched_send(rbuf, tmpredbuf, rcount[nsteps - 1], datatype,
 875                                      root, schedule, true);
 876                 if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 877                 vrank = -1;
 878             }
 879         } else {
  880             /* Case 2: root < 2r and root is even: vroot = root / 2 */
 881             vroot = root / 2;
 882         }
 883     } else {
  884         /* Case 3: root >= 2r: vroot = root - r */
 885         vroot = root - nprocs_rem;
 886     }
 887 
 888     /*
 889      * Step 3. Gather result at the vroot by the binomial tree algorithm.
 890      * Each process has 1 / p' of the total reduction result:
 891      * rcount[nsteps - 1] elements in the rbuf[rindex[nsteps - 1], ...].
 892      * All exchanges are executed in reverse order relative
 893      * to recursive doubling (previous step).
 894      */
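    /*
     * At every gather step the vdest_tree/vroot_tree masks below compare the
     * leading bits of the peer's vrank and of vroot: if the peer already lies
     * in the subtree containing vroot, this process sends its slice to the
     * peer and leaves the loop; otherwise it receives the peer's slice and
     * continues with a window twice as large.
     */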
 895 
 896     if (vrank != -1) {
 897         int vdest_tree, vroot_tree;
 898         step = nsteps - 1; /* step = ilog2(p') - 1 */
 899 
 900         for (int mask = nprocs_pof2 >> 1; mask > 0; mask >>= 1) {
 901             int vdest = vrank ^ mask;
 902             /* Translate vdest virtual rank to real rank */
 903             int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem;
 904             if ((vdest == 0) && (root < 2 * nprocs_rem) && (root % 2 != 0))
 905                 dest = root;
 906 
 907             vdest_tree = vdest >> step;
 908             vdest_tree <<= step;
 909             vroot_tree = vroot >> step;
 910             vroot_tree <<= step;
 911             if (vdest_tree == vroot_tree) {
 912                 /* Send data from rbuf and exit */
 913 
 914                 res = NBC_Sched_send((char *)rbuf + (ptrdiff_t)rindex[step] * extent,
 915                                      tmpredbuf, rcount[step], datatype, dest, schedule, false);
 916                 if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 917                 break;
 918             } else {
 919                 /* Recv and continue */
 920                 res = NBC_Sched_recv((char *)rbuf + (ptrdiff_t)sindex[step] * extent,
 921                                      tmpredbuf, scount[step], datatype, dest, schedule, true);
 922                 if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 923             }
 924             step--;
 925         }
 926     }
 927 
 928   cleanup_and_return:
 929     if (NULL != rindex)
 930         free(rindex);
 931     if (NULL != sindex)
 932         free(sindex);
 933     if (NULL != rcount)
 934         free(rcount);
 935     if (NULL != scount)
 936         free(scount);
 937     return res;
 938 }
 939 
 940 int ompi_coll_libnbc_reduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype,
 941                                  MPI_Op op, int root, struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request,
 942                                  struct mca_coll_base_module_2_3_0_t *module) {
 943     int res = nbc_reduce_init(sendbuf, recvbuf, count, datatype, op, root,
 944                               comm, request, module, true);
 945     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 946         return res;
 947     }
 948 
 949     return OMPI_SUCCESS;
 950 }
 951 
 952 int ompi_coll_libnbc_reduce_inter_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype,
 953                                        MPI_Op op, int root, struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request,
 954                                        struct mca_coll_base_module_2_3_0_t *module) {
 955     int res = nbc_reduce_inter_init(sendbuf, recvbuf, count, datatype, op, root,
 956                                     comm, request, module, true);
 957     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 958         return res;
 959     }
 960 
 961     return OMPI_SUCCESS;
 962 }
