root/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. nbc_reduce_scatter_init
  2. ompi_coll_libnbc_ireduce_scatter
  3. nbc_reduce_scatter_inter_init
  4. ompi_coll_libnbc_ireduce_scatter_inter
  5. ompi_coll_libnbc_reduce_scatter_init
  6. ompi_coll_libnbc_reduce_scatter_inter_init

   1 /* -*- Mode: C; c-basic-offset:2 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2006      The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2006      The Technical University of Chemnitz. All
   7  *                         rights reserved.
   8  * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
   9  *                         reserved.
  10  * Copyright (c) 2014-2018 Research Organization for Information Science
  11  *                         and Technology (RIST).  All rights reserved.
  12  * Copyright (c) 2015      The University of Tennessee and The University
  13  *                         of Tennessee Research Foundation. All rights
  14  *                         reserved.
  15  * Copyright (c) 2017      IBM Corporation.  All rights reserved.
  16  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
  17  * $COPYRIGHT$
  18  *
  19  * Additional copyrights may follow
  20  *
  21  * Author(s): Torsten Hoefler <htor@cs.indiana.edu>
  22  *
  23  */
  24 #include "opal/align.h"
  25 
  26 #include "nbc_internal.h"
  27 
  28 /* a reduce_scatter schedule cannot be cached easily because the contents
  29  * of the recvcounts array may change, so a comparison of the address
  30  * would not be sufficient ... we simply do not cache it */
  31 
  32 /* binomial reduce to rank 0 followed by a linear scatter ...
  33  *
  34  * Algorithm:
  35  * pairwise exchange
  36  * round r:
  37  *  grp = rank % 2^r
  38  *  if grp == 0: receive from rank + 2^(r-1) if it exists and reduce value
  39  *  if grp != 0: send to rank - 2^(r-1) and exit function
  40  *
  41  * do this for R=log_2(p) rounds
  42  *
  43  */
  44 
  45 static int nbc_reduce_scatter_init(const void* sendbuf, void* recvbuf, const int *recvcounts, MPI_Datatype datatype,
  46                                    MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
  47                                    struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
  48   int peer, rank, maxr, p, res, count;
  49   MPI_Aint ext;
  50   ptrdiff_t gap, span, span_align;
  51   char *sbuf, inplace;
  52   NBC_Schedule *schedule;
  53   void *tmpbuf;
  54   ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
  55   char *rbuf, *lbuf, *buf;
  56 
  57   NBC_IN_PLACE(sendbuf, recvbuf, inplace);
  58 
  59   rank = ompi_comm_rank (comm);
  60   p = ompi_comm_size (comm);
  61 
  62   res = ompi_datatype_type_extent (datatype, &ext);
  63   if (MPI_SUCCESS != res) {
  64     NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
  65     return res;
  66   }
  67 
  68   count = 0;
  69   for (int r = 0 ; r < p ; ++r) {
  70     count += recvcounts[r];
  71   }
  72 
  73   if ((1 == p && (!persistent || inplace)) || 0 == count) {
  74     if (!inplace) {
  75       /* single node not in_place: copy data to recvbuf */
  76       res = NBC_Copy(sendbuf, recvcounts[0], datatype, recvbuf, recvcounts[0], datatype, comm);
  77       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
  78         return res;
  79       }
  80     }
  81 
  82     return nbc_get_noop_request(persistent, request);
  83   }
  84 
  85   maxr = (int) ceil ((log((double) p) / LOG2));
  86 
  87   span = opal_datatype_span(&datatype->super, count, &gap);
  88   span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t);
  89   tmpbuf = malloc (span_align + span);
  90   if (OPAL_UNLIKELY(NULL == tmpbuf)) {
  91     return OMPI_ERR_OUT_OF_RESOURCE;
  92   }
  93 
  94   rbuf = (char *)(-gap);
  95   lbuf = (char *)(span_align - gap);
  96 
  97   schedule = OBJ_NEW(NBC_Schedule);
  98   if (OPAL_UNLIKELY(NULL == schedule)) {
  99     free(tmpbuf);
 100     return OMPI_ERR_OUT_OF_RESOURCE;
 101   }
 102 
 103   for (int r = 1, firstred = 1 ; r <= maxr ; ++r) {
 104     if ((rank % (1 << r)) == 0) {
 105       /* we have to receive this round */
 106       peer = rank + (1 << (r - 1));
 107       if (peer < p) {
 108         /* we have to wait until we have the data */
 109         res = NBC_Sched_recv(rbuf, true, count, datatype, peer, schedule, true);
 110         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 111           OBJ_RELEASE(schedule);
 112           free(tmpbuf);
 113           return res;
 114         }
 115 
 116         /* this cannot be done until tmpbuf is unused :-( so barrier after the op */
 117         if (firstred) {
 118           /* take reduce data from the sendbuf in the first round -> save copy */
 119           res = NBC_Sched_op (sendbuf, false, rbuf, true, count, datatype, op, schedule, true);
 120           firstred = 0;
 121         } else {
 122           /* perform the reduce in my local buffer */
 123           res = NBC_Sched_op (lbuf, true, rbuf, true, count, datatype, op, schedule, true);
 124         }
 125 
 126         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 127           OBJ_RELEASE(schedule);
 128           free(tmpbuf);
 129           return res;
 130         }
 131         /* swap left and right buffers */
 132         buf = rbuf; rbuf = lbuf ; lbuf = buf;
 133       }
 134     } else {
 135       /* we have to send this round */
 136       peer = rank - (1 << (r - 1));
 137       if (firstred) {
 138         /* we have to send the senbuf */
 139         res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false);
 140       } else {
 141         /* we send an already reduced value from lbuf */
 142         res = NBC_Sched_send (lbuf, true, count, datatype, peer, schedule, false);
 143       }
 144       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 145         OBJ_RELEASE(schedule);
 146         free(tmpbuf);
 147         return res;
 148       }
 149 
 150       /* leave the game */
 151       break;
 152     }
 153   }
 154 
 155   res = NBC_Sched_barrier(schedule);
 156   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 157     OBJ_RELEASE(schedule);
 158     free(tmpbuf);
 159     return res;
 160   }
 161 
 162   /* rank 0 is root and sends - all others receive */
 163   if (rank == 0) {
 164     for (long int r = 1, offset = 0 ; r < p ; ++r) {
 165       offset += recvcounts[r-1];
 166       sbuf = lbuf + (offset*ext);
 167       /* root sends the right buffer to the right receiver */
 168       res = NBC_Sched_send (sbuf, true, recvcounts[r], datatype, r, schedule,
 169                             false);
 170       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 171         OBJ_RELEASE(schedule);
 172         free(tmpbuf);
 173         return res;
 174       }
 175     }
 176 
 177     if (p == 1) {
 178       /* single node not in_place: copy data to recvbuf */
 179       res = NBC_Sched_copy ((void *)sendbuf, false, recvcounts[0], datatype,
 180                             recvbuf, false, recvcounts[0], datatype, schedule, false);
 181     } else {
 182       res = NBC_Sched_copy (lbuf, true, recvcounts[0], datatype, recvbuf, false,
 183                             recvcounts[0], datatype, schedule, false);
 184     }
 185   } else {
 186     res = NBC_Sched_recv (recvbuf, false, recvcounts[rank], datatype, 0, schedule, false);
 187   }
 188 
 189   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 190     OBJ_RELEASE(schedule);
 191     free(tmpbuf);
 192     return res;
 193   }
 194 
 195   res = NBC_Sched_commit (schedule);
 196   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 197     OBJ_RELEASE(schedule);
 198     free(tmpbuf);
 199     return res;
 200   }
 201 
 202   res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf);
 203   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 204     OBJ_RELEASE(schedule);
 205     free(tmpbuf);
 206     return res;
 207   }
 208 
 209   return OMPI_SUCCESS;
 210 }
 211 
 212 int ompi_coll_libnbc_ireduce_scatter (const void* sendbuf, void* recvbuf, const int *recvcounts, MPI_Datatype datatype,
 213                                       MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
 214                                       struct mca_coll_base_module_2_3_0_t *module) {
 215     int res = nbc_reduce_scatter_init(sendbuf, recvbuf, recvcounts, datatype, op,
 216                                       comm, request, module, false);
 217     if (OPAL_LIKELY(OMPI_SUCCESS != res)) {
 218         return res;
 219     }
 220     res = NBC_Start(*(ompi_coll_libnbc_request_t **)request);
 221     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 222         NBC_Return_handle (*(ompi_coll_libnbc_request_t **)request);
 223         *request = &ompi_request_null.request;
 224         return res;
 225     }
 226 
 227     return OMPI_SUCCESS;
 228 }
 229 static int nbc_reduce_scatter_inter_init (const void* sendbuf, void* recvbuf, const int *recvcounts, MPI_Datatype datatype,
 230                                           MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
 231                                           struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
 232   int rank, res, count, lsize, rsize;
 233   MPI_Aint ext;
 234   ptrdiff_t gap, span, span_align;
 235   NBC_Schedule *schedule;
 236   void *tmpbuf = NULL;
 237   ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
 238 
 239   rank = ompi_comm_rank (comm);
 240   lsize = ompi_comm_size(comm);
 241   rsize = ompi_comm_remote_size (comm);
 242 
 243   res = ompi_datatype_type_extent (datatype, &ext);
 244   if (MPI_SUCCESS != res) {
 245     NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
 246     return res;
 247   }
 248 
 249   count = 0;
 250   for (int r = 0 ; r < lsize ; ++r) {
 251     count += recvcounts[r];
 252   }
 253 
 254   span = opal_datatype_span(&datatype->super, count, &gap);
 255   span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t);
 256 
 257   if (count > 0) {
 258     tmpbuf = malloc (span_align + span);
 259     if (OPAL_UNLIKELY(NULL == tmpbuf)) {
 260       return OMPI_ERR_OUT_OF_RESOURCE;
 261     }
 262   }
 263 
 264   schedule = OBJ_NEW(NBC_Schedule);
 265   if (OPAL_UNLIKELY(NULL == schedule)) {
 266     free(tmpbuf);
 267     return OMPI_ERR_OUT_OF_RESOURCE;
 268   }
 269 
 270   /* send my data to the remote root */
 271   res = NBC_Sched_send(sendbuf, false, count, datatype, 0, schedule, false);
 272   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 273     OBJ_RELEASE(schedule);
 274     free(tmpbuf);
 275     return res;
 276   }
 277 
 278   if (0 == rank) {
 279     char *lbuf, *rbuf;
 280     lbuf = (char *)(-gap);
 281     rbuf = (char *)(span_align-gap);
 282     res = NBC_Sched_recv (lbuf, true, count, datatype, 0, schedule, true);
 283     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 284       OBJ_RELEASE(schedule);
 285       free(tmpbuf);
 286       return res;
 287     }
 288 
 289     for (int peer = 1 ; peer < rsize ; ++peer) {
 290       char *tbuf;
 291       res = NBC_Sched_recv (rbuf, true, count, datatype, peer, schedule, true);
 292       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 293         OBJ_RELEASE(schedule);
 294         free(tmpbuf);
 295         return res;
 296       }
 297 
 298       res = NBC_Sched_op (lbuf, true, rbuf, true, count, datatype,
 299                           op, schedule, true);
 300       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 301         OBJ_RELEASE(schedule);
 302         free(tmpbuf);
 303         return res;
 304       }
 305       tbuf = lbuf; lbuf = rbuf; rbuf = tbuf;
 306     }
 307 
 308     /* do the local scatterv with the local communicator */
 309     res = NBC_Sched_copy (lbuf, true, recvcounts[0], datatype, recvbuf, false,
 310                           recvcounts[0], datatype, schedule, false);
 311     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 312       OBJ_RELEASE(schedule);
 313       free(tmpbuf);
 314       return res;
 315     }
 316     for (int peer = 1, offset = recvcounts[0] * ext; peer < lsize ; ++peer) {
 317       res = NBC_Sched_local_send (lbuf + offset, true, recvcounts[peer], datatype, peer, schedule,
 318                                   false);
 319       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 320         OBJ_RELEASE(schedule);
 321         free(tmpbuf);
 322         return res;
 323       }
 324 
 325       offset += recvcounts[peer] * ext;
 326     }
 327   } else {
 328     /* receive my block */
 329     res = NBC_Sched_local_recv (recvbuf, false, recvcounts[rank], datatype, 0, schedule, false);
 330     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 331       OBJ_RELEASE(schedule);
 332       free(tmpbuf);
 333       return res;
 334     }
 335   }
 336 
 337   res = NBC_Sched_commit (schedule);
 338   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 339     OBJ_RELEASE(schedule);
 340     free(tmpbuf);
 341     return res;
 342   }
 343 
 344   res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf);
 345   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 346     OBJ_RELEASE(schedule);
 347     free(tmpbuf);
 348     return res;
 349   }
 350 
 351   return OMPI_SUCCESS;
 352 }
 353 
 354 int ompi_coll_libnbc_ireduce_scatter_inter (const void* sendbuf, void* recvbuf, const int *recvcounts, MPI_Datatype datatype,
 355                                             MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
 356                                             struct mca_coll_base_module_2_3_0_t *module) {
 357     int res = nbc_reduce_scatter_inter_init(sendbuf, recvbuf, recvcounts, datatype, op,
 358                                             comm, request, module, false);
 359     if (OPAL_LIKELY(OMPI_SUCCESS != res)) {
 360         return res;
 361     }
 362     res = NBC_Start(*(ompi_coll_libnbc_request_t **)request);
 363     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 364         NBC_Return_handle (*(ompi_coll_libnbc_request_t **)request);
 365         *request = &ompi_request_null.request;
 366         return res;
 367     }
 368 
 369     return OMPI_SUCCESS;
 370 }
 371 
 372 int ompi_coll_libnbc_reduce_scatter_init(const void* sendbuf, void* recvbuf, const int *recvcounts, MPI_Datatype datatype,
 373                                          MPI_Op op, struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request,
 374                                          struct mca_coll_base_module_2_3_0_t *module) {
 375     int res = nbc_reduce_scatter_init(sendbuf, recvbuf, recvcounts, datatype, op,
 376                                       comm, request, module, true);
 377     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 378         return res;
 379     }
 380 
 381     return OMPI_SUCCESS;
 382 }
 383 
 384 int ompi_coll_libnbc_reduce_scatter_inter_init(const void* sendbuf, void* recvbuf, const int *recvcounts, MPI_Datatype datatype,
 385                                                MPI_Op op, struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request,
 386                                                struct mca_coll_base_module_2_3_0_t *module) {
 387     int res = nbc_reduce_scatter_inter_init(sendbuf, recvbuf, recvcounts, datatype, op,
 388                                             comm, request, module, true);
 389     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 390         return res;
 391     }
 392 
 393     return OMPI_SUCCESS;
 394 }

/* [<][>][^][v][top][bottom][index][help] */