root/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. nbc_reduce_scatter_block_init
  2. ompi_coll_libnbc_ireduce_scatter_block
  3. nbc_reduce_scatter_block_inter_init
  4. ompi_coll_libnbc_ireduce_scatter_block_inter
  5. ompi_coll_libnbc_reduce_scatter_block_init
  6. ompi_coll_libnbc_reduce_scatter_block_inter_init

   1 /* -*- Mode: C; c-basic-offset:2 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2006      The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2006      The Technical University of Chemnitz. All
   7  *                         rights reserved.
   8  * Copyright (c) 2012      Sandia National Laboratories. All rights reserved.
   9  * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
  10  *                         reserved.
  11  * Copyright (c) 2014-2018 Research Organization for Information Science
  12  *                         and Technology (RIST).  All rights reserved.
  13  * Copyright (c) 2017      IBM Corporation.  All rights reserved.
  14  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
  15  * $COPYRIGHT$
  16  *
  17  * Additional copyrights may follow
  18  *
  19  * Author(s): Torsten Hoefler <htor@cs.indiana.edu>
  20  *
  21  */
  22 #include "opal/align.h"
  23 
  24 #include "nbc_internal.h"
  25 
   26 /* a reduce_scatter schedule cannot be cached easily because the contents
   27  * of the recvcount value may change, so a comparison of the address
   28  * would not be sufficient ... we simply do not cache it */
  29 
  30 /* binomial reduce to rank 0 followed by a linear scatter ...
  31  *
  32  * Algorithm:
  33  * pairwise exchange
  34  * round r:
  35  *  grp = rank % 2^r
  36  *  if grp == 0: receive from rank + 2^(r-1) if it exists and reduce value
   37  *  if grp != 0: send to rank - 2^(r-1) and exit function
  38  *
  39  * do this for R=log_2(p) rounds
  40  *
  41  */
  42 
  43 static int nbc_reduce_scatter_block_init(const void* sendbuf, void* recvbuf, int recvcount, MPI_Datatype datatype,
  44                                          MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
  45                                          struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
  46   int peer, rank, maxr, p, res, count;
  47   MPI_Aint ext;
  48   ptrdiff_t gap, span;
  49   char *redbuf, *sbuf, inplace;
  50   NBC_Schedule *schedule;
  51   void *tmpbuf = NULL;
  52   ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
  53 
  54   NBC_IN_PLACE(sendbuf, recvbuf, inplace);
  55 
  56   rank = ompi_comm_rank (comm);
  57   p = ompi_comm_size (comm);
  58 
  59   res = ompi_datatype_type_extent(datatype, &ext);
  60   if (MPI_SUCCESS != res || 0 == ext) {
  61     NBC_Error ("MPI Error in ompi_datatype_type_extent() (%i:%i)", res, (int) ext);
  62     return (MPI_SUCCESS == res) ? MPI_ERR_SIZE : res;
  63   }
  64 
  65   schedule = OBJ_NEW(NBC_Schedule);
  66   if (NULL == schedule) {
  67     return OMPI_ERR_OUT_OF_RESOURCE;
  68   }
  69 
  70   maxr = (int)ceil((log((double)p)/LOG2));
  71 
  72   count = p * recvcount;
  73 
  74   if (0 < count) {
  75     char *rbuf, *lbuf, *buf;
  76     ptrdiff_t span_align;
  77 
  78     span = opal_datatype_span(&datatype->super, count, &gap);
  79     span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t);
  80     tmpbuf = malloc (span_align + span);
  81     if (NULL == tmpbuf) {
  82       OBJ_RELEASE(schedule);
  83       return OMPI_ERR_OUT_OF_RESOURCE;
  84     }
  85 
  86     rbuf = (void *)(-gap);
  87     lbuf = (char *)(span_align - gap);
  88     redbuf = (char *) tmpbuf + span_align - gap;
  89 
  90     /* copy data to redbuf if we only have a single node */
  91     if ((p == 1) && !inplace) {
  92       res = NBC_Sched_copy ((void *)sendbuf, false, count, datatype,
  93                             redbuf, false, count, datatype, schedule, false);
  94       if (OMPI_SUCCESS != res) {
  95         OBJ_RELEASE(schedule);
  96         free(tmpbuf);
  97         return res;
  98       }
  99     }
 100 
 101     for (int r = 1, firstred = 1 ; r <= maxr; ++r) {
 102       if ((rank % (1 << r)) == 0) {
 103         /* we have to receive this round */
 104         peer = rank + (1 << (r - 1));
 105         if (peer < p) {
 106           /* we have to wait until we have the data */
 107           res = NBC_Sched_recv (rbuf, true, count, datatype, peer, schedule, true);
 108           if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 109             OBJ_RELEASE(schedule);
 110             free(tmpbuf);
 111             return res;
 112           }
 113 
 114           if (firstred) {
 115             /* take reduce data from the sendbuf in the first round -> save copy */
 116             res = NBC_Sched_op (sendbuf, false, rbuf, true, count, datatype, op, schedule, true);
 117             firstred = 0;
 118           } else {
 119           /* perform the reduce in my local buffer */
 120             res = NBC_Sched_op (lbuf, true, rbuf, true, count, datatype, op, schedule, true);
 121           }
 122 
 123           if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 124             OBJ_RELEASE(schedule);
 125             free(tmpbuf);
 126             return res;
 127           }
 128           /* swap left and right buffers */
 129           buf = rbuf; rbuf = lbuf ; lbuf = buf;
 130         }
 131       } else {
 132         /* we have to send this round */
 133         peer = rank - (1 << (r - 1));
 134         if(firstred) {
 135           /* we have to send the senbuf */
 136           res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false);
 137         } else {
 138           /* we send an already reduced value from redbuf */
 139           res = NBC_Sched_send (lbuf, true, count, datatype, peer, schedule, false);
 140         }
 141 
 142         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 143           OBJ_RELEASE(schedule);
 144           free(tmpbuf);
 145           return res;
 146         }
 147 
 148         /* leave the game */
 149         break;
 150       }
 151     }
 152 
 153     res = NBC_Sched_barrier(schedule);
 154     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 155       OBJ_RELEASE(schedule);
 156       free(tmpbuf);
 157       return res;
 158     }
 159 
 160     /* rank 0 is root and sends - all others receive */
 161     if (rank != 0) {
 162       res = NBC_Sched_recv (recvbuf, false, recvcount, datatype, 0, schedule, false);
 163       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 164         OBJ_RELEASE(schedule);
 165         free(tmpbuf);
 166         return res;
 167       }
 168     } else {
 169       for (int r = 1, offset = 0 ; r < p ; ++r) {
 170         offset += recvcount;
 171         sbuf = lbuf + (offset*ext);
 172         /* root sends the right buffer to the right receiver */
 173         res = NBC_Sched_send (sbuf, true, recvcount, datatype, r, schedule, false);
 174         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 175           OBJ_RELEASE(schedule);
 176           free(tmpbuf);
 177           return res;
 178         }
 179       }
 180 
 181       if ((p != 1) || !inplace) {
 182         res = NBC_Sched_copy (lbuf, true, recvcount, datatype, recvbuf, false, recvcount,
 183                               datatype, schedule, false);
 184       }
 185       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 186         OBJ_RELEASE(schedule);
 187         free(tmpbuf);
 188         return res;
 189       }
 190     }
 191   }
 192 
 193   res = NBC_Sched_commit (schedule);
 194   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 195     OBJ_RELEASE(schedule);
 196     free(tmpbuf);
 197     return res;
 198   }
 199 
 200   res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf);
 201   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 202     OBJ_RELEASE(schedule);
 203     free(tmpbuf);
 204     return res;
 205   }
 206 
 207   return OMPI_SUCCESS;
 208 }
 209 
 210 int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, int recvcount, MPI_Datatype datatype,
 211                                            MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
 212                                            struct mca_coll_base_module_2_3_0_t *module) {
 213     int res = nbc_reduce_scatter_block_init(sendbuf, recvbuf, recvcount, datatype, op,
 214                                             comm, request, module, false);
 215     if (OPAL_LIKELY(OMPI_SUCCESS != res)) {
 216         return res;
 217     }
 218     res = NBC_Start(*(ompi_coll_libnbc_request_t **)request);
 219     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 220         NBC_Return_handle (*(ompi_coll_libnbc_request_t **)request);
 221         *request = &ompi_request_null.request;
 222         return res;
 223     }
 224 
 225     return OMPI_SUCCESS;
 226 }
 227 
 228 static int nbc_reduce_scatter_block_inter_init(const void *sendbuf, void *recvbuf, int rcount, struct ompi_datatype_t *dtype,
 229                                                struct ompi_op_t *op, struct ompi_communicator_t *comm, ompi_request_t **request,
 230                                                struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
 231   int rank, res, count, lsize, rsize;
 232   MPI_Aint ext;
 233   ptrdiff_t gap, span, span_align;
 234   NBC_Schedule *schedule;
 235   void *tmpbuf = NULL;
 236   ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
 237 
 238   rank = ompi_comm_rank (comm);
 239   lsize = ompi_comm_size (comm);
 240   rsize = ompi_comm_remote_size (comm);
 241 
 242   res = ompi_datatype_type_extent (dtype, &ext);
 243   if (MPI_SUCCESS != res) {
 244     NBC_Error ("MPI Error in ompi_datatype_type_extent() (%i)", res);
 245     return res;
 246   }
 247 
 248   count = rcount * lsize;
 249 
 250   span = opal_datatype_span(&dtype->super, count, &gap);
 251   span_align = OPAL_ALIGN(span, dtype->super.align, ptrdiff_t);
 252 
 253   if (count > 0) {
 254     tmpbuf = malloc (span_align + span);
 255     if (NULL == tmpbuf) {
 256       return OMPI_ERR_OUT_OF_RESOURCE;
 257     }
 258   }
 259 
 260   schedule = OBJ_NEW(NBC_Schedule);
 261   if (NULL == schedule) {
 262     free(tmpbuf);
 263     return OMPI_ERR_OUT_OF_RESOURCE;
 264   }
 265 
 266   /* send my data to the remote root */
 267   res = NBC_Sched_send (sendbuf, false, count, dtype, 0, schedule, false);
 268   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 269     OBJ_RELEASE(schedule);
 270     free(tmpbuf);
 271     return res;
 272   }
 273 
 274   if (0 == rank) {
 275     char *lbuf, *rbuf;
 276     lbuf = (char *)(-gap);
 277     rbuf = (char *)(span_align-gap);
 278     res = NBC_Sched_recv (lbuf, true, count, dtype, 0, schedule, true);
 279     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 280       OBJ_RELEASE(schedule);
 281       free(tmpbuf);
 282       return res;
 283     }
 284 
 285     for (int peer = 1 ; peer < rsize ; ++peer) {
 286       char *tbuf;
 287       res = NBC_Sched_recv (rbuf, true, count, dtype, peer, schedule, true);
 288       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 289         OBJ_RELEASE(schedule);
 290         free(tmpbuf);
 291         return res;
 292       }
 293 
 294       res = NBC_Sched_op (lbuf, true, rbuf, true, count, dtype,
 295                           op, schedule, true);
 296       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 297         OBJ_RELEASE(schedule);
 298         free(tmpbuf);
 299         return res;
 300       }
 301       tbuf = lbuf; lbuf = rbuf; rbuf = tbuf;
 302     }
 303 
 304     /* do the scatter with the local communicator */
 305     res = NBC_Sched_copy (lbuf, true, rcount, dtype, recvbuf, false, rcount,
 306                           dtype, schedule, false);
 307     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 308       OBJ_RELEASE(schedule);
 309       free(tmpbuf);
 310       return res;
 311     }
 312     for (int peer = 1 ; peer < lsize ; ++peer) {
 313       res = NBC_Sched_local_send (lbuf + ext * rcount * peer, true, rcount, dtype, peer, schedule, false);
 314       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 315         OBJ_RELEASE(schedule);
 316         free(tmpbuf);
 317         return res;
 318       }
 319     }
 320   } else {
 321     /* receive my block */
 322     res = NBC_Sched_local_recv(recvbuf, false, rcount, dtype, 0, schedule, false);
 323     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 324       OBJ_RELEASE(schedule);
 325       free(tmpbuf);
 326       return res;
 327     }
 328   }
 329 
 330   /*NBC_PRINT_SCHED(*schedule);*/
 331 
 332   res = NBC_Sched_commit(schedule);
 333   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 334     OBJ_RELEASE(schedule);
 335     free(tmpbuf);
 336     return res;
 337   }
 338 
 339   res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf);
 340   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 341     OBJ_RELEASE(schedule);
 342     free(tmpbuf);
 343     return res;
 344   }
 345 
 346   return OMPI_SUCCESS;
 347 }
 348 
 349 int ompi_coll_libnbc_ireduce_scatter_block_inter(const void* sendbuf, void* recvbuf, int recvcount, MPI_Datatype datatype,
 350                                                  MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
 351                                                  struct mca_coll_base_module_2_3_0_t *module) {
 352     int res = nbc_reduce_scatter_block_inter_init(sendbuf, recvbuf, recvcount, datatype, op,
 353                                                   comm, request, module, false);
 354     if (OPAL_LIKELY(OMPI_SUCCESS != res)) {
 355         return res;
 356     }
 357     res = NBC_Start(*(ompi_coll_libnbc_request_t **)request);
 358     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 359         NBC_Return_handle (*(ompi_coll_libnbc_request_t **)request);
 360         *request = &ompi_request_null.request;
 361         return res;
 362     }
 363 
 364     return OMPI_SUCCESS;
 365 }
 366 
 367 int ompi_coll_libnbc_reduce_scatter_block_init(const void* sendbuf, void* recvbuf, int recvcount, MPI_Datatype datatype,
 368                                                MPI_Op op, struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request,
 369                                                struct mca_coll_base_module_2_3_0_t *module) {
 370     int res = nbc_reduce_scatter_block_init(sendbuf, recvbuf, recvcount, datatype, op,
 371                                             comm, request, module, true);
 372     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 373         return res;
 374     }
 375 
 376     return OMPI_SUCCESS;
 377 }
 378 
 379 int ompi_coll_libnbc_reduce_scatter_block_inter_init(const void* sendbuf, void* recvbuf, int recvcount, MPI_Datatype datatype,
 380                                                      MPI_Op op, struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request,
 381                                                      struct mca_coll_base_module_2_3_0_t *module) {
 382     int res = nbc_reduce_scatter_block_inter_init(sendbuf, recvbuf, recvcount, datatype, op,
 383                                                   comm, request, module, true);
 384     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 385         return res;
 386     }
 387 
 388     return OMPI_SUCCESS;
 389 }

/* [<][>][^][v][top][bottom][index][help] */