root/ompi/mca/coll/libnbc/nbc_iallgather.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. NBC_Allgather_args_compare
  2. nbc_allgather_init
  3. ompi_coll_libnbc_iallgather
  4. nbc_allgather_inter_init
  5. ompi_coll_libnbc_iallgather_inter
  6. allgather_sched_linear
  7. allgather_sched_recursivedoubling
  8. ompi_coll_libnbc_allgather_init
  9. ompi_coll_libnbc_allgather_inter_init

   1 /* -*- Mode: C; c-basic-offset:2 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2006      The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2006      The Technical University of Chemnitz. All
   7  *                         rights reserved.
   8  * Copyright (c) 2014-2017 Research Organization for Information Science
   9  *                         and Technology (RIST). All rights reserved.
  10  * Copyright (c) 2015      Los Alamos National Security, LLC.  All rights
  11  *                         reserved.
  12  * Copyright (c) 2017      IBM Corporation.  All rights reserved.
  13  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
  14  * $COPYRIGHT$
  15  *
  16  * Additional copyrights may follow
  17  *
  18  * Author(s): Torsten Hoefler <htor@cs.indiana.edu>
  19  *
  20  */
  21 #include "nbc_internal.h"
  22 
  23 static inline int allgather_sched_linear(
  24     int rank, int comm_size, NBC_Schedule *schedule, const void *sendbuf,
  25     int scount, struct ompi_datatype_t *sdtype, void *recvbuf, int rcount,
  26     struct ompi_datatype_t *rdtype);
  27 static inline int allgather_sched_recursivedoubling(
  28     int rank, int comm_size, NBC_Schedule *schedule, const void *sbuf,
  29     int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount,
  30     struct ompi_datatype_t *rdtype);
  31 
  32 #ifdef NBC_CACHE_SCHEDULE
  33 /* tree comparison function for schedule cache */
  34 int NBC_Allgather_args_compare(NBC_Allgather_args *a, NBC_Allgather_args *b, void *param) {
  35   if ((a->sendbuf == b->sendbuf) &&
  36       (a->sendcount == b->sendcount) &&
  37       (a->sendtype == b->sendtype) &&
  38       (a->recvbuf == b->recvbuf) &&
  39       (a->recvcount == b->recvcount) &&
  40       (a->recvtype == b->recvtype) ) {
  41     return 0;
  42   }
  43 
  44   if( a->sendbuf < b->sendbuf ) {
  45     return -1;
  46   }
  47 
  48   return 1;
  49 }
  50 #endif
  51 
  52 static int nbc_allgather_init(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount,
  53                               MPI_Datatype recvtype, struct ompi_communicator_t *comm, ompi_request_t ** request,
  54                               struct mca_coll_base_module_2_3_0_t *module, bool persistent)
  55 {
  56   int rank, p, res;
  57   MPI_Aint rcvext;
  58   NBC_Schedule *schedule;
  59   char *rbuf, inplace;
  60 #ifdef NBC_CACHE_SCHEDULE
  61   NBC_Allgather_args *args, *found, search;
  62 #endif
  63   enum { NBC_ALLGATHER_LINEAR, NBC_ALLGATHER_RDBL} alg;
  64   ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
  65 
  66   NBC_IN_PLACE(sendbuf, recvbuf, inplace);
  67 
  68   rank = ompi_comm_rank (comm);
  69   p = ompi_comm_size (comm);
  70   int is_commsize_pow2 = !(p & (p - 1));
  71 
  72   if (libnbc_iallgather_algorithm == 0) {
  73     alg = NBC_ALLGATHER_LINEAR;
  74   } else {
  75     /* user forced dynamic decision */
  76     if (libnbc_iallgather_algorithm == 1) {
  77       alg = NBC_ALLGATHER_LINEAR;
  78     } else if (libnbc_iallgather_algorithm == 2 && is_commsize_pow2) {
  79       alg = NBC_ALLGATHER_RDBL;
  80     } else {
  81       alg = NBC_ALLGATHER_LINEAR;
  82     }
  83   }
  84 
  85   res = ompi_datatype_type_extent(recvtype, &rcvext);
  86   if (MPI_SUCCESS != res) {
  87     return res;
  88   }
  89 
  90   if (inplace) {
  91     sendtype = recvtype;
  92     sendcount = recvcount;
  93   } else if (!persistent) { /* for persistent, the copy must be scheduled */
  94     /* copy my data to receive buffer */
  95     rbuf = (char *) recvbuf + rank * recvcount * rcvext;
  96     res = NBC_Copy (sendbuf, sendcount, sendtype, rbuf, recvcount, recvtype, comm);
  97     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
  98       return res;
  99     }
 100   }
 101   if (1 == p && (!persistent || inplace)) {
 102     return nbc_get_noop_request(persistent, request);
 103   }
 104 
 105 #ifdef NBC_CACHE_SCHEDULE
 106   /* search schedule in communicator specific tree */
 107   search.sendbuf = sendbuf;
 108   search.sendcount = sendcount;
 109   search.sendtype = sendtype;
 110   search.recvbuf = recvbuf;
 111   search.recvcount = recvcount;
 112   search.recvtype = recvtype;
 113   found = (NBC_Allgather_args *) hb_tree_search ((hb_tree*)libnbc_module->NBC_Dict[NBC_ALLGATHER], &search);
 114   if (NULL == found) {
 115 #endif
 116     schedule = OBJ_NEW(NBC_Schedule);
 117     if (OPAL_UNLIKELY(NULL == schedule)) {
 118       return OMPI_ERR_OUT_OF_RESOURCE;
 119     }
 120 
 121     if (persistent && !inplace) {
 122       /* for nonblocking, data has been copied already */
 123       /* copy my data to receive buffer (= send buffer of NBC_Sched_send) */
 124       rbuf = (char *)recvbuf + rank * recvcount * rcvext;
 125       res = NBC_Sched_copy((void *)sendbuf, false, sendcount, sendtype,
 126                             rbuf, false, recvcount, recvtype, schedule, true);
 127       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 128         OBJ_RELEASE(schedule);
 129         return res;
 130       }
 131     }
 132 
 133     switch (alg) {
 134       case NBC_ALLGATHER_LINEAR:
 135         res = allgather_sched_linear(rank, p, schedule, sendbuf, sendcount, sendtype,
 136                                      recvbuf, recvcount, recvtype);
 137         break;
 138       case NBC_ALLGATHER_RDBL:
 139         res = allgather_sched_recursivedoubling(rank, p, schedule, sendbuf, sendcount,
 140                                                 sendtype, recvbuf, recvcount, recvtype);
 141         break;
 142     }
 143 
 144     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 145       OBJ_RELEASE(schedule);
 146       return res;
 147     }
 148 
 149     res = NBC_Sched_commit(schedule);
 150     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 151       OBJ_RELEASE(schedule);
 152       return res;
 153     }
 154 
 155 #ifdef NBC_CACHE_SCHEDULE
 156     /* save schedule to tree */
 157     args = (NBC_Allgather_args *) malloc (sizeof (args));
 158     args->sendbuf = sendbuf;
 159     args->sendcount = sendcount;
 160     args->sendtype = sendtype;
 161     args->recvbuf = recvbuf;
 162     args->recvcount = recvcount;
 163     args->recvtype = recvtype;
 164     args->schedule = schedule;
 165 
 166     res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_ALLGATHER], args, args, 0);
 167     if (res != 0) {
 168       free (args);
 169     } else {
 170       OBJ_RETAIN(schedule);
 171     }
 172 
 173     /* increase number of elements for A2A */
 174     if (++libnbc_module->NBC_Dict_size[NBC_ALLGATHER] > NBC_SCHED_DICT_UPPER) {
 175       NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_ALLGATHER], &libnbc_module->NBC_Dict_size[NBC_ALLGATHER]);
 176     }
 177   } else {
 178     /* found schedule */
 179     schedule = found->schedule;
 180     OBJ_RETAIN(schedule);
 181   }
 182 #endif
 183 
 184   res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, NULL);
 185   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 186     OBJ_RELEASE(schedule);
 187     return res;
 188   }
 189 
 190   return OMPI_SUCCESS;
 191 }
 192 
 193 int ompi_coll_libnbc_iallgather(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount,
 194                                 MPI_Datatype recvtype, struct ompi_communicator_t *comm, ompi_request_t ** request,
 195                                 struct mca_coll_base_module_2_3_0_t *module)
 196 {
 197     int res = nbc_allgather_init(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
 198                                  comm, request, module, false);
 199     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 200         return res;
 201     }
 202   
 203     res = NBC_Start(*(ompi_coll_libnbc_request_t **)request);
 204     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 205         NBC_Return_handle (*(ompi_coll_libnbc_request_t **)request);
 206         *request = &ompi_request_null.request;
 207         return res;
 208     }
 209 
 210     return OMPI_SUCCESS;
 211 }
 212 
 213 static int nbc_allgather_inter_init(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount,
 214                                     MPI_Datatype recvtype, struct ompi_communicator_t *comm, ompi_request_t ** request,
 215                                     struct mca_coll_base_module_2_3_0_t *module, bool persistent)
 216 {
 217   int res, rsize;
 218   MPI_Aint rcvext;
 219   NBC_Schedule *schedule;
 220   char *rbuf;
 221   ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
 222 
 223   res = ompi_datatype_type_extent(recvtype, &rcvext);
 224   if (MPI_SUCCESS != res) {
 225     NBC_Error ("MPI Error in ompi_datatype_type_extent() (%i)", res);
 226     return res;
 227   }
 228 
 229   rsize = ompi_comm_remote_size (comm);
 230 
 231   /* set up schedule */
 232   schedule = OBJ_NEW(NBC_Schedule);
 233   if (OPAL_UNLIKELY(NULL == schedule)) {
 234     return OMPI_ERR_OUT_OF_RESOURCE;
 235   }
 236 
 237   /* do rsize - 1 rounds */
 238   for (int r = 0 ; r < rsize ; ++r) {
 239     /* recv from rank r */
 240     rbuf = (char *) recvbuf + r * recvcount * rcvext;
 241     res = NBC_Sched_recv (rbuf, false, recvcount, recvtype, r, schedule, false);
 242     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 243       OBJ_RELEASE(schedule);
 244       return res;
 245     }
 246 
 247     /* send to rank r */
 248     res = NBC_Sched_send (sendbuf, false, sendcount, sendtype, r, schedule, false);
 249     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 250       OBJ_RELEASE(schedule);
 251       return res;
 252     }
 253   }
 254 
 255   res = NBC_Sched_commit (schedule);
 256   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 257     OBJ_RELEASE(schedule);
 258     return res;
 259   }
 260 
 261   res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, NULL);
 262   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 263     OBJ_RELEASE(schedule);
 264     return res;
 265   }
 266 
 267   return OMPI_SUCCESS;
 268 }
 269 
 270 int ompi_coll_libnbc_iallgather_inter(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount,
 271                                       MPI_Datatype recvtype, struct ompi_communicator_t *comm, ompi_request_t ** request,
 272                                       struct mca_coll_base_module_2_3_0_t *module) {
 273     int res = nbc_allgather_inter_init(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
 274                                        comm, request, module, false);
 275     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 276         return res;
 277     }
 278 
 279     res = NBC_Start(*(ompi_coll_libnbc_request_t **)request);
 280     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 281         NBC_Return_handle (*(ompi_coll_libnbc_request_t **)request);
 282         *request = &ompi_request_null.request;
 283         return res;
 284     }
 285 
 286     return OMPI_SUCCESS;
 287 }
 288 
 289 /*
 290  * allgather_sched_linear
 291  *
 292  * Description: an implementation of Iallgather using linear algorithm
 293  *
 294  * Time: O(comm_size)
 295  * Schedule length (rounds): O(comm_size)
 296  */
 297 static inline int allgather_sched_linear(
 298     int rank, int comm_size, NBC_Schedule *schedule, const void *sendbuf,
 299     int scount, struct ompi_datatype_t *sdtype, void *recvbuf, int rcount,
 300     struct ompi_datatype_t *rdtype)
 301 {
 302     int res = OMPI_SUCCESS;
 303     ptrdiff_t rlb, rext;
 304 
 305     res = ompi_datatype_get_extent(rdtype, &rlb, &rext);
 306     char *sbuf = (char *)recvbuf + rank * rcount * rext;
 307 
 308     for (int remote = 0; remote < comm_size ; ++remote) {
 309         if (remote != rank) {
 310             /* Recv from rank remote */
 311             char *rbuf = (char *)recvbuf + remote * rcount * rext;
 312             res = NBC_Sched_recv(rbuf, false, rcount, rdtype, remote, schedule, false);
 313             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 314 
 315             /* Send to rank remote - not from the sendbuf to optimize MPI_IN_PLACE */
 316             res = NBC_Sched_send(sbuf, false, rcount, rdtype, remote, schedule, false);
 317             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 318         }
 319     }
 320 
 321 cleanup_and_return:
 322     return res;
 323 }
 324 
 325 /*
 326  * allgather_sched_recursivedoubling
 327  *
 328  * Description: an implementation of Iallgather using recursive doubling algorithm
 329  * Limitation: power-of-two number of processes only
 330  * Time: O(log(comm_size))
 331  * Schedule length (rounds): O(log(comm_size))
 332  * Memory: no additional memory requirements beyond user-supplied buffers.
 333  *
 334  * Example on 4 nodes:
 335  *   Initialization: everyone has its own buffer at location rank in rbuf
 336  *    #     0      1      2      3
 337  *         [0]    [ ]    [ ]    [ ]
 338  *         [ ]    [1]    [ ]    [ ]
 339  *         [ ]    [ ]    [2]    [ ]
 340  *         [ ]    [ ]    [ ]    [3]
 341  *   Step 0: exchange data with (rank ^ 2^0)
 342  *    #     0      1      2      3
 343  *         [0]    [0]    [ ]    [ ]
 344  *         [1]    [1]    [ ]    [ ]
 345  *         [ ]    [ ]    [2]    [2]
 346  *         [ ]    [ ]    [3]    [3]
 347  *   Step 1: exchange data with (rank ^ 2^1) (if you can)
 348  *    #     0      1      2      3
 349  *         [0]    [0]    [0]    [0]
 350  *         [1]    [1]    [1]    [1]
 351  *         [2]    [2]    [2]    [2]
 352  *         [3]    [3]    [3]    [3]
 353  *
 354  */
 355 static inline int allgather_sched_recursivedoubling(
 356     int rank, int comm_size, NBC_Schedule *schedule, const void *sbuf,
 357     int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount,
 358     struct ompi_datatype_t *rdtype)
 359 {
 360     int res = OMPI_SUCCESS;
 361     ptrdiff_t rlb, rext;
 362     char *tmpsend = NULL, *tmprecv = NULL;
 363 
 364     res = ompi_datatype_get_extent(rdtype, &rlb, &rext);
 365     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 366 
 367     int sendblocklocation = rank;
 368     for (int distance = 1; distance < comm_size; distance <<= 1) {
 369         int remote = rank ^ distance;
 370 
 371         tmpsend = (char *)rbuf + (ptrdiff_t)sendblocklocation * (ptrdiff_t)rcount * rext;
 372         if (rank < remote) {
 373             tmprecv = (char *)rbuf + (ptrdiff_t)(sendblocklocation + distance) * (ptrdiff_t)rcount * rext;
 374         } else {
 375             tmprecv = (char *)rbuf + (ptrdiff_t)(sendblocklocation - distance) * (ptrdiff_t)rcount * rext;
 376             sendblocklocation -= distance;
 377         }
 378 
 379         res = NBC_Sched_send(tmpsend, false, (ptrdiff_t)distance * (ptrdiff_t)rcount,
 380                              rdtype, remote, schedule, false);
 381         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 382 
 383         res = NBC_Sched_recv(tmprecv, false, (ptrdiff_t)distance * (ptrdiff_t)rcount,
 384                              rdtype, remote, schedule, true);
 385         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 386     }
 387 
 388 cleanup_and_return:
 389     return res;
 390 }
 391 
 392 int ompi_coll_libnbc_allgather_init(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount,
 393                                     MPI_Datatype recvtype, struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request,
 394                                     struct mca_coll_base_module_2_3_0_t *module) {
 395     int res = nbc_allgather_init(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
 396                                  comm, request, module, true);
 397     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 398         return res;
 399     }
 400 
 401     return OMPI_SUCCESS;
 402 }
 403 
 404 int ompi_coll_libnbc_allgather_inter_init(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount,
 405                                           MPI_Datatype recvtype, struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request,
 406                                           struct mca_coll_base_module_2_3_0_t *module) {
 407     int res = nbc_allgather_inter_init(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
 408                                        comm, request, module, true);
 409     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 410         return res;
 411     }
 412 
 413     return OMPI_SUCCESS;
 414 }

/* [<][>][^][v][top][bottom][index][help] */