root/ompi/mca/coll/libnbc/nbc_iexscan.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. NBC_Scan_args_compare
  2. nbc_exscan_init
  3. ompi_coll_libnbc_iexscan
  4. ompi_coll_libnbc_exscan_init
  5. exscan_sched_linear
  6. exscan_sched_recursivedoubling

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2006      The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2006      The Technical University of Chemnitz. All
   7  *                         rights reserved.
   8  * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
   9  *                         reserved.
  10  * Copyright (c) 2014-2018 Research Organization for Information Science
  11  *                         and Technology (RIST).  All rights reserved.
  12  * Copyright (c) 2017      IBM Corporation.  All rights reserved.
  13  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
  14  * $COPYRIGHT$
  15  *
  16  * Additional copyrights may follow
  17  *
  18  * Author(s): Torsten Hoefler <htor@cs.indiana.edu>
  19  *
  20  */
  21 #include "opal/align.h"
  22 #include "ompi/op/op.h"
  23 
  24 #include "nbc_internal.h"
  25 
  26 static inline int exscan_sched_linear(
  27     int rank, int comm_size, const void *sendbuf, void *recvbuf, int count,
  28     MPI_Datatype datatype,  MPI_Op op, char inplace, NBC_Schedule *schedule,
  29     void *tmpbuf);
  30 static inline int exscan_sched_recursivedoubling(
  31     int rank, int comm_size, const void *sendbuf, void *recvbuf,
  32     int count, MPI_Datatype datatype,  MPI_Op op, char inplace,
  33     NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2);
  34 
  35 #ifdef NBC_CACHE_SCHEDULE
  36 /* tree comparison function for schedule cache */
  37 int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) {
  38     if ((a->sendbuf == b->sendbuf) &&
  39         (a->recvbuf == b->recvbuf) &&
  40         (a->count == b->count) &&
  41         (a->datatype == b->datatype) &&
  42         (a->op == b->op) ) {
  43         return 0;
  44     }
  45 
  46     if( a->sendbuf < b->sendbuf ) {
  47         return -1;
  48     }
  49 
  50     return 1;
  51 }
  52 #endif
  53 
  54 static int nbc_exscan_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
  55                            struct ompi_communicator_t *comm, ompi_request_t ** request,
  56                            struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
  57     int rank, p, res;
  58     NBC_Schedule *schedule;
  59     char inplace;
  60     void *tmpbuf = NULL, *tmpbuf1 = NULL, *tmpbuf2 = NULL;
  61     enum { NBC_EXSCAN_LINEAR, NBC_EXSCAN_RDBL } alg;
  62     ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
  63     ptrdiff_t span, gap;
  64 
  65     NBC_IN_PLACE(sendbuf, recvbuf, inplace);
  66 
  67     rank = ompi_comm_rank(comm);
  68     p = ompi_comm_size(comm);
  69 
  70     if (p < 2) {
  71         return nbc_get_noop_request(persistent, request);
  72     }
  73 
  74     span = opal_datatype_span(&datatype->super, count, &gap);
  75     if (libnbc_iexscan_algorithm == 2) {
  76         alg = NBC_EXSCAN_RDBL;
  77         ptrdiff_t span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t);
  78         tmpbuf = malloc(span_align + span);
  79         if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; }
  80         tmpbuf1 = (void *)(-gap);
  81         tmpbuf2 = (char *)(span_align) - gap;
  82     } else {
  83         alg = NBC_EXSCAN_LINEAR;
  84         if (rank > 0) {
  85             tmpbuf = malloc(span);
  86             if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; }
  87         }
  88     }
  89 
  90 #ifdef NBC_CACHE_SCHEDULE
  91     NBC_Scan_args *args, *found, search;
  92     /* search schedule in communicator specific tree */
  93     search.sendbuf = sendbuf;
  94     search.recvbuf = recvbuf;
  95     search.count = count;
  96     search.datatype = datatype;
  97     search.op = op;
  98     found = (NBC_Scan_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_EXSCAN], &search);
  99     if (NULL == found) {
 100 #endif
 101     schedule = OBJ_NEW(NBC_Schedule);
 102     if (OPAL_UNLIKELY(NULL == schedule)) {
 103         free(tmpbuf);
 104         return OMPI_ERR_OUT_OF_RESOURCE;
 105     }
 106 
 107     if (alg == NBC_EXSCAN_LINEAR) {
 108         res = exscan_sched_linear(rank, p, sendbuf, recvbuf, count, datatype,
 109                                   op, inplace, schedule, tmpbuf);
 110     } else {
 111         res = exscan_sched_recursivedoubling(rank, p, sendbuf, recvbuf, count,
 112                                              datatype, op, inplace, schedule, tmpbuf1, tmpbuf2);
 113     }
 114     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 115         OBJ_RELEASE(schedule);
 116         free(tmpbuf);
 117         return res;
 118     }
 119 
 120     res = NBC_Sched_commit(schedule);
 121     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 122        OBJ_RELEASE(schedule);
 123        free(tmpbuf);
 124        return res;
 125     }
 126 
 127 #ifdef NBC_CACHE_SCHEDULE
 128         /* save schedule to tree */
 129         args = (NBC_Scan_args *) malloc (sizeof (args));
 130         if (NULL != args) {
 131             args->sendbuf = sendbuf;
 132             args->recvbuf = recvbuf;
 133             args->count = count;
 134             args->datatype = datatype;
 135             args->op = op;
 136             args->schedule = schedule;
 137             res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_EXSCAN], args, args, 0);
 138             if (0 == res) {
 139                 OBJ_RETAIN(schedule);
 140 
 141                 /* increase number of elements for A2A */
 142                 if (++libnbc_module->NBC_Dict_size[NBC_EXSCAN] > NBC_SCHED_DICT_UPPER) {
 143                     NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_EXSCAN],
 144                                              &libnbc_module->NBC_Dict_size[NBC_EXSCAN]);
 145                 }
 146             } else {
 147                 NBC_Error("error in dict_insert() (%i)", res);
 148                 free (args);
 149             }
 150         }
 151     } else {
 152         /* found schedule */
 153         schedule = found->schedule;
 154         OBJ_RETAIN(schedule);
 155     }
 156 #endif
 157 
 158     res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf);
 159     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 160         OBJ_RELEASE(schedule);
 161         free(tmpbuf);
 162         return res;
 163     }
 164 
 165     return OMPI_SUCCESS;
 166 }
 167 
 168 int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
 169                              struct ompi_communicator_t *comm, ompi_request_t ** request,
 170                              struct mca_coll_base_module_2_3_0_t *module) {
 171     int res = nbc_exscan_init(sendbuf, recvbuf, count, datatype, op,
 172                               comm, request, module, false);
 173     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 174         return res;
 175     }
 176   
 177     res = NBC_Start(*(ompi_coll_libnbc_request_t **)request);
 178     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 179         NBC_Return_handle (*(ompi_coll_libnbc_request_t **)request);
 180         *request = &ompi_request_null.request;
 181         return res;
 182     }
 183 
 184     return OMPI_SUCCESS;
 185 }
 186 
 187 int ompi_coll_libnbc_exscan_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
 188                                  struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request,
 189                                  struct mca_coll_base_module_2_3_0_t *module) {
 190     int res = nbc_exscan_init(sendbuf, recvbuf, count, datatype, op,
 191                               comm, request, module, true);
 192     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 193         return res;
 194     }
 195 
 196     return OMPI_SUCCESS;
 197 }
 198 
 199 /*
 200  * exscan_sched_linear:
 201  *
 202  * Function:  Linear algorithm for exclusive scan.
 203  * Accepts:   Same as MPI_Iexscan
 204  * Returns:   MPI_SUCCESS or error code
 205  *
 206  * Working principle:
 207  * 1. Each process (but process 0) receives from left neighbor
 208  * 2. Performs op
 209  * 3. All but rank p - 1 do sends to it's right neighbor and exits
 210  *
 211  * Schedule length: O(1)
 212  */
 213 static inline int exscan_sched_linear(
 214     int rank, int comm_size, const void *sendbuf, void *recvbuf, int count,
 215     MPI_Datatype datatype,  MPI_Op op, char inplace, NBC_Schedule *schedule,
 216     void *tmpbuf)
 217 {
 218     int res = OMPI_SUCCESS;
 219     ptrdiff_t gap;
 220     opal_datatype_span(&datatype->super, count, &gap);
 221 
 222     if (rank > 0) {
 223         if (inplace) {
 224             res = NBC_Sched_copy(recvbuf, false, count, datatype,
 225                                  (char *)tmpbuf - gap, false, count, datatype, schedule, false);
 226         } else {
 227             res = NBC_Sched_copy((void *)sendbuf, false, count, datatype,
 228                                  (char *)tmpbuf - gap, false, count, datatype, schedule, false);
 229         }
 230         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 231 
 232         res = NBC_Sched_recv(recvbuf, false, count, datatype, rank - 1, schedule, false);
 233         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 234 
 235         if (rank < comm_size - 1) {
 236             /* We have to wait until we have the data */
 237             res = NBC_Sched_barrier(schedule);
 238             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 239 
 240             res = NBC_Sched_op(recvbuf, false, (void *)(-gap), true, count,
 241                                datatype, op, schedule, true);
 242             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 243 
 244             /* Send reduced data onward */
 245             res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rank + 1, schedule, false);
 246             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 247         }
 248     } else if (comm_size > 1) {
 249         /* Process 0 */
 250         if (inplace) {
 251             res = NBC_Sched_send(recvbuf, false, count, datatype, 1, schedule, false);
 252         } else {
 253             res = NBC_Sched_send(sendbuf, false, count, datatype, 1, schedule, false);
 254         }
 255         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 256     }
 257 
 258 cleanup_and_return:
 259     return res;
 260 }
 261 
 262 /*
 263  * exscan_sched_recursivedoubling:
 264  *
 265  * Function:  Recursive doubling algorithm for exclusive scan.
 266  * Accepts:   Same as MPI_Iexscan
 267  * Returns:   MPI_SUCCESS or error code
 268  *
 269  * Description:  Implements recursive doubling algorithm for MPI_Iexscan.
 270  *               The algorithm preserves order of operations so it can
 271  *               be used both by commutative and non-commutative operations.
 272  *
 273  * Example for 5 processes and commutative operation MPI_SUM:
 274  * Process:  0                 1             2              3               4
 275  * recvbuf:  -                 -             -              -               -
 276  *   psend: [0]               [1]           [2]            [3]             [4]
 277  *
 278  *  Step 1:
 279  * recvbuf:  -                [0]            -             [2]              -
 280  *   psend: [1+0]             [0+1]         [3+2]          [2+3]           [4]
 281  *
 282  *  Step 2:
 283  * recvbuf:  -                [0]            [1+0]          [(0+1)+2]       -
 284  *   psend: [(3+2)+(1+0)]     [(2+3)+(0+1)]  [(1+0)+(3+2)]  [(1+0)+(2+3)]  [4]
 285  *
 286  *  Step 3:
 287  * recvbuf:  -                [0]            [1+0]          [(0+1)+2]      [(3+2)+(1+0)]
 288  *   psend: [4+((3+2)+(1+0))]                                              [((3+2)+(1+0))+4]
 289  *
 290  * Time complexity (worst case): \ceil(\log_2(p))(2\alpha + 2m\beta + 2m\gamma)
 291  * Memory requirements (per process): 2 * count * typesize = O(count)
 292  * Limitations: intra-communicators only
 293  * Schedule length: O(log(p))
 294  */
 295 static inline int exscan_sched_recursivedoubling(
 296     int rank, int comm_size, const void *sendbuf, void *recvbuf, int count,
 297     MPI_Datatype datatype, MPI_Op op, char inplace,
 298     NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2)
 299 {
 300     int res = OMPI_SUCCESS;
 301     char *psend = (char *)tmpbuf1;
 302     char *precv = (char *)tmpbuf2;
 303 
 304     if (!inplace) {
 305         res = NBC_Sched_copy((char *)sendbuf, false, count, datatype,
 306                              psend, true, count, datatype, schedule, true);
 307     } else {
 308         res = NBC_Sched_copy((char *)recvbuf, false, count, datatype,
 309                              psend, true, count, datatype, schedule, true);
 310     }
 311     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 312 
 313     int is_commute = ompi_op_is_commute(op);
 314     int is_first_block = 1;
 315 
 316     for (int mask = 1; mask < comm_size; mask <<= 1) {
 317         int remote = rank ^ mask;
 318         if (remote < comm_size) {
 319             res = NBC_Sched_send(psend, true, count, datatype, remote, schedule, false);
 320             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 321             res = NBC_Sched_recv(precv, true, count, datatype, remote, schedule, true);
 322             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 323 
 324             if (rank > remote) {
 325                 /* Assertion: rank > 0 and rbuf is valid */
 326                 if (is_first_block) {
 327                     res = NBC_Sched_copy(precv, true, count, datatype,
 328                                          recvbuf, false, count, datatype, schedule, false);
 329                     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 330                     is_first_block = 0;
 331                 } else {
 332                     /* Accumulate prefix reduction: recvbuf = precv <op> recvbuf */
 333                     res = NBC_Sched_op(precv, true, recvbuf, false, count,
 334                                        datatype, op, schedule, false);
 335                     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 336                 }
 337                 /* Partial result: psend = precv <op> psend */
 338                 res = NBC_Sched_op(precv, true, psend, true, count,
 339                                    datatype, op, schedule, true);
 340                 if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 341             } else {
 342                 if (is_commute) {
 343                     /* psend = precv <op> psend */
 344                     res = NBC_Sched_op(precv, true, psend, true, count,
 345                                        datatype, op, schedule, true);
 346                     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 347                 } else {
 348                     /* precv = psend <op> precv */
 349                     res = NBC_Sched_op(psend, true, precv, true, count,
 350                                        datatype, op, schedule, true);
 351                     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 352                     char *tmp = psend;
 353                     psend = precv;
 354                     precv = tmp;
 355                 }
 356             }
 357         }
 358     }
 359 
 360 cleanup_and_return:
 361     return res;
 362 }

/* [<][>][^][v][top][bottom][index][help] */