root/ompi/mca/coll/libnbc/nbc_iscan.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. NBC_Scan_args_compare
  2. nbc_scan_init
  3. scan_sched_linear
  4. scan_sched_recursivedoubling
  5. ompi_coll_libnbc_iscan
  6. ompi_coll_libnbc_scan_init

   1 /* -*- Mode: C; c-basic-offset:2 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2006      The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2006      The Technical University of Chemnitz. All
   7  *                         rights reserved.
   8  * Copyright (c) 2014-2018 Research Organization for Information Science
   9  *                         and Technology (RIST).  All rights reserved.
  10  * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
  11  *                         reserved.
  12  * Copyright (c) 2017      IBM Corporation.  All rights reserved.
  13  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
  14  * $COPYRIGHT$
  15  *
  16  * Additional copyrights may follow
  17  *
  18  * Author(s): Torsten Hoefler <htor@cs.indiana.edu>
  19  *
  20  */
  21 #include "opal/align.h"
  22 #include "ompi/op/op.h"
  23 
  24 #include "nbc_internal.h"
  25 
  26 static inline int scan_sched_linear(
  27     int rank, int comm_size, const void *sendbuf, void *recvbuf, int count,
  28     MPI_Datatype datatype,  MPI_Op op, char inplace, NBC_Schedule *schedule,
  29     void *tmpbuf);
  30 static inline int scan_sched_recursivedoubling(
  31     int rank, int comm_size, const void *sendbuf, void *recvbuf,
  32     int count, MPI_Datatype datatype,  MPI_Op op, char inplace,
  33     NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2);
  34 
  35 #ifdef NBC_CACHE_SCHEDULE
  36 /* tree comparison function for schedule cache */
  37 int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) {
  38   if ((a->sendbuf == b->sendbuf) &&
  39       (a->recvbuf == b->recvbuf) &&
  40       (a->count == b->count) &&
  41       (a->datatype == b->datatype) &&
  42       (a->op == b->op) ) {
  43     return 0;
  44   }
  45 
  46   if (a->sendbuf < b->sendbuf) {
  47     return -1;
  48   }
  49 
  50   return 1;
  51 }
  52 #endif
  53 
  54 static int nbc_scan_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
  55                          struct ompi_communicator_t *comm, ompi_request_t ** request,
  56                          struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
  57     int rank, p, res;
  58     ptrdiff_t gap, span;
  59     NBC_Schedule *schedule;
  60     void *tmpbuf = NULL, *tmpbuf1 = NULL, *tmpbuf2 = NULL;
  61     enum { NBC_SCAN_LINEAR, NBC_SCAN_RDBL } alg;
  62     char inplace;
  63     ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
  64 
  65     NBC_IN_PLACE(sendbuf, recvbuf, inplace);
  66 
  67     rank = ompi_comm_rank (comm);
  68     p = ompi_comm_size (comm);
  69 
  70     if (count == 0) {
  71         return nbc_get_noop_request(persistent, request);
  72     }
  73 
  74     span = opal_datatype_span(&datatype->super, count, &gap);
  75     if (libnbc_iscan_algorithm == 2) {
  76         alg = NBC_SCAN_RDBL;
  77         ptrdiff_t span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t);
  78         tmpbuf = malloc(span_align + span);
  79         if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; }
  80         tmpbuf1 = (void *)(-gap);
  81         tmpbuf2 = (char *)(span_align) - gap;
  82     } else {
  83         alg = NBC_SCAN_LINEAR;
  84         if (rank > 0) {
  85             tmpbuf = malloc(span);
  86             if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; }
  87         }
  88     }
  89 
  90 #ifdef NBC_CACHE_SCHEDULE
  91   NBC_Scan_args *args, *found, search;
  92 
  93   /* search schedule in communicator specific tree */
  94   search.sendbuf = sendbuf;
  95   search.recvbuf = recvbuf;
  96   search.count = count;
  97   search.datatype = datatype;
  98   search.op = op;
  99   found = (NBC_Scan_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN], &search);
 100   if (NULL == found) {
 101 #endif
 102     schedule = OBJ_NEW(NBC_Schedule);
 103     if (OPAL_UNLIKELY(NULL == schedule)) {
 104         free(tmpbuf);
 105         return OMPI_ERR_OUT_OF_RESOURCE;
 106     }
 107 
 108     if (alg == NBC_SCAN_LINEAR) {
 109         res = scan_sched_linear(rank, p, sendbuf, recvbuf, count, datatype,
 110                                 op, inplace, schedule, tmpbuf);
 111     } else {
 112         res = scan_sched_recursivedoubling(rank, p, sendbuf, recvbuf, count,
 113                                            datatype, op, inplace, schedule, tmpbuf1, tmpbuf2);
 114     }
 115     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 116         OBJ_RELEASE(schedule);
 117         free(tmpbuf);
 118         return res;
 119     }
 120 
 121     res = NBC_Sched_commit(schedule);
 122     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 123         OBJ_RELEASE(schedule);
 124         free(tmpbuf);
 125         return res;
 126     }
 127 
 128 #ifdef NBC_CACHE_SCHEDULE
 129     /* save schedule to tree */
 130     args = (NBC_Scan_args *) malloc (sizeof (args));
 131     if (NULL != args) {
 132       args->sendbuf = sendbuf;
 133       args->recvbuf = recvbuf;
 134       args->count = count;
 135       args->datatype = datatype;
 136       args->op = op;
 137       args->schedule = schedule;
 138       res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN], args, args, 0);
 139       if (0 == res) {
 140         OBJ_RETAIN(schedule);
 141 
 142         /* increase number of elements for A2A */
 143         if (++libnbc_module->NBC_Dict_size[NBC_SCAN] > NBC_SCHED_DICT_UPPER) {
 144           NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN],
 145                                    &libnbc_module->NBC_Dict_size[NBC_SCAN]);
 146         }
 147       } else {
 148         NBC_Error("error in dict_insert() (%i)", res);
 149         free (args);
 150       }
 151     }
 152   } else {
 153     /* found schedule */
 154     schedule = found->schedule;
 155     OBJ_RETAIN(schedule);
 156   }
 157 #endif
 158 
 159     res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf);
 160     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 161         OBJ_RELEASE(schedule);
 162         free(tmpbuf);
 163         return res;
 164     }
 165 
 166     return OMPI_SUCCESS;
 167 }
 168 
 169 /*
 170  * scan_sched_linear:
 171  *
 172  * Function:  Linear algorithm for inclusive scan.
 173  * Accepts:   Same as MPI_Iscan
 174  * Returns:   MPI_SUCCESS or error code
 175  *
 176  * Working principle:
 177  * 1. Each process  (but process 0) receives from left neighbor
 178  * 2. Performs op
 179  * 3. All but rank p-1 do sends to it's right neighbor and exits
 180  *
 181  * Schedule length: O(1)
 182  */
 183 static inline int scan_sched_linear(
 184     int rank, int comm_size, const void *sendbuf, void *recvbuf, int count,
 185     MPI_Datatype datatype,  MPI_Op op, char inplace, NBC_Schedule *schedule,
 186     void *tmpbuf)
 187 {
 188     int res = OMPI_SUCCESS;
 189 
 190     if (!inplace) {
 191         /* Copy data to recvbuf */
 192         res = NBC_Sched_copy((void *)sendbuf, false, count, datatype,
 193                              recvbuf, false, count, datatype, schedule, false);
 194         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 195     }
 196 
 197     if (rank > 0) {
 198         ptrdiff_t gap;
 199         opal_datatype_span(&datatype->super, count, &gap);
 200         /* We have to wait until we have the data */
 201         res = NBC_Sched_recv((void *)(-gap), true, count, datatype, rank - 1, schedule, true);
 202         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 203 
 204         /* Perform the reduce in my local buffer */
 205         /* this cannot be done until tmpbuf is unused :-( so barrier after the op */
 206         res = NBC_Sched_op((void *)(-gap), true, recvbuf, false, count, datatype, op, schedule,
 207                            true);
 208         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 209     }
 210 
 211     if (rank != comm_size - 1) {
 212         res = NBC_Sched_send(recvbuf, false, count, datatype, rank + 1, schedule, false);
 213         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 214     }
 215 
 216 cleanup_and_return:
 217     return res;
 218 }
 219 
 220 /*
 221  * scan_sched_recursivedoubling:
 222  *
 223  * Function:  Recursive doubling algorithm for inclusive scan.
 224  * Accepts:   Same as MPI_Iscan
 225  * Returns:   MPI_SUCCESS or error code
 226  *
 227  * Description:  Implements recursive doubling algorithm for MPI_Iscan.
 228  *               The algorithm preserves order of operations so it can
 229  *               be used both by commutative and non-commutative operations.
 230  *
 231  * Example for 5 processes and commutative operation MPI_SUM:
 232  * Process:   0                 1              2              3              4
 233  * recvbuf:  [0]               [1]            [2]            [3]            [4]
 234  *   psend:  [0]               [1]            [2]            [3]            [4]
 235  *
 236  *  Step 1:
 237  * recvbuf:  [0]               [0+1]          [2]            [2+3]          [4]
 238  *   psend:  [1+0]             [0+1]          [3+2]          [2+3]          [4]
 239  *
 240  *  Step 2:
 241  * recvbuf:  [0]               [0+1]          [(1+0)+2]      [(1+0)+(2+3)]  [4]
 242  *  psend:   [(3+2)+(1+0)]     [(2+3)+(0+1)]  [(1+0)+(3+2)]  [(1+0)+(2+3)]  [4]
 243  *
 244  *  Step 3:
 245  * recvbuf:  [0]               [0+1]           [(1+0)+2]     [(1+0)+(2+3)]  [((3+2)+(1+0))+4]
 246  *   psend:  [4+((3+2)+(1+0))]                                              [((3+2)+(1+0))+4]
 247  *
 248  * Time complexity (worst case): \ceil(\log_2(p))(2\alpha + 2m\beta + 2m\gamma)
 249  * Memory requirements (per process): 2 * count * typesize = O(count)
 250  * Limitations: intra-communicators only
 251  * Schedule length: O(log(p))
 252  */
 253 static inline int scan_sched_recursivedoubling(
 254     int rank, int comm_size, const void *sendbuf, void *recvbuf, int count,
 255     MPI_Datatype datatype, MPI_Op op, char inplace,
 256     NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2)
 257 {
 258     int res = OMPI_SUCCESS;
 259 
 260     if (!inplace) {
 261         res = NBC_Sched_copy((void *)sendbuf, false, count, datatype,
 262                               recvbuf, false, count, datatype, schedule, true);
 263         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 264     }
 265     if (comm_size < 2)
 266         goto cleanup_and_return;
 267 
 268     char *psend = (char *)tmpbuf1;
 269     char *precv = (char *)tmpbuf2;
 270     res = NBC_Sched_copy(recvbuf, false, count, datatype,
 271                          psend, true, count, datatype, schedule, true);
 272     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 273 
 274     int is_commute = ompi_op_is_commute(op);
 275     for (int mask = 1; mask < comm_size; mask <<= 1) {
 276         int remote = rank ^ mask;
 277         if (remote < comm_size) {
 278             res = NBC_Sched_send(psend, true, count, datatype, remote, schedule, false);
 279             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 280             res = NBC_Sched_recv(precv, true, count, datatype, remote, schedule, true);
 281             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 282 
 283             if (rank > remote) {
 284                 /* Accumulate prefix reduction: recvbuf = precv <op> recvbuf */
 285                 res = NBC_Sched_op(precv, true, recvbuf, false, count,
 286                                    datatype, op, schedule, false);
 287                 if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 288                 /* Partial result: psend = precv <op> psend */
 289                 res = NBC_Sched_op(precv, true, psend, true, count,
 290                                    datatype, op, schedule, true);
 291                 if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 292             } else {
 293                 if (is_commute) {
 294                     /* psend = precv <op> psend */
 295                     res = NBC_Sched_op(precv, true, psend, true, count,
 296                                        datatype, op, schedule, true);
 297                     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 298                 } else {
 299                     /* precv = psend <op> precv */
 300                     res = NBC_Sched_op(psend, true, precv, true, count,
 301                                        datatype, op, schedule, true);
 302                     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 303                     char *tmp = psend;
 304                     psend = precv;
 305                     precv = tmp;
 306                 }
 307             }
 308         }
 309     }
 310 
 311  cleanup_and_return:
 312     return res;
 313 }
 314 
 315 int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
 316                            struct ompi_communicator_t *comm, ompi_request_t ** request,
 317                            struct mca_coll_base_module_2_3_0_t *module) {
 318     int res = nbc_scan_init(sendbuf, recvbuf, count, datatype, op,
 319                             comm, request, module, false);
 320     if (OPAL_LIKELY(OMPI_SUCCESS != res)) {
 321         return res;
 322     }
 323     res = NBC_Start(*(ompi_coll_libnbc_request_t **)request);
 324     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 325         NBC_Return_handle (*(ompi_coll_libnbc_request_t **)request);
 326         *request = &ompi_request_null.request;
 327         return res;
 328     }
 329 
 330     return OMPI_SUCCESS;
 331 }
 332 
 333 int ompi_coll_libnbc_scan_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
 334                                struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request,
 335                                struct mca_coll_base_module_2_3_0_t *module) {
 336     int res = nbc_scan_init(sendbuf, recvbuf, count, datatype, op,
 337                             comm, request, module, true);
 338     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 339         return res;
 340     }
 341 
 342     return OMPI_SUCCESS;
 343 }

/* [<][>][^][v][top][bottom][index][help] */