root/ompi/mca/coll/base/coll_base_gather.c

DEFINITIONS

This source file includes the following definitions.
  1. ompi_coll_base_gather_intra_binomial
  2. ompi_coll_base_gather_intra_linear_sync
  3. ompi_coll_base_gather_intra_basic_linear
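For orientation, a minimal sketch (not taken from this file) of how one of these routines might be invoked. The buffer, count, and handle names are hypothetical; in practice a coll component's decision logic is expected to select the algorithm and supply a valid communicator and module handle:

    /* hypothetical call site: comm and module are assumed to be a valid
       ompi communicator and its selected coll base module */
    int rc = ompi_coll_base_gather_intra_binomial(sendbuf, count, MPI_INT,
                                                  recvbuf, count, MPI_INT,
                                                  0 /* root */, comm, module);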

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2017 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC. All Rights
 *                         reserved.
 * Copyright (c) 2015-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2017      IBM Corporation. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"

/* Todo: gather_intra_generic, gather_intra_binary, gather_intra_chain,
 * gather_intra_pipeline, segmentation? */
int
ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount,
                                      struct ompi_datatype_t *sdtype,
                                      void *rbuf, int rcount,
                                      struct ompi_datatype_t *rdtype,
                                      int root,
                                      struct ompi_communicator_t *comm,
                                      mca_coll_base_module_t *module)
{
    int line = -1, i, rank, vrank, size, total_recv = 0, err;
    char *ptmp     = NULL, *tempbuf  = NULL;
    ompi_coll_tree_t* bmtree;
    MPI_Status status;
    MPI_Aint sextent, sgap = 0, ssize;
    MPI_Aint rextent, rgap = 0, rsize;
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    mca_coll_base_comm_t *data = base_module->base_data;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_gather_intra_binomial rank %d", rank));

    /* create the binomial tree */
    COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root );
    bmtree = data->cached_in_order_bmtree;

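    /* shift ranks so the root becomes virtual rank 0; the in-order binomial
     * tree is built on these virtual ranks, so each subtree covers a
     * contiguous range of virtual ranks and its data can be received into a
     * contiguous region of the buffer */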
    vrank = (rank - root + size) % size;

    if (rank == root) {
        ompi_datatype_type_extent(rdtype, &rextent);
        rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * size, &rgap);
        if (0 == root) {
            /* root is rank 0, just use the recv buffer */
            ptmp = (char *) rbuf;
            if (sbuf != MPI_IN_PLACE) {
                err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                           ptmp, rcount, rdtype);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            }
        } else {
            /* root is not rank 0, allocate a temp buffer for the receive
             * and rotate the data at the end */
            tempbuf = (char *) malloc(rsize);
            if (NULL == tempbuf) {
                err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
            }

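            /* opal_datatype_span() reported the allocation size together with
             * a gap; offsetting the pointer by that gap gives the base address
             * the datatype engine expects for this allocation */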
            ptmp = tempbuf - rgap;
            if (sbuf != MPI_IN_PLACE) {
                /* copy from sbuf to the temp buffer */
                err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                           ptmp, rcount, rdtype);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            } else {
                /* copy from rbuf to the temp buffer */
                err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp,
                                                          (char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            }
        }
        total_recv = rcount;
    } else if (!(vrank % 2)) {
        /* other non-leaf nodes: allocate a temp buffer for the data received
         * from children; at most half of the total data elements are needed
         * here, a property of the binomial tree */
        ompi_datatype_type_extent(sdtype, &sextent);
        ssize = opal_datatype_span(&sdtype->super, (int64_t)scount * size, &sgap);
        tempbuf = (char *) malloc(ssize);
        if (NULL == tempbuf) {
            err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
        }

        ptmp = tempbuf - sgap;
        /* local copy to tempbuf */
        err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                   ptmp, scount, sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        /* use sdtype,scount as rdtype,rcount since they are ignored on
         * non-root procs */
        rdtype = sdtype;
        rcount = scount;
        rextent = sextent;
        total_recv = rcount;
    } else {
        /* leaf nodes: no temp buffer needed, use sdtype,scount as
         * rdtype,rcount since they are ignored on non-root procs */
        ptmp = (char *) sbuf;
        total_recv = scount;
    }

    if (!(vrank % 2)) {
        /* all non-leaf nodes recv from children */
        for (i = 0; i < bmtree->tree_nextsize; i++) {
            int mycount = 0, vkid;
            /* figure out how much data this child's subtree will send me */
            vkid = (bmtree->tree_next[i] - root + size) % size;
            mycount = vkid - vrank;
            if (mycount > (size - vkid))
                mycount = size - vkid;
            mycount *= rcount;

            OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                         "ompi_coll_base_gather_intra_binomial rank %d recv %d mycount = %d",
                         rank, bmtree->tree_next[i], mycount));

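            /* post the receive for the maximum number of elements that can
             * still arrive; the child actually sends mycount elements, and
             * total_recv is advanced by that amount below */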
            err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype,
                                    bmtree->tree_next[i], MCA_COLL_BASE_TAG_GATHER,
                                    comm, &status));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            total_recv += mycount;
        }
    }

    if (rank != root) {
        /* all nodes except root send to parents */
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "ompi_coll_base_gather_intra_binomial rank %d send %d count %d\n",
                     rank, bmtree->tree_prev, total_recv));

        err = MCA_PML_CALL(send(ptmp, total_recv, sdtype,
                                bmtree->tree_prev,
                                MCA_COLL_BASE_TAG_GATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    if (rank == root) {
        if (root != 0) {
            /* rotate received data on root if root != 0 */
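            /* the temp buffer holds the blocks in virtual-rank order, i.e.
             * ranks root, root+1, ..., size-1 followed by ranks 0 .. root-1;
             * the two copies below move each piece to its place in rbuf */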
            err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)(size - root),
                                                      (char *)rbuf + rextent * (ptrdiff_t)root * (ptrdiff_t)rcount, ptmp);
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)root,
                                                      (char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            free(tempbuf);
        }
    } else if (!(vrank % 2)) {
        /* other non-leaf nodes */
        free(tempbuf);
    }
    return MPI_SUCCESS;

 err_hndl:
    if (NULL != tempbuf)
        free(tempbuf);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,  "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}

/*
 *      gather_intra_linear_sync
 *
 *      Function:       - synchronized gather operation: non-root ranks wait
 *                        for a zero-byte message from the root, then send
 *                        their data in two segments
 *      Accepts:        - same arguments as MPI_Gather(), plus the first
 *                        segment size (in bytes)
 *      Returns:        - MPI_SUCCESS or error code
 */
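/*
 * Illustrative only (not part of the original file): a hypothetical call,
 * assuming valid comm/module handles and MPI_Gather-style buffers; 1024 is
 * an arbitrary first-segment size in bytes.
 *
 *   err = ompi_coll_base_gather_intra_linear_sync(sendbuf, count, MPI_DOUBLE,
 *                                                 recvbuf, count, MPI_DOUBLE,
 *                                                 root, comm, module, 1024);
 */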
int
ompi_coll_base_gather_intra_linear_sync(const void *sbuf, int scount,
                                         struct ompi_datatype_t *sdtype,
                                         void *rbuf, int rcount,
                                         struct ompi_datatype_t *rdtype,
                                         int root,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module,
                                         int first_segment_size)
{
    int i, ret, line, rank, size, first_segment_count;
    ompi_request_t **reqs = NULL;
    MPI_Aint extent, lb;
    size_t typelng;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size));

    if (rank != root) {
        /* Non-root processes:
           - receive zero byte message from the root,
           - send the first segment of the data synchronously,
           - send the second segment of the data.
        */

        ompi_datatype_type_size(sdtype, &typelng);
        ompi_datatype_get_extent(sdtype, &lb, &extent);
        first_segment_count = scount;
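        /* convert the byte threshold into an element count: cap
         * first_segment_count so that the first segment spans at most
         * first_segment_size bytes of the send datatype */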
        COLL_BASE_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng,
                                      first_segment_count );

        ret = MCA_PML_CALL(recv(rbuf, 0, MPI_BYTE, root,
                                MCA_COLL_BASE_TAG_GATHER,
                                comm, MPI_STATUS_IGNORE));
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

        ret = MCA_PML_CALL(send(sbuf, first_segment_count, sdtype, root,
                                MCA_COLL_BASE_TAG_GATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

        ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count,
                                (scount - first_segment_count), sdtype,
                                root, MCA_COLL_BASE_TAG_GATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

    } else {

        /* Root process:
           - for every non-root node:
             - post irecv for the first segment of the message,
             - send a zero byte message to signal the node to send its data,
             - post irecv for the second segment of the message,
             - wait for the first segment to complete;
           - copy local data if necessary;
           - waitall for all the second segments to complete.
        */
        char *ptmp;
        ompi_request_t *first_segment_req;
        reqs = ompi_coll_base_comm_get_reqs(module->base_data, size);
        if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; }

        ompi_datatype_type_size(rdtype, &typelng);
        ompi_datatype_get_extent(rdtype, &lb, &extent);
        first_segment_count = rcount;
        COLL_BASE_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
                                      first_segment_count );

        ptmp = (char *) rbuf;
        for (i = 0; i < size; ++i) {
            if (i == rank) {
                /* skip myself */
                reqs[i] = MPI_REQUEST_NULL;
                continue;
            }

            /* irecv for the first segment from i */
            ptmp = (char*)rbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * extent;
            ret = MCA_PML_CALL(irecv(ptmp, first_segment_count, rdtype, i,
                                     MCA_COLL_BASE_TAG_GATHER, comm,
                                     &first_segment_req));
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            /* send sync message */
            ret = MCA_PML_CALL(send(rbuf, 0, MPI_BYTE, i,
                                    MCA_COLL_BASE_TAG_GATHER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            /* irecv for the second segment */
            ptmp = (char*)rbuf + ((ptrdiff_t)i * (ptrdiff_t)rcount + first_segment_count) * extent;
            ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count),
                                     rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm,
                                     &reqs[i]));
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            /* wait on the first segment to complete */
            ret = ompi_request_wait(&first_segment_req, MPI_STATUS_IGNORE);
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }

        /* copy local data if necessary */
        if (MPI_IN_PLACE != sbuf) {
            ret = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                       (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent,
                                       rcount, rdtype);
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }

        /* wait for all the second segments to complete */
        ret = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE);
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
    }

    /* All done */
    return MPI_SUCCESS;
 error_hndl:
    if (NULL != reqs) {
        /* find a real error code */
        if (MPI_ERR_IN_STATUS == ret) {
            for( i = 0; i < size; i++ ) {
                if (MPI_REQUEST_NULL == reqs[i]) continue;
                if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
                ret = reqs[i]->req_status.MPI_ERROR;
                break;
            }
        }
        ompi_coll_base_free_reqs(reqs, size);
    }
    OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
                   "ERROR_HNDL: node %d file %s line %d error %d\n",
                   rank, __FILE__, line, ret ));
    (void)line;  // silence compiler warning
    return ret;
}

/*
 * Linear functions are copied from the BASIC coll module.  They do not
 * segment the message and are simple implementations, but for small
 * numbers of nodes and/or small data sizes they are just as fast as the
 * base/tree-based segmenting operations, and as such may be selected by
 * the decision functions.  They are copied into this module because of
 * the way modules are selected in V1; in V2 this will be handled
 * differently and the code will not have to be duplicated.
 * JPG following the examples from other coll_base implementations. Dec06.
 */

/* copied function (with appropriate renaming) starts here */
/*
 *      gather_intra
 *
 *      Function:       - basic gather operation
 *      Accepts:        - same arguments as MPI_Gather()
 *      Returns:        - MPI_SUCCESS or error code
 */
int
ompi_coll_base_gather_intra_basic_linear(const void *sbuf, int scount,
                                          struct ompi_datatype_t *sdtype,
                                          void *rbuf, int rcount,
                                          struct ompi_datatype_t *rdtype,
                                          int root,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module)
{
    int i, err, rank, size;
    char *ptmp;
    MPI_Aint incr, extent, lb;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    /* Everyone but root sends data and returns. */
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_gather_intra_basic_linear rank %d", rank));

    if (rank != root) {
        return MCA_PML_CALL(send(sbuf, scount, sdtype, root,
                                 MCA_COLL_BASE_TAG_GATHER,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
    }

    /* I am the root, loop receiving the data. */

    ompi_datatype_get_extent(rdtype, &lb, &extent);
    incr = extent * (ptrdiff_t)rcount;
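    /* rank i's contribution lands at offset i * rcount * extent in rbuf;
     * ptmp is advanced by that increment on every iteration */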
    for (i = 0, ptmp = (char *) rbuf; i < size; ++i, ptmp += incr) {
        if (i == rank) {
            if (MPI_IN_PLACE != sbuf) {
                err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                           ptmp, rcount, rdtype);
            } else {
                err = MPI_SUCCESS;
            }
        } else {
            err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, i,
                                    MCA_COLL_BASE_TAG_GATHER,
                                    comm, MPI_STATUS_IGNORE));
        }
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* All done */
    return MPI_SUCCESS;
}


/* copied function (with appropriate renaming) ends here */
