root/ompi/mca/coll/portals4/coll_portals4_gather.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. get_tree_numdescendants_of
  2. ompi_coll_portals4_build_in_order_bmtree
  3. ompi_coll_portals4_destroy_tree
  4. setup_gather_buffers_binomial
  5. setup_gather_buffers_linear
  6. setup_gather_handles
  7. setup_sync_handles
  8. cleanup_gather_handles
  9. cleanup_sync_handles
  10. ompi_coll_portals4_gather_intra_binomial_top
  11. ompi_coll_portals4_gather_intra_linear_top
  12. ompi_coll_portals4_gather_intra_binomial_bottom
  13. ompi_coll_portals4_gather_intra_linear_bottom
  14. ompi_coll_portals4_gather_intra
  15. ompi_coll_portals4_igather_intra
  16. ompi_coll_portals4_igather_intra_fini

   1 /*
   2  * Copyright (c) 2015      Sandia National Laboratories. All rights reserved.
   3  * $COPYRIGHT$
   4  * 
   5  * Additional copyrights may follow
   6  * 
   7  * $HEADER$
   8  */
   9 
  10 
  11 #include "ompi_config.h"
  12 
  13 #include "mpi.h"
  14 #include "ompi/constants.h"
  15 #include "ompi/datatype/ompi_datatype.h"
  16 #include "opal/util/bit_ops.h"
  17 #include "ompi/mca/pml/pml.h"
  18 #include "ompi/mca/coll/coll.h"
  19 #include "ompi/mca/coll/base/base.h"
  20 
  21 #include "coll_portals4.h"
  22 #include "coll_portals4_request.h"
  23 
  24 #include <string.h> // included for ffs in get_tree_numdescendants_of
  25 
  26 #undef RTR_USES_TRIGGERED_PUT
  27 
  28 
  29 #define VRANK(ra, ro, si) ((ra - ro + si) % si)
  30 
  31 /*
  32  * Borrowed with thanks from the coll-tuned component, then modified for Portals4.
  33  *
  34  *
  35  * Constructs in-order binomial tree which can be used for gather/scatter
  36  * operations.
  37  *
  38  * Here are some of the examples of this tree:
  39  * size = 2                    size = 4                 size = 8
  40  *      0                           0                        0
  41  *     /                          / |                      / | \
  42  *    1                          1  2                     1  2  4
  43  *                                  |                        |  | \
  44  *                                  3                        3  5  6
  45  *                                                                 |
  46  *                                                                 7
  47  *
  48  * size = 16
  49  *      0
  50  *    / | \        \
  51  *   1  2  4        8
  52  *      |  | \    / | \
  53  *      3  5  6  9  10 12
  54  *            |     |  | \
  55  *            7     11 13 14
  56  *                        |
  57  *                        15
  58  */
  59 
  60 static int32_t get_tree_numdescendants_of(struct ompi_communicator_t* comm,
  61                                           int vrank)
  62 {
  63     int max;
  64     int size = ompi_comm_size(comm);
  65 
  66     if (0  == vrank) {
  67         return  size - 1;
  68     } else {
  69         max = 1 << ffs(vrank - 1);
  70         return ((vrank + max <= size ) ? max : size - vrank) -1;
  71     }
  72 
  73 }
  74 
  75 static ompi_coll_portals4_tree_t*
  76 ompi_coll_portals4_build_in_order_bmtree( struct ompi_communicator_t* comm,
  77                                             int root )
  78 {
  79     int childs = 0, rank, vrank, vparent, size, mask = 1, remote, i;
  80     ompi_coll_portals4_tree_t *bmtree;
  81 
  82     /*
  83      * Get size and rank of the process in this communicator
  84      */
  85     size = ompi_comm_size(comm);
  86     rank = ompi_comm_rank(comm);
  87 
  88     vrank = VRANK(rank, root, size);
  89 
  90     bmtree = (ompi_coll_portals4_tree_t*)malloc(sizeof(ompi_coll_portals4_tree_t));
  91     if (!bmtree) {
  92         opal_output(ompi_coll_base_framework.framework_output,
  93                     "coll:portals4:build_bmtree PANIC out of memory");
  94         return NULL;
  95     }
  96 
  97     bmtree->tree_bmtree   = 1;
  98     bmtree->tree_root     = MPI_UNDEFINED;
  99     bmtree->tree_nextsize = MPI_UNDEFINED;
 100     for(i=0;i<MAXTREEFANOUT;i++) {
 101         bmtree->tree_next[i] = -1;
 102     }
 103 
 104     if (root == rank) {
 105         bmtree->tree_prev = root;
 106     }
 107 
 108     while (mask < size) {
 109         remote = vrank ^ mask;
 110         if (remote < vrank) {
 111             bmtree->tree_prev = (remote + root) % size;
 112             break;
 113         } else if (remote < size) {
 114             bmtree->tree_next[childs] = (remote + root) % size;
 115             childs++;
 116             if (childs==MAXTREEFANOUT) {
 117                 opal_output(ompi_coll_base_framework.framework_output,
 118                              "coll:portals4:build_bmtree max fanout incorrect %d needed %d",
 119                              MAXTREEFANOUT, childs);
 120                 return NULL;
 121             }
 122         }
 123         mask <<= 1;
 124     }
 125     bmtree->tree_nextsize = childs;
 126     bmtree->tree_root     = root;
 127 
 128     vparent = VRANK(bmtree->tree_prev, root, size);
 129     if (root == rank) {
 130         bmtree->tree_numdescendants = size - 1;
 131     } else if (bmtree->tree_nextsize > 0) {
 132         int possible_descendants = vrank - vparent - 1;
 133         if ((vrank + possible_descendants) > size) {
 134             bmtree->tree_numdescendants = size - vrank - 1;
 135         } else {
 136             bmtree->tree_numdescendants = possible_descendants;
 137         }
 138     } else {
 139         bmtree->tree_numdescendants = 0;
 140     }
 141 
 142     opal_output_verbose(30, ompi_coll_base_framework.framework_output,
 143                         "%d: bmtree result - size(%d)  rank(%d)  vrank(%d)  root(%d)  parent(%d) vparent(%d)  numkids(%d)  numdescendants(%d)",
 144                         __LINE__,
 145                         size, rank, vrank, bmtree->tree_root, bmtree->tree_prev, vparent, bmtree->tree_nextsize, bmtree->tree_numdescendants);
 146 
 147     return bmtree;
 148 }
 149 
 150 /*
 151  * Borrowed with thanks from the coll-tuned component.
 152  */
 153 static int
 154 ompi_coll_portals4_destroy_tree( ompi_coll_portals4_tree_t** tree )
 155 {
 156     ompi_coll_portals4_tree_t *ptr;
 157 
 158     if ((!tree)||(!*tree)) {
 159         return OMPI_SUCCESS;
 160     }
 161 
 162     ptr = *tree;
 163 
 164     free (ptr);
 165     *tree = NULL;   /* mark tree as gone */
 166 
 167     return OMPI_SUCCESS;
 168 }
 169 
 170 
 171 static int
 172 setup_gather_buffers_binomial(struct ompi_communicator_t   *comm,
 173                               ompi_coll_portals4_request_t *request,
 174                               mca_coll_portals4_module_t   *portals4_module)
 175 {
 176     int ret, line;
 177 
 178     uint32_t iov_count = 1;
 179     struct iovec iov;
 180     size_t max_data;
 181 
 182     ompi_coll_portals4_tree_t* bmtree = portals4_module->cached_in_order_bmtree;
 183 
 184     int vrank = VRANK(request->u.gather.my_rank, request->u.gather.root_rank, request->u.gather.size);
 185 
 186     ompi_coll_portals4_create_send_converter (&request->u.gather.send_converter,
 187                                               request->u.gather.pack_src_buf + request->u.gather.pack_src_offset,
 188                                               ompi_comm_peer_lookup(comm, request->u.gather.my_rank),
 189                                               request->u.gather.pack_src_count,
 190                                               request->u.gather.pack_src_dtype);
 191     opal_convertor_get_packed_size(&request->u.gather.send_converter, &request->u.gather.packed_size);
 192 
 193     /**********************************/
 194     /* Setup Gather Buffers           */
 195     /**********************************/
 196     if (vrank == 0) {
 197         request->u.gather.gather_bytes=request->u.gather.packed_size * (ptrdiff_t)request->u.gather.size;
 198 
 199         /*
 200          * root node, needs to allocate temp buffer to gather
 201          * packed bytes from all nodes including self.
 202          * rotate will occur after transfer during unpack.
 203          */
 204         request->u.gather.gather_buf = (char *) malloc(request->u.gather.gather_bytes);
 205         if (NULL == request->u.gather.gather_buf) {
 206             ret = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hdlr;
 207         }
 208         request->u.gather.free_after = 1;
 209 
 210         /* pack local data into request->u.gather.gather_buf */
 211         iov.iov_len = request->u.gather.gather_bytes;
 212         iov.iov_base = (IOVBASE_TYPE *) request->u.gather.gather_buf;
 213         opal_convertor_pack(&request->u.gather.send_converter, &iov, &iov_count, &max_data);
 214 
 215         opal_output_verbose(30, ompi_coll_base_framework.framework_output,
 216                             "%s:%d:vrank(%d): root - gather_buf(%p) - gather_bytes(%lu)=packed_size(%ld) * size(%d)",
 217                             __FILE__, __LINE__, vrank,
 218                             request->u.gather.gather_buf, request->u.gather.gather_bytes,
 219                             request->u.gather.packed_size, request->u.gather.size);
 220     } else if (bmtree->tree_nextsize) {
 221         /*
 222          * other non-leaf nodes, allocate temp buffer to receive data from
 223          * children.  we need space for data from tree_numdescendants + 1
 224          * processes.
 225          */
 226         request->u.gather.gather_bytes=request->u.gather.packed_size * ((ptrdiff_t)bmtree->tree_numdescendants + 1);
 227 
 228         request->u.gather.gather_buf = (char *) malloc(request->u.gather.gather_bytes);
 229         if (NULL == request->u.gather.gather_buf) {
 230             ret = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hdlr;
 231         }
 232         request->u.gather.free_after = 1;
 233 
 234         iov.iov_len = request->u.gather.gather_bytes;
 235         iov.iov_base = (IOVBASE_TYPE *) request->u.gather.gather_buf;
 236         opal_convertor_pack(&request->u.gather.send_converter, &iov, &iov_count, &max_data);
 237 
 238         opal_output_verbose(30, ompi_coll_base_framework.framework_output,
 239                             "%s:%d:vrank(%d): nonleaf - gather_buf(%p) - gather_bytes(%lu)=packed_size(%ld) * (bmtree->tree_numdescendants(%d) + 1)",
 240                             __FILE__, __LINE__, vrank,
 241                             request->u.gather.gather_buf, request->u.gather.gather_bytes,
 242                             request->u.gather.packed_size, bmtree->tree_numdescendants);
 243     } else {
 244         /* leaf nodes, allocate space to pack into and put from */
 245         request->u.gather.gather_bytes=request->u.gather.packed_size;
 246 
 247         request->u.gather.gather_buf = (char *) malloc(request->u.gather.gather_bytes);
 248         if (NULL == request->u.gather.gather_buf) {
 249             ret = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hdlr;
 250         }
 251         request->u.gather.free_after = 1;
 252 
 253         iov.iov_len = request->u.gather.gather_bytes;
 254         iov.iov_base = (IOVBASE_TYPE *) request->u.gather.gather_buf;
 255         opal_convertor_pack(&request->u.gather.send_converter, &iov, &iov_count, &max_data);
 256 
 257         opal_output_verbose(30, ompi_coll_base_framework.framework_output,
 258                             "%s:%d:vrank(%d): leaf - gather_buf(%p) - gather_bytes(%lu)=packed_size(%ld)",
 259                             __FILE__, __LINE__, vrank,
 260                             request->u.gather.gather_buf, request->u.gather.gather_bytes,
 261                             request->u.gather.packed_size);
 262     }
 263 
 264     return OMPI_SUCCESS;
 265 
 266 err_hdlr:
 267     opal_output(ompi_coll_base_framework.framework_output,
 268                 "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
 269                 __FILE__, __LINE__, line, ret, request->u.gather.my_rank);
 270 
 271     return ret;
 272 }
 273 
 274 static int
 275 setup_gather_buffers_linear(struct ompi_communicator_t   *comm,
 276                             ompi_coll_portals4_request_t *request,
 277                             mca_coll_portals4_module_t   *portals4_module)
 278 {
 279     int ret, line;
 280 
 281     uint32_t iov_count = 1;
 282     struct iovec iov;
 283     size_t max_data;
 284 
 285     int8_t i_am_root = (request->u.gather.my_rank == request->u.gather.root_rank);
 286 
 287     ompi_coll_portals4_create_send_converter (&request->u.gather.send_converter,
 288                                               request->u.gather.pack_src_buf + request->u.gather.pack_src_offset,
 289                                               ompi_comm_peer_lookup(comm, request->u.gather.my_rank),
 290                                               request->u.gather.pack_src_count,
 291                                               request->u.gather.pack_src_dtype);
 292     opal_convertor_get_packed_size(&request->u.gather.send_converter, &request->u.gather.packed_size);
 293 
 294     /**********************************/
 295     /* Setup Gather Buffers           */
 296     /**********************************/
 297     if (i_am_root) {
 298         request->u.gather.gather_bytes=request->u.gather.packed_size * (ptrdiff_t)request->u.gather.size;
 299 
 300         /*
 301          * root node, needs to allocate temp buffer to gather
 302          * packed bytes from all nodes including self.
 303          */
 304         request->u.gather.gather_buf = (char *) malloc(request->u.gather.gather_bytes);
 305         if (NULL == request->u.gather.gather_buf) {
 306             ret = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hdlr;
 307         }
 308         request->u.gather.free_after = 1;
 309 
 310         /* pack local data into request->u.gather.gather_buf */
 311         uint64_t gather_buf_offset = (ptrdiff_t)request->u.gather.my_rank * request->u.gather.packed_size;
 312         iov.iov_len = request->u.gather.gather_bytes - gather_buf_offset;
 313         iov.iov_base = (IOVBASE_TYPE *) (request->u.gather.gather_buf + gather_buf_offset);
 314         opal_convertor_pack(&request->u.gather.send_converter, &iov, &iov_count, &max_data);
 315 
 316         opal_output_verbose(30, ompi_coll_base_framework.framework_output,
 317                             "%s:%d:rank(%d): root - gather_buf(%p) - gather_bytes(%lu)=packed_size(%ld) * size(%d)",
 318                             __FILE__, __LINE__, request->u.gather.my_rank,
 319                             request->u.gather.gather_buf, request->u.gather.gather_bytes,
 320                             request->u.gather.packed_size, request->u.gather.size);
 321     } else {
 322         /* non-root nodes, allocate space to pack into and put from */
 323         request->u.gather.gather_bytes=request->u.gather.packed_size;
 324         request->u.gather.gather_buf = (char *) malloc(request->u.gather.gather_bytes);
 325         if (NULL == request->u.gather.gather_buf) {
 326             ret = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hdlr;
 327         }
 328         request->u.gather.free_after = 1;
 329 
 330         iov.iov_len = request->u.gather.gather_bytes;
 331         iov.iov_base = (IOVBASE_TYPE *) request->u.gather.gather_buf;
 332         opal_convertor_pack(&request->u.gather.send_converter, &iov, &iov_count, &max_data);
 333 
 334         opal_output_verbose(30, ompi_coll_base_framework.framework_output,
 335                             "%s:%d:rank(%d): leaf - gather_buf(%p) - gather_bytes(%lu)=packed_size(%ld)",
 336                             __FILE__, __LINE__, request->u.gather.my_rank,
 337                             request->u.gather.gather_buf, request->u.gather.gather_bytes,
 338                             request->u.gather.packed_size);
 339     }
 340 
 341     return OMPI_SUCCESS;
 342 
 343 err_hdlr:
 344     opal_output(ompi_coll_base_framework.framework_output,
 345                 "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
 346                 __FILE__, __LINE__, line, ret, request->u.gather.my_rank);
 347 
 348     return ret;
 349 }
 350 
 351 static int
 352 setup_gather_handles(struct ompi_communicator_t   *comm,
 353                      ompi_coll_portals4_request_t *request,
 354                      mca_coll_portals4_module_t   *portals4_module)
 355 {
 356     int ret, line;
 357 
 358     ptl_me_t  me;
 359 
 360     /**********************************/
 361     /* Setup Gather Handles           */
 362     /**********************************/
 363     COLL_PORTALS4_SET_BITS(request->u.gather.gather_match_bits, ompi_comm_get_cid(comm),
 364             0, 0, COLL_PORTALS4_GATHER, 0, request->u.gather.coll_count);
 365 
 366     ret = PtlCTAlloc(mca_coll_portals4_component.ni_h,
 367                      &request->u.gather.gather_cth);
 368     if (PTL_OK != ret) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; line = __LINE__; goto err_hdlr; }
 369 
 370     request->u.gather.gather_mdh = mca_coll_portals4_component.data_md_h;
 371     request->u.gather.gather_offset = (ptl_size_t)request->u.gather.gather_buf;
 372 
 373     /* children put here */
 374     me.start = request->u.gather.gather_buf;
 375     me.length = request->u.gather.gather_bytes;
 376     me.ct_handle = request->u.gather.gather_cth;
 377     me.min_free = 0;
 378     me.uid = mca_coll_portals4_component.uid;
 379     me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE |
 380         PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE |
 381         PTL_ME_EVENT_CT_COMM;
 382     me.match_id.phys.nid = PTL_NID_ANY;
 383     me.match_id.phys.pid = PTL_PID_ANY;
 384     me.match_bits = request->u.gather.gather_match_bits;
 385     me.ignore_bits = 0;
 386     ret = PtlMEAppend(mca_coll_portals4_component.ni_h,
 387                       mca_coll_portals4_component.pt_idx,
 388                       &me,
 389                       PTL_PRIORITY_LIST,
 390                       NULL,
 391                       &request->u.gather.gather_meh);
 392     if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 393 
 394     return OMPI_SUCCESS;
 395 
 396 err_hdlr:
 397     opal_output(ompi_coll_base_framework.framework_output,
 398                 "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
 399                 __FILE__, __LINE__, line, ret, request->u.gather.my_rank);
 400 
 401     return ret;
 402 }
 403 
 404 static int
 405 setup_sync_handles(struct ompi_communicator_t   *comm,
 406                    ompi_coll_portals4_request_t *request,
 407                    mca_coll_portals4_module_t   *portals4_module)
 408 {
 409     int ret, line;
 410 
 411     ptl_me_t  me;
 412 
 413     /**********************************/
 414     /* Setup Sync Handles             */
 415     /**********************************/
 416     COLL_PORTALS4_SET_BITS(request->u.gather.sync_match_bits, ompi_comm_get_cid(comm),
 417             0, 1, COLL_PORTALS4_GATHER, 0, request->u.gather.coll_count);
 418 
 419     ret = PtlCTAlloc(mca_coll_portals4_component.ni_h,
 420                      &request->u.gather.sync_cth);
 421     if (PTL_OK != ret) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; line = __LINE__; goto err_hdlr; }
 422 
 423     request->u.gather.sync_mdh = mca_coll_portals4_component.zero_md_h;
 424 
 425     me.start = NULL;
 426     me.length = 0;
 427     me.ct_handle = request->u.gather.sync_cth;
 428     me.min_free = 0;
 429     me.uid = mca_coll_portals4_component.uid;
 430     me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE |
 431         PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE |
 432         PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW;
 433     me.match_id.phys.nid = PTL_NID_ANY;
 434     me.match_id.phys.pid = PTL_PID_ANY;
 435     me.match_bits = request->u.gather.sync_match_bits;
 436     me.ignore_bits = 0;
 437     ret = PtlMEAppend(mca_coll_portals4_component.ni_h,
 438                       mca_coll_portals4_component.pt_idx,
 439                       &me,
 440                       PTL_PRIORITY_LIST,
 441                       NULL,
 442                       &request->u.gather.sync_meh);
 443     if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 444 
 445     return OMPI_SUCCESS;
 446 
 447 err_hdlr:
 448     opal_output(ompi_coll_base_framework.framework_output,
 449                 "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
 450                 __FILE__, __LINE__, line, ret, request->u.gather.my_rank);
 451 
 452     return ret;
 453 }
 454 
 455 static int
 456 cleanup_gather_handles(ompi_coll_portals4_request_t *request)
 457 {
 458     int ret, line;
 459 
 460     /**********************************/
 461     /* Cleanup Gather Handles             */
 462     /**********************************/
 463     do {
 464         ret = PtlMEUnlink(request->u.gather.gather_meh);
 465     } while (PTL_IN_USE == ret);
 466     if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 467 
 468     ret = PtlCTFree(request->u.gather.gather_cth);
 469     if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 470 
 471     return OMPI_SUCCESS;
 472 
 473 err_hdlr:
 474     opal_output(ompi_coll_base_framework.framework_output,
 475                 "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
 476                 __FILE__, __LINE__, line, ret, request->u.gather.my_rank);
 477 
 478     return ret;
 479 }
 480 
 481 static int
 482 cleanup_sync_handles(ompi_coll_portals4_request_t *request)
 483 {
 484     int ret, line;
 485 
 486     /**********************************/
 487     /* Cleanup Sync Handles             */
 488     /**********************************/
 489     do {
 490         ret = PtlMEUnlink(request->u.gather.sync_meh);
 491     } while (PTL_IN_USE == ret);
 492     if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 493 
 494     ret = PtlCTFree(request->u.gather.sync_cth);
 495     if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 496 
 497     return OMPI_SUCCESS;
 498 
 499 err_hdlr:
 500     opal_output(ompi_coll_base_framework.framework_output,
 501                 "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
 502                 __FILE__, __LINE__, line, ret, request->u.gather.my_rank);
 503 
 504     return ret;
 505 }
 506 
 507 static int
 508 ompi_coll_portals4_gather_intra_binomial_top(const void *sbuf, int scount, struct ompi_datatype_t *sdtype,
 509                                              void *rbuf, int rcount, struct ompi_datatype_t *rdtype,
 510                                              int root,
 511                                              struct ompi_communicator_t *comm,
 512                                              ompi_coll_portals4_request_t *request,
 513                                              mca_coll_base_module_t *module)
 514 {
 515     mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module;
 516     int ret, line;
 517     ptl_ct_event_t ct;
 518 
 519     ptl_ct_event_t sync_incr_event;
 520 
 521     int vrank=-1;
 522 
 523     int32_t i=0;
 524 
 525     ompi_coll_portals4_tree_t* bmtree;
 526 
 527     int32_t expected_ops =0;
 528     int32_t expected_acks=0;
 529 
 530     ptl_size_t number_of_fragment_gathered = 0;
 531     ptl_size_t number_of_fragment_send = 1;
 532 
 533     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
 534                  "coll:portals4:gather_intra_binomial_top enter rank %d", request->u.gather.my_rank));
 535 
 536     request->type = OMPI_COLL_PORTALS4_TYPE_GATHER;
 537     request->u.gather.gather_buf=NULL;
 538     request->u.gather.gather_mdh=PTL_INVALID_HANDLE;
 539     request->u.gather.gather_cth=PTL_INVALID_HANDLE;
 540     request->u.gather.gather_meh=PTL_INVALID_HANDLE;
 541     request->u.gather.sync_mdh=PTL_INVALID_HANDLE;
 542     request->u.gather.sync_cth=PTL_INVALID_HANDLE;
 543     request->u.gather.sync_meh=PTL_INVALID_HANDLE;
 544 
 545     request->u.gather.my_rank   = ompi_comm_rank(comm);
 546     request->u.gather.size      = ompi_comm_size(comm);
 547     request->u.gather.root_rank = root;
 548     request->u.gather.sbuf      = sbuf;
 549     request->u.gather.rbuf      = rbuf;
 550     if ((root == request->u.gather.my_rank) && (sbuf == MPI_IN_PLACE)) {
 551         request->u.gather.pack_src_buf   = rbuf;
 552         request->u.gather.pack_src_count = rcount;
 553         request->u.gather.pack_src_dtype = rdtype;
 554     } else {
 555         request->u.gather.pack_src_buf    = sbuf;
 556         request->u.gather.pack_src_count  = scount;
 557         request->u.gather.pack_src_dtype  = sdtype;
 558         request->u.gather.pack_src_offset = 0;
 559     }
 560     ompi_datatype_get_extent(request->u.gather.pack_src_dtype,
 561                              &request->u.gather.pack_src_lb,
 562                              &request->u.gather.pack_src_extent);
 563     ompi_datatype_get_true_extent(request->u.gather.pack_src_dtype,
 564                                   &request->u.gather.pack_src_true_lb,
 565                                   &request->u.gather.pack_src_true_extent);
 566     request->u.gather.unpack_dst_buf   = rbuf;
 567     request->u.gather.unpack_dst_count = rcount;
 568     request->u.gather.unpack_dst_dtype = rdtype;
 569     ompi_datatype_get_extent(request->u.gather.unpack_dst_dtype,
 570                              &request->u.gather.unpack_dst_lb,
 571                              &request->u.gather.unpack_dst_extent);
 572     ompi_datatype_get_true_extent(request->u.gather.unpack_dst_dtype,
 573                                   &request->u.gather.unpack_dst_true_lb,
 574                                   &request->u.gather.unpack_dst_true_extent);
 575 
 576     if ((root == request->u.gather.my_rank) && (sbuf == MPI_IN_PLACE)) {
 577         request->u.gather.pack_src_offset = request->u.gather.pack_src_extent * request->u.gather.pack_src_count * request->u.gather.my_rank;
 578     }
 579 
 580     opal_output_verbose(30, ompi_coll_base_framework.framework_output,
 581                         "%s:%d:vrank(%d): request->u.gather.pack_src_offset(%lu)",
 582                         __FILE__, __LINE__, vrank,
 583                         request->u.gather.pack_src_offset);
 584 
 585     /**********************************/
 586     /* Setup Common Parameters        */
 587     /**********************************/
 588 
 589     request->u.gather.coll_count = opal_atomic_add_fetch_size_t(&portals4_module->coll_count, 1);
 590 
 591     COLL_PORTALS4_UPDATE_IN_ORDER_BMTREE( comm, portals4_module, request->u.gather.root_rank );
 592     bmtree = portals4_module->cached_in_order_bmtree;
 593 
 594     vrank = VRANK(request->u.gather.my_rank, request->u.gather.root_rank, request->u.gather.size);
 595 
 596     ret = setup_gather_buffers_binomial(comm, request, portals4_module);
 597     if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
 598 
 599     ret = setup_gather_handles(comm, request, portals4_module);
 600     if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
 601 
 602     ret = setup_sync_handles(comm, request, portals4_module);
 603     if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
 604 
 605     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
 606                          "%s:%d: packed_size=%lu, fragment_size=%lu",
 607                          __FILE__, __LINE__, request->u.gather.packed_size, mca_coll_portals4_component.ni_limits.max_msg_size));
 608 
 609     for (int i =0; i < bmtree->tree_nextsize; i++) {
 610         int child_vrank = VRANK(bmtree->tree_next[i], request->u.gather.root_rank, request->u.gather.size);
 611         int sub_tree_size = get_tree_numdescendants_of(comm, child_vrank) + 1;
 612         ptl_size_t local_number_of_fragment = ((sub_tree_size * request->u.gather.packed_size) + mca_coll_portals4_component.ni_limits.max_msg_size -1) / mca_coll_portals4_component.ni_limits.max_msg_size;
 613 
 614         OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
 615                              "%s:%d: %d is child of %d(%d) with %d descendants (nb_frag += %lu)",
 616                              __FILE__, __LINE__, bmtree->tree_next[i], vrank, request->u.gather.root_rank , sub_tree_size, local_number_of_fragment));
 617         number_of_fragment_gathered += local_number_of_fragment;
 618     }
 619 
 620     number_of_fragment_send = (request->u.gather.gather_bytes + mca_coll_portals4_component.ni_limits.max_msg_size -1) / mca_coll_portals4_component.ni_limits.max_msg_size;
 621 
 622     /***********************************************/
 623     /* Chain the RTR and Recv-ACK to the Gather CT */
 624     /***********************************************/
 625     if (vrank != 0) {
 626         sync_incr_event.success=1;
 627         sync_incr_event.failure=0;
 628         ret = PtlTriggeredCTInc(request->u.gather.gather_cth,
 629                                 sync_incr_event,
 630                                 request->u.gather.sync_cth,
 631                                 1);
 632         if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 633         ret = PtlTriggeredCTInc(request->u.gather.gather_cth,
 634                                 sync_incr_event,
 635                                 request->u.gather.sync_cth,
 636                                 2);
 637         if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 638     }
 639 
 640     /**********************************/
 641     /* do the gather                  */
 642     /**********************************/
 643     if (vrank == 0) {
 644         /* root, so do nothing */
 645 
 646         expected_ops=number_of_fragment_gathered ; /* gather put from each child */
 647         expected_acks=0;
 648 
 649     } else {
 650         int32_t parent = bmtree->tree_prev;
 651         int32_t vparent = VRANK(parent, request->u.gather.root_rank, request->u.gather.size);
 652 
 653         ptl_size_t remote_offset=(vrank-vparent) * request->u.gather.packed_size;
 654 
 655         opal_output_verbose(30, ompi_coll_base_framework.framework_output,
 656                             "%s:%d:vrank(%d): remote_offset(%lu)=(vrank(%d)-vparent(%d)) * packed_size(%ld)",
 657                             __FILE__, __LINE__, vrank,
 658                             remote_offset, vrank, vparent, request->u.gather.packed_size);
 659 
 660         expected_ops=number_of_fragment_gathered + 1; /* gather puts from each child + a chained RTR */
 661         expected_acks=1;                        /* Recv-ACK from parent */
 662 
 663         ptl_size_t size_sent = 0;
 664         ptl_size_t size_left = request->u.gather.gather_bytes;
 665 
 666         for (ptl_size_t i = 0 ; i < number_of_fragment_send; i++) {
 667             ptl_size_t frag_size = (size_left > mca_coll_portals4_component.ni_limits.max_msg_size) ?
 668                 mca_coll_portals4_component.ni_limits.max_msg_size:
 669                 size_left;
 670             ret = PtlTriggeredPut(request->u.gather.gather_mdh,
 671                               request->u.gather.gather_offset + size_sent,
 672                               frag_size,
 673                               PTL_NO_ACK_REQ,
 674                               ompi_coll_portals4_get_peer(comm, parent),
 675                               mca_coll_portals4_component.pt_idx,
 676                               request->u.gather.gather_match_bits,
 677                               remote_offset + size_sent,
 678                               NULL,
 679                               0,
 680                               request->u.gather.gather_cth,
 681                               expected_ops);
 682             if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 683             size_left -= frag_size;
 684             size_sent += frag_size;
 685         }
 686     }
 687 
 688     /************************************/
 689     /* put Recv-ACK to each child       */
 690     /************************************/
 691     for (i=0;i<bmtree->tree_nextsize;i++) {
 692         int32_t child=bmtree->tree_next[i];
 693         ret = PtlTriggeredPut(request->u.gather.sync_mdh,
 694                               0,
 695                               0,
 696                               PTL_NO_ACK_REQ,
 697                               ompi_coll_portals4_get_peer(comm, child),
 698                               mca_coll_portals4_component.pt_idx,
 699                               request->u.gather.sync_match_bits,
 700                               0,
 701                               NULL,
 702                               0,
 703                               request->u.gather.gather_cth,
 704                               expected_ops);
 705         if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 706     }
 707 
 708     expected_ops+=expected_acks;
 709 
 710     if (!request->u.gather.is_sync) {
 711         /******************************************/
 712         /* put to finish pt when all ops complete */
 713         /******************************************/
 714         ret = PtlTriggeredPut(mca_coll_portals4_component.zero_md_h,
 715                 0,
 716                 0,
 717                 PTL_NO_ACK_REQ,
 718                 ompi_coll_portals4_get_peer(comm, request->u.gather.my_rank),
 719                 mca_coll_portals4_component.finish_pt_idx,
 720                 0,
 721                 0,
 722                 NULL,
 723                 (uintptr_t) request,
 724                 request->u.gather.gather_cth,
 725                 expected_ops);
 726         if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 727     }
 728 
 729 #ifdef RTR_USES_TRIGGERED_PUT
 730     /**********************************/
 731     /* put RTR to each child          */
 732     /**********************************/
 733     for (i=0;i<bmtree->tree_nextsize;i++) {
 734         int32_t child=bmtree->tree_next[i];
 735         ret = PtlTriggeredPut(request->u.gather.sync_mdh,
 736                               0,
 737                               0,
 738                               PTL_NO_ACK_REQ,
 739                               ompi_coll_portals4_get_peer(comm, child),
 740                               mca_coll_portals4_component.pt_idx,
 741                               request->u.gather.sync_match_bits,
 742                               0,
 743                               NULL,
 744                               0,
 745                               request->u.gather.sync_cth,
 746                               0);
 747         if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 748     }
 749 #else
 750     /**********************************/
 751     /* put RTR to each child          */
 752     /**********************************/
 753     for (i=0;i<bmtree->tree_nextsize;i++) {
 754         int32_t child=bmtree->tree_next[i];
 755         ret = PtlPut(request->u.gather.sync_mdh,
 756                      0,
 757                      0,
 758                      PTL_NO_ACK_REQ,
 759                      ompi_coll_portals4_get_peer(comm, child),
 760                      mca_coll_portals4_component.pt_idx,
 761                      request->u.gather.sync_match_bits,
 762                      0,
 763                      NULL,
 764                      0);
 765         if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 766     }
 767 #endif
 768 
 769     if (request->u.gather.is_sync) {
 770         opal_output_verbose(10, ompi_coll_base_framework.framework_output,
 771                             "%s:%d:vrank(%d): calling CTWait(expected_ops=%d)\n",
 772                             __FILE__, __LINE__, vrank, expected_ops);
 773 
 774         /********************************/
 775         /* Wait for all ops to complete */
 776         /********************************/
 777         ret = PtlCTWait(request->u.gather.gather_cth, expected_ops, &ct);
 778         if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 779 
 780         opal_output_verbose(10, ompi_coll_base_framework.framework_output,
 781                             "%s:%d:vrank(%d): completed CTWait(expected_ops=%d)\n",
 782                             __FILE__, __LINE__, vrank, expected_ops);
 783     }
 784 
 785     ompi_coll_portals4_destroy_tree(&(portals4_module->cached_in_order_bmtree));
 786 
 787     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
 788                  "coll:portals4:gather_intra_binomial_top exit rank %d", request->u.gather.my_rank));
 789 
 790     return OMPI_SUCCESS;
 791 
 792 err_hdlr:
 793     if (NULL != request->u.gather.gather_buf)
 794         free(request->u.gather.gather_buf);
 795 
 796     ompi_coll_portals4_destroy_tree(&(portals4_module->cached_in_order_bmtree));
 797 
 798     opal_output(ompi_coll_base_framework.framework_output,
 799                 "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
 800                 __FILE__, __LINE__, line, ret, request->u.gather.my_rank);
 801 
 802     return ret;
 803 }
 804 
 805 static int
 806 ompi_coll_portals4_gather_intra_linear_top(const void *sbuf, int scount, struct ompi_datatype_t *sdtype,
 807                                            void *rbuf, int rcount, struct ompi_datatype_t *rdtype,
 808                                            int root,
 809                                            struct ompi_communicator_t *comm,
 810                                            ompi_coll_portals4_request_t *request,
 811                                            mca_coll_base_module_t *module)
 812 {
 813     mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module;
 814     int ret, line;
 815     ptl_ct_event_t ct;
 816 
 817     ptl_ct_event_t sync_incr_event;
 818 
 819     int8_t i_am_root;
 820 
 821     int32_t i=0;
 822 
 823     int32_t expected_ops =0;
 824     int32_t expected_acks=0;
 825 
 826     ptl_size_t number_of_fragment = 1;
 827 
 828     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
 829                  "coll:portals4:gather_intra_linear_top enter rank %d", request->u.gather.my_rank));
 830 
 831     request->type = OMPI_COLL_PORTALS4_TYPE_GATHER;
 832     request->u.gather.gather_buf=NULL;
 833     request->u.gather.gather_mdh=PTL_INVALID_HANDLE;
 834     request->u.gather.gather_cth=PTL_INVALID_HANDLE;
 835     request->u.gather.gather_meh=PTL_INVALID_HANDLE;
 836     request->u.gather.sync_mdh=PTL_INVALID_HANDLE;
 837     request->u.gather.sync_cth=PTL_INVALID_HANDLE;
 838     request->u.gather.sync_meh=PTL_INVALID_HANDLE;
 839 
 840     request->u.gather.my_rank   = ompi_comm_rank(comm);
 841     request->u.gather.size      = ompi_comm_size(comm);
 842     request->u.gather.root_rank = root;
 843     request->u.gather.sbuf      = sbuf;
 844     request->u.gather.rbuf      = rbuf;
 845     if ((root == request->u.gather.my_rank) && (sbuf == MPI_IN_PLACE)) {
 846         request->u.gather.pack_src_buf   = rbuf;
 847         request->u.gather.pack_src_count = rcount;
 848         request->u.gather.pack_src_dtype = rdtype;
 849     } else {
 850         request->u.gather.pack_src_buf    = sbuf;
 851         request->u.gather.pack_src_count  = scount;
 852         request->u.gather.pack_src_dtype  = sdtype;
 853         request->u.gather.pack_src_offset = 0;
 854     }
 855     ompi_datatype_get_extent(request->u.gather.pack_src_dtype,
 856                              &request->u.gather.pack_src_lb,
 857                              &request->u.gather.pack_src_extent);
 858     ompi_datatype_get_true_extent(request->u.gather.pack_src_dtype,
 859                                   &request->u.gather.pack_src_true_lb,
 860                                   &request->u.gather.pack_src_true_extent);
 861     request->u.gather.unpack_dst_buf   = rbuf;
 862     request->u.gather.unpack_dst_count = rcount;
 863     request->u.gather.unpack_dst_dtype = rdtype;
 864     ompi_datatype_get_extent(request->u.gather.unpack_dst_dtype,
 865                              &request->u.gather.unpack_dst_lb,
 866                              &request->u.gather.unpack_dst_extent);
 867     ompi_datatype_get_true_extent(request->u.gather.unpack_dst_dtype,
 868                                   &request->u.gather.unpack_dst_true_lb,
 869                                   &request->u.gather.unpack_dst_true_extent);
 870 
 871     if ((root == request->u.gather.my_rank) && (sbuf == MPI_IN_PLACE)) {
 872         request->u.gather.pack_src_offset = request->u.gather.pack_src_extent * request->u.gather.pack_src_count * request->u.gather.my_rank;
 873     }
 874 
 875     opal_output_verbose(30, ompi_coll_base_framework.framework_output,
 876                         "%s:%d:rank(%d): request->u.gather.pack_src_offset(%lu)",
 877                         __FILE__, __LINE__, request->u.gather.my_rank,
 878                         request->u.gather.pack_src_offset);
 879 
 880     /**********************************/
 881     /* Setup Common Parameters        */
 882     /**********************************/
 883 
 884     i_am_root = (request->u.gather.my_rank == request->u.gather.root_rank);
 885 
 886     request->u.gather.coll_count = opal_atomic_add_fetch_size_t(&portals4_module->coll_count, 1);
 887 
 888     ret = setup_gather_buffers_linear(comm, request, portals4_module);
 889     if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
 890 
 891     ret = setup_gather_handles(comm, request, portals4_module);
 892     if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
 893 
 894     ret = setup_sync_handles(comm, request, portals4_module);
 895     if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
 896 
 897     number_of_fragment = (request->u.gather.packed_size > mca_coll_portals4_component.ni_limits.max_msg_size) ?
 898         (request->u.gather.packed_size + mca_coll_portals4_component.ni_limits.max_msg_size - 1) / mca_coll_portals4_component.ni_limits.max_msg_size :
 899         1;
 900     opal_output_verbose(90, ompi_coll_base_framework.framework_output,
 901         "%s:%d:rank %d:number_of_fragment = %lu",
 902         __FILE__, __LINE__, request->u.gather.my_rank, number_of_fragment);
 903 
 904     /***********************************************/
 905     /* Chain the RTR and Recv-ACK to the Gather CT */
 906     /***********************************************/
 907     if (!i_am_root) {
 908         sync_incr_event.success=1;
 909         sync_incr_event.failure=0;
 910         ret = PtlTriggeredCTInc(request->u.gather.gather_cth,
 911                                 sync_incr_event,
 912                                 request->u.gather.sync_cth,
 913                                 1);
 914         if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 915         ret = PtlTriggeredCTInc(request->u.gather.gather_cth,
 916                                 sync_incr_event,
 917                                 request->u.gather.sync_cth,
 918                                 2);
 919         if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 920     }
 921 
 922     /**********************************/
 923     /* do the gather                  */
 924     /**********************************/
 925     if (i_am_root) {
 926         /* root, so do nothing */
 927 
 928         expected_ops=(request->u.gather.size-1) * number_of_fragment; /* gather put from all other ranks */
 929         expected_acks=0;
 930 
 931     } else {
 932         ptl_size_t remote_offset=request->u.gather.my_rank * request->u.gather.packed_size;
 933         ptl_size_t split_offset = 0;
 934         ptl_size_t size_left = request->u.gather.gather_bytes;
 935 
 936         opal_output_verbose(30, ompi_coll_base_framework.framework_output,
 937                             "%s:%d:rank(%d): remote_offset(%lu)=rank(%d) * packed_size(%ld)",
 938                             __FILE__, __LINE__, request->u.gather.my_rank,
 939                             remote_offset, request->u.gather.my_rank, request->u.gather.packed_size);
 940 
 941         expected_ops=1;  /* chained RTR */
 942         expected_acks=1; /* Recv-ACK from root */
 943 
 944         for (ptl_size_t j=0; j<number_of_fragment; j++) {
 945 
 946             ptl_size_t frag_size = (size_left >  mca_coll_portals4_component.ni_limits.max_msg_size) ?
 947                 mca_coll_portals4_component.ni_limits.max_msg_size :
 948                 size_left;
 949 
 950             opal_output_verbose(10, ompi_coll_base_framework.framework_output,
 951                 "%s:%d:rank(%d): frag(%lu),offset_frag (%lu) frag_size(%lu)",
 952                 __FILE__, __LINE__, request->u.gather.my_rank,
 953                 j, split_offset, frag_size);
 954 
 955             ret = PtlTriggeredPut(request->u.gather.gather_mdh,
 956                               request->u.gather.gather_offset + split_offset,
 957                               frag_size,
 958                               PTL_NO_ACK_REQ,
 959                               ompi_coll_portals4_get_peer(comm, request->u.gather.root_rank),
 960                               mca_coll_portals4_component.pt_idx,
 961                               request->u.gather.gather_match_bits,
 962                               remote_offset + split_offset,
 963                               NULL,
 964                               0,
 965                               request->u.gather.gather_cth,
 966                               expected_ops);
 967             if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 968 
 969             size_left -= frag_size;
 970             split_offset += frag_size;
 971         }
 972     }
 973 
 974     /*****************************************/
 975     /* root puts Recv-ACK to all other ranks */
 976     /*****************************************/
 977     if (i_am_root) {
 978         for (i=0;i<request->u.gather.size;i++) {
 979             if (i == request->u.gather.root_rank) { continue; }
 980             ret = PtlTriggeredPut(request->u.gather.sync_mdh,
 981                                   0,
 982                                   0,
 983                                   PTL_NO_ACK_REQ,
 984                                   ompi_coll_portals4_get_peer(comm, i),
 985                                   mca_coll_portals4_component.pt_idx,
 986                                   request->u.gather.sync_match_bits,
 987                                   0,
 988                                   NULL,
 989                                   0,
 990                                   request->u.gather.gather_cth,
 991                                   expected_ops);
 992             if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 993         }
 994     }
 995 
 996     expected_ops+=expected_acks;
 997 
 998     if (!request->u.gather.is_sync) {
 999         /******************************************/
1000         /* put to finish pt when all ops complete */
1001         /******************************************/
1002         ret = PtlTriggeredPut(mca_coll_portals4_component.zero_md_h,
1003                 0,
1004                 0,
1005                 PTL_NO_ACK_REQ,
1006                 ompi_coll_portals4_get_peer(comm, request->u.gather.my_rank),
1007                 mca_coll_portals4_component.finish_pt_idx,
1008                 0,
1009                 0,
1010                 NULL,
1011                 (uintptr_t) request,
1012                 request->u.gather.gather_cth,
1013                 expected_ops);
1014         if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
1015     }
1016 
1017 #ifdef RTR_USES_TRIGGERED_PUT
1018     /************************************/
1019     /* root puts RTR to all other ranks */
1020     /************************************/
1021     if (i_am_root) {
1022         for (i=0;i<request->u.gather.size;i++) {
1023             if (i == request->u.gather.root_rank) { continue; }
1024             ret = PtlTriggeredPut(request->u.gather.sync_mdh,
1025                                   0,
1026                                   0,
1027                                   PTL_NO_ACK_REQ,
1028                                   ompi_coll_portals4_get_peer(comm, i),
1029                                   mca_coll_portals4_component.pt_idx,
1030                                   request->u.gather.sync_match_bits,
1031                                   0,
1032                                   NULL,
1033                                   0,
1034                                   request->u.gather.sync_cth,
1035                                   0);
1036             if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
1037         }
1038     }
1039 #else
1040     /************************************/
1041     /* root puts RTR to all other ranks */
1042     /************************************/
1043     if (i_am_root) {
1044         for (i=0;i<request->u.gather.size;i++) {
1045             if (i == request->u.gather.root_rank) { continue; }
1046             ret = PtlPut(request->u.gather.sync_mdh,
1047                          0,
1048                          0,
1049                          PTL_NO_ACK_REQ,
1050                          ompi_coll_portals4_get_peer(comm, i),
1051                          mca_coll_portals4_component.pt_idx,
1052                          request->u.gather.sync_match_bits,
1053                          0,
1054                          NULL,
1055                          0);
1056             if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
1057         }
1058     }
1059 #endif
1060 
1061     if (request->u.gather.is_sync) {
1062         opal_output_verbose(10, ompi_coll_base_framework.framework_output,
1063                 "calling CTWait(expected_ops=%d)\n", expected_ops);
1064 
1065         /********************************/
1066         /* Wait for all ops to complete */
1067         /********************************/
1068         ret = PtlCTWait(request->u.gather.gather_cth, expected_ops, &ct);
1069         if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
1070 
1071         opal_output_verbose(10, ompi_coll_base_framework.framework_output,
1072                 "completed CTWait(expected_ops=%d)\n", expected_ops);
1073     }
1074 
1075     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
1076                  "coll:portals4:gather_intra_linear_top exit rank %d", request->u.gather.my_rank));
1077 
1078     return OMPI_SUCCESS;
1079 
1080 err_hdlr:
1081     if (NULL != request->u.gather.gather_buf)
1082         free(request->u.gather.gather_buf);
1083 
1084     opal_output(ompi_coll_base_framework.framework_output,
1085                 "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
1086                 __FILE__, __LINE__, line, ret, request->u.gather.my_rank);
1087 
1088     return ret;
1089 }
1090 
1091 static int
1092 ompi_coll_portals4_gather_intra_binomial_bottom(struct ompi_communicator_t *comm,
1093                                                 ompi_coll_portals4_request_t *request)
1094 {
1095     int ret, line;
1096     int i;
1097 
1098     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
1099                  "coll:portals4:gather_intra_binomial_bottom enter rank %d", request->u.gather.my_rank));
1100 
1101     ret = cleanup_gather_handles(request);
1102     if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
1103 
1104     ret = cleanup_sync_handles(request);
1105     if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
1106 
1107     if (request->u.gather.my_rank == request->u.gather.root_rank) {
1108         uint32_t iov_count = 1;
1109         struct iovec iov;
1110         size_t max_data;
1111 
1112         for (i=0;i<request->u.gather.size;i++) {
1113             uint64_t offset = request->u.gather.unpack_dst_extent * request->u.gather.unpack_dst_count * ((request->u.gather.my_rank + i) % request->u.gather.size);
1114 
1115             opal_output_verbose(30, ompi_coll_base_framework.framework_output,
1116                                 "%s:%d:rank(%d): offset(%lu)",
1117                                 __FILE__, __LINE__, request->u.gather.my_rank,
1118                                 offset);
1119 
1120             ompi_coll_portals4_create_recv_converter (&request->u.gather.recv_converter,
1121                                                       request->u.gather.unpack_dst_buf + offset,
1122                                                       ompi_comm_peer_lookup(comm, request->u.gather.my_rank),
1123                                                       request->u.gather.unpack_dst_count,
1124                                                       request->u.gather.unpack_dst_dtype);
1125 
1126             iov.iov_len = request->u.gather.packed_size;
1127             iov.iov_base = (IOVBASE_TYPE *) ((char *)request->u.gather.gather_buf + (request->u.gather.packed_size*i));
1128             opal_convertor_unpack(&request->u.gather.recv_converter, &iov, &iov_count, &max_data);
1129 
1130             OBJ_DESTRUCT(&request->u.gather.recv_converter);
1131         }
1132     }
1133 
1134     if (request->u.gather.free_after)
1135         free(request->u.gather.gather_buf);
1136 
1137     request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
1138 
1139     ompi_request_complete(&request->super, true);
1140 
1141     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
1142                  "coll:portals4:gather_intra_binomial_bottom exit rank %d", request->u.gather.my_rank));
1143 
1144     return OMPI_SUCCESS;
1145 
1146 err_hdlr:
1147     request->super.req_status.MPI_ERROR = ret;
1148 
1149     if (request->u.gather.free_after)
1150         free(request->u.gather.gather_buf);
1151 
1152     opal_output(ompi_coll_base_framework.framework_output,
1153             "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
1154             __FILE__, __LINE__, line, ret, request->u.gather.my_rank);
1155 
1156     return ret;
1157 }
1158 
1159 static int
1160 ompi_coll_portals4_gather_intra_linear_bottom(struct ompi_communicator_t *comm,
1161                                               ompi_coll_portals4_request_t *request)
1162 {
1163     int ret, line;
1164     int i;
1165 
1166     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
1167                  "coll:portals4:gather_intra_linear_bottom enter rank %d", request->u.gather.my_rank));
1168 
1169     ret = cleanup_gather_handles(request);
1170     if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
1171 
1172     ret = cleanup_sync_handles(request);
1173     if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
1174 
1175     if (request->u.gather.my_rank == request->u.gather.root_rank) {
1176         uint32_t iov_count = 1;
1177         struct iovec iov;
1178         size_t max_data;
1179 
1180         for (i=0;i<request->u.gather.size;i++) {
1181             ompi_coll_portals4_create_recv_converter (&request->u.gather.recv_converter,
1182                                                       request->u.gather.unpack_dst_buf + (request->u.gather.unpack_dst_extent*request->u.gather.unpack_dst_count*i),
1183                                                       ompi_comm_peer_lookup(comm, request->u.gather.my_rank),
1184                                                       request->u.gather.unpack_dst_count,
1185                                                       request->u.gather.unpack_dst_dtype);
1186 
1187             iov.iov_len = request->u.gather.packed_size;
1188             iov.iov_base = (IOVBASE_TYPE *) ((char *)request->u.gather.gather_buf + (request->u.gather.packed_size*i));
1189             opal_convertor_unpack(&request->u.gather.recv_converter, &iov, &iov_count, &max_data);
1190 
1191             OBJ_DESTRUCT(&request->u.gather.recv_converter);
1192         }
1193     }
1194 
1195     if (request->u.gather.free_after)
1196         free(request->u.gather.gather_buf);
1197 
1198     request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
1199 
1200     ompi_request_complete(&request->super, true);
1201 
1202     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
1203                  "coll:portals4:gather_intra_linear_bottom exit rank %d", request->u.gather.my_rank));
1204 
1205     return OMPI_SUCCESS;
1206 
1207 err_hdlr:
1208     request->super.req_status.MPI_ERROR = ret;
1209 
1210     if (request->u.gather.free_after)
1211         free(request->u.gather.gather_buf);
1212 
1213     opal_output(ompi_coll_base_framework.framework_output,
1214             "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
1215             __FILE__, __LINE__, line, ret, request->u.gather.my_rank);
1216 
1217     return ret;
1218 }
1219 
1220 int
1221 ompi_coll_portals4_gather_intra(const void *sbuf, int scount, struct ompi_datatype_t *sdtype,
1222                                 void *rbuf, int rcount, struct ompi_datatype_t *rdtype,
1223                                 int root,
1224                                 struct ompi_communicator_t *comm,
1225                                 mca_coll_base_module_t *module)
1226 {
1227     int ret, line;
1228 
1229     ompi_coll_portals4_request_t *request;
1230 
1231     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
1232                  "coll:portals4:gather_intra enter rank %d", ompi_comm_rank(comm)));
1233 
1234     /*
1235      *  allocate a portals4 request
1236      */
1237     OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, request);
1238     if (NULL == request) {
1239         ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; line = __LINE__; goto err_hdlr;
1240     }
1241     request->u.gather.is_sync = 1;
1242 
1243     /*
1244      *  initiate the gather
1245      *
1246      *  this request is marked synchronous (is_sync==1), so PtlCTWait()
1247      *  will be called to wait for completion.
1248      */
1249     if (1 == mca_coll_portals4_component.use_binomial_gather_algorithm) {
1250         ret = ompi_coll_portals4_gather_intra_binomial_top(sbuf, scount, sdtype,
1251                                                            rbuf, rcount, rdtype,
1252                                                            root,
1253                                                            comm,
1254                                                            request,
1255                                                            module);
1256         if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
1257 
1258         ret = ompi_coll_portals4_gather_intra_binomial_bottom(comm, request);
1259         if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
1260     } else {
1261         ret = ompi_coll_portals4_gather_intra_linear_top(sbuf, scount, sdtype,
1262                                                          rbuf, rcount, rdtype,
1263                                                          root,
1264                                                          comm,
1265                                                          request,
1266                                                          module);
1267         if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
1268 
1269         ret = ompi_coll_portals4_gather_intra_linear_bottom(comm, request);
1270         if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
1271     }
1272 
1273     /*
1274      *  return the portals4 request
1275      */
1276     OMPI_COLL_PORTALS4_REQUEST_RETURN(request);
1277 
1278     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
1279                  "coll:portals4:gather_intra exit rank %d", request->u.gather.my_rank));
1280 
1281     return OMPI_SUCCESS;
1282 
1283 err_hdlr:
1284     opal_output(ompi_coll_base_framework.framework_output,
1285             "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
1286             __FILE__, __LINE__, line, ret, request->u.gather.my_rank);
1287 
1288     return ret;
1289 }
1290 
1291 
1292 int
1293 ompi_coll_portals4_igather_intra(const void *sbuf, int scount, struct ompi_datatype_t *sdtype,
1294                                  void *rbuf, int rcount, struct ompi_datatype_t *rdtype,
1295                                  int root,
1296                                  struct ompi_communicator_t *comm,
1297                                  ompi_request_t **ompi_request,
1298                                  mca_coll_base_module_t *module)
1299 {
1300     int ret, line;
1301 
1302     ompi_coll_portals4_request_t *request;
1303 
1304     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
1305                  "coll:portals4:igather_intra enter rank %d", ompi_comm_rank(comm)));
1306 
1307     /*
1308      *  allocate a portals4 request
1309      */
1310     OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, request);
1311     if (NULL == request) {
1312         ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; line = __LINE__; goto err_hdlr;
1313     }
1314     *ompi_request = &request->super;
1315     request->u.gather.is_sync = 0;
1316 
1317     /*
1318      *  initiate the gather
1319      *
1320      *  this request is marked asynchronous (is_sync==0), so
1321      *  portals4_progress() will handle completion.
1322      */
1323     if (1 == mca_coll_portals4_component.use_binomial_gather_algorithm) {
1324         ret = ompi_coll_portals4_gather_intra_binomial_top(sbuf, scount, sdtype,
1325                                                            rbuf, rcount, rdtype,
1326                                                            root,
1327                                                            comm,
1328                                                            request,
1329                                                            module);
1330         if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
1331     } else {
1332         ret = ompi_coll_portals4_gather_intra_linear_top(sbuf, scount, sdtype,
1333                                                          rbuf, rcount, rdtype,
1334                                                          root,
1335                                                          comm,
1336                                                          request,
1337                                                          module);
1338         if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
1339     }
1340 
1341     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
1342                  "coll:portals4:igather_intra exit rank %d", request->u.gather.my_rank));
1343 
1344     return OMPI_SUCCESS;
1345 
1346 err_hdlr:
1347     opal_output(ompi_coll_base_framework.framework_output,
1348             "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
1349             __FILE__, __LINE__, line, ret, request->u.gather.my_rank);
1350 
1351     return ret;
1352 }
1353 
1354 
1355 int
1356 ompi_coll_portals4_igather_intra_fini(ompi_coll_portals4_request_t *request)
1357 {
1358     int ret, line;
1359 
1360     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
1361                  "coll:portals4:igather_intra_fini enter rank %d", request->u.gather.my_rank));
1362 
1363     /*
1364      *  cleanup the gather
1365      */
1366     if (1 == mca_coll_portals4_component.use_binomial_gather_algorithm) {
1367         ret = ompi_coll_portals4_gather_intra_binomial_bottom(request->super.req_mpi_object.comm, request);
1368         if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
1369     } else {
1370         ret = ompi_coll_portals4_gather_intra_linear_bottom(request->super.req_mpi_object.comm, request);
1371         if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
1372     }
1373 
1374     OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
1375                  "coll:portals4:igather_intra_fini exit rank %d", request->u.gather.my_rank));
1376 
1377     return OMPI_SUCCESS;
1378 
1379 err_hdlr:
1380     opal_output(ompi_coll_base_framework.framework_output,
1381             "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
1382             __FILE__, __LINE__, line, ret, request->u.gather.my_rank);
1383 
1384     return ret;
1385 }

/* [<][>][^][v][top][bottom][index][help] */