root/ompi/mca/coll/portals4/coll_portals4_bcast.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. prepare_bcast_data
  2. post_bcast_data
  3. bcast_kary_tree_top
  4. bcast_pipeline_top
  5. bcast_kary_tree_bottom
  6. bcast_pipeline_bottom
  7. ompi_coll_portals4_bcast_intra
  8. ompi_coll_portals4_ibcast_intra
  9. ompi_coll_portals4_ibcast_intra_fini

   1 /*
   2  * Copyright (c) 2015      Sandia National Laboratories. All rights reserved.
   3  * Copyright (c) 2015      Bull SAS.  All rights reserved.
   4  * $COPYRIGHT$
   5  *
   6  * Additional copyrights may follow
   7  *
   8  * $HEADER$
   9  */
  10 
  11 #include "ompi_config.h"
  12 
  13 #include "coll_portals4.h"
  14 #include "coll_portals4_request.h"
  15 
  16 #include "mpi.h"
  17 #include "ompi/constants.h"
  18 #include "opal/util/bit_ops.h"
  19 #include "ompi/mca/pml/pml.h"
  20 #include "ompi/mca/coll/coll.h"
  21 #include "ompi/mca/coll/base/base.h"
  22 #include "ompi/datatype/ompi_datatype.h"
  23 
  24 /*
  25  * the bcast communication is based on 1 to N scheme
  26  *
  27  */
  28 
  29 #define COLL_PORTALS4_BCAST_MAX_CHILDREN    2 /* arity passed to get_k_ary_tree() and size of the child[] array */
  30 #define COLL_PORTALS4_BCAST_ALGO_THRESHOLD      4 /* more segments than this selects the pipeline algorithm, otherwise the k-ary tree */
  31 
  32 
  33 static int prepare_bcast_data (struct ompi_communicator_t *comm,
  34         void *buff, int count,
  35         struct ompi_datatype_t *datatype, int root,
  36         ompi_coll_portals4_request_t *request) {
  37     int rank = ompi_comm_rank(comm);
  38     int ret;
  39     size_t max_data;
  40     unsigned int iov_count;
  41     struct iovec iovec;
  42 
  43     request->u.bcast.is_root = (rank == root);
  44     request->u.bcast.needs_pack = !ompi_datatype_is_contiguous_memory_layout(datatype, count);
  45 
  46     if (request->u.bcast.needs_pack) {
  47         if (request->u.bcast.is_root) {
  48             OBJ_CONSTRUCT(&request->u.bcast.convertor, opal_convertor_t);
  49             opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor,
  50                     &(datatype->super), count,
  51                     buff, 0, &request->u.bcast.convertor);
  52             opal_convertor_get_packed_size(&request->u.bcast.convertor, &request->u.bcast.tmpsize);
  53             request->u.bcast.tmpbuf = malloc(request->u.bcast.tmpsize);
  54             if (OPAL_UNLIKELY(NULL == request->u.bcast.tmpbuf)) {
  55                 OBJ_DESTRUCT(&request->u.bcast.convertor);
  56                 return opal_stderr("malloc failed", __FILE__, __LINE__, OMPI_ERR_OUT_OF_RESOURCE);
  57             }
  58 
  59             iovec.iov_base = request->u.bcast.tmpbuf;
  60             iovec.iov_len = request->u.bcast.tmpsize;
  61             iov_count = 1;
  62             max_data = request->u.bcast.tmpsize;
  63             ret = opal_convertor_pack(&request->u.bcast.convertor, &iovec, &iov_count, &max_data);
  64             OBJ_DESTRUCT(&request->u.bcast.convertor);
  65             if (OPAL_UNLIKELY(ret < 0)) {
  66                 return opal_stderr("opal_convertor_pack failed", __FILE__, __LINE__, ret);      }
  67         }
  68         else {
  69             OBJ_CONSTRUCT(&request->u.bcast.convertor, opal_convertor_t);
  70             opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor,
  71                     &(datatype->super), count,
  72                     buff, 0, &request->u.bcast.convertor);
  73 
  74             max_data = request->u.bcast.tmpsize;
  75             opal_convertor_get_packed_size(&request->u.bcast.convertor, &max_data);
  76 
  77             request->u.bcast.tmpbuf = malloc(request->u.bcast.tmpsize);
  78             if (OPAL_UNLIKELY(NULL == request->u.bcast.tmpbuf)) {
  79                 OBJ_DESTRUCT(&request->u.bcast.convertor);
  80                 return opal_stderr("malloc failed", __FILE__, __LINE__, OMPI_ERR_OUT_OF_RESOURCE);
  81             }
  82         }
  83     }
  84     else {
  85         request->u.bcast.tmpbuf = buff;
  86 
  87         ompi_datatype_type_size(datatype, &request->u.bcast.tmpsize);
  88         request->u.bcast.tmpsize *= count;
  89     }
  90 
  91     /* Number of segments */
  92     {
  93         size_t max_msg_size = (COLL_PORTALS4_MAX_BW >  mca_coll_portals4_component.ni_limits.max_msg_size) ?
  94             mca_coll_portals4_component.ni_limits.max_msg_size :
  95             COLL_PORTALS4_MAX_BW;
  96 
  97         //TODO : Either make compatible Portals size limits and COLL_PORTALS4_MAX_SEGMENT or remove COLL_PORTALS4_MAX_SEGMENT
  98         request->u.bcast.segment_nb =  (request->u.bcast.tmpsize > max_msg_size) ?
  99             (((request->u.bcast.tmpsize + max_msg_size -1)  / max_msg_size) < COLL_PORTALS4_MAX_SEGMENT ?
 100                 ((request->u.bcast.tmpsize + max_msg_size -1)  / max_msg_size) : COLL_PORTALS4_MAX_SEGMENT) :
 101                     1;
 102 
 103         OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
 104                 "seg_number=%d , seg_size_max=%lu", request->u.bcast.segment_nb, max_msg_size));
 105     }
 106     if (request->u.bcast.segment_nb > COLL_PORTALS4_BCAST_ALGO_THRESHOLD) {
 107         request->u.bcast.algo = OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO;
 108     }
 109     else {
 110         request->u.bcast.algo = OMPI_COLL_PORTALS4_BCAST_KARY_TREE_ALGO;
 111     }
 112     return (OMPI_SUCCESS);
 113 }
 114 
 115 static int post_bcast_data(     ompi_coll_portals4_request_t *request) {
 116 
 117     int ret;
 118     size_t max_data;
 119     unsigned int iov_count;
 120     struct iovec iovec;
 121 
 122     if (request->u.bcast.needs_pack) {
 123         if (!request->u.bcast.is_root) {
 124             opal_convertor_get_packed_size(&request->u.bcast.convertor, &request->u.bcast.tmpsize);
 125 
 126             iovec.iov_base = request->u.bcast.tmpbuf;
 127             iovec.iov_len = request->u.bcast.tmpsize;
 128             iov_count = 1;
 129             ret = opal_convertor_unpack(&request->u.bcast.convertor, &iovec, &iov_count, &max_data);
 130             OBJ_DESTRUCT(&request->u.bcast.convertor);
 131             if (OPAL_UNLIKELY(ret < 0)) {
 132                 return opal_stderr("opal_convertor_unpack failed", __FILE__, __LINE__, ret);
 133             }
 134         }
 135         free(request->u.bcast.tmpbuf);
 136     }
 137     return (OMPI_SUCCESS);
 138 }
 139 
 140 static int
 141 bcast_kary_tree_top(void *buff, int count,
 142         struct ompi_datatype_t *datatype, int root,
 143         struct ompi_communicator_t *comm,
 144         ompi_coll_portals4_request_t *request,
 145         mca_coll_portals4_module_t *portals4_module)
 146 {
 147     bool is_sync = request->is_sync;
 148     int ret;
 149     unsigned int i, seg, seg_size, nb_long;
 150     unsigned int segment_nb = request->u.bcast.segment_nb;
 151     unsigned int child_nb;
 152     int size = ompi_comm_size(comm);
 153     int rank = ompi_comm_rank(comm);
 154     ptl_rank_t parent, child[COLL_PORTALS4_BCAST_MAX_CHILDREN];
 155     size_t internal_count, length, offset;
 156     ptl_handle_md_t zero_md_h, data_md_h;
 157     ptl_handle_me_t me_h;
 158     ptl_ct_event_t ct_inc;
 159     ptl_me_t me;
 160     ptl_match_bits_t match_bits_ack, match_bits_rtr, match_bits;
 161     ptl_ct_event_t ct;
 162     ptl_size_t trig_thr, ack_thr;
 163 
 164     /*
 165      ** Initialization
 166      */
 167 
 168     request->type = OMPI_COLL_PORTALS4_TYPE_BCAST;
 169 
 170     for (i = 0 ; i < COLL_PORTALS4_BCAST_MAX_CHILDREN ; i++) {
 171         child[i] = PTL_INVALID_RANK;
 172     }
 173 
 174     parent = PTL_INVALID_RANK;
 175 
 176     zero_md_h = mca_coll_portals4_component.zero_md_h;
 177     data_md_h = mca_coll_portals4_component.data_md_h;
 178 
 179     internal_count = opal_atomic_add_fetch_size_t(&portals4_module->coll_count, 1);
 180 
 181 
 182     /*
 183      ** DATATYPE and SIZES
 184      */
 185 
 186     get_k_ary_tree(COLL_PORTALS4_BCAST_MAX_CHILDREN,
 187             rank, size, root, &parent, child, &child_nb);
 188     request->u.bcast.u.child_nb = child_nb;
 189 
 190     /*
 191      * TOPOLOGY
 192      */
 193 
 194     /*
 195      * PORTALS4 RESOURCE ALLOCATION
 196      */
 197 
 198     if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.bcast.trig_ct_h)) != 0) {
 199         return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret);
 200     }
 201 
 202     /* Compute match bits */
 203     COLL_PORTALS4_SET_BITS(match_bits_ack, ompi_comm_get_cid(comm), 1, 0,
 204             COLL_PORTALS4_BCAST, 0, internal_count);
 205 
 206     COLL_PORTALS4_SET_BITS(match_bits_rtr, ompi_comm_get_cid(comm), 0, 1,
 207             COLL_PORTALS4_BCAST, 0, internal_count);
 208 
 209     COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), 0, 0,
 210             COLL_PORTALS4_BCAST, 0, internal_count);
 211 
 212     /* The data will be cut in segment_nb segments.
 213      * nb_long segments will have a size of (seg_size + 1)
 214      * and (segment_nb - nb_long) segments will have a size of seg_size
 215      */
 216     seg_size = request->u.bcast.tmpsize / segment_nb;
 217     nb_long = request->u.bcast.tmpsize % segment_nb;
 218     opal_output_verbose(10, ompi_coll_base_framework.framework_output, "seg_size=%d nb_long=%d segment_nb=%d", seg_size, nb_long, segment_nb);
 219 
 220     if (rank != root) {
 221         for (seg = 1, offset = 0, length = 0 ;
 222                 seg <= segment_nb ;
 223                 seg++, offset += length) {
 224 
 225             /* Divide buffer into segments */
 226             if (seg <= nb_long) length = seg_size + 1;
 227             else length = seg_size;
 228 
 229             /*
 230              ** Prepare Data ME
 231              */
 232 
 233             memset(&me, 0, sizeof(ptl_me_t));
 234             me.start = ((uint8_t*) request->u.bcast.tmpbuf) + offset;
 235             me.length = length;
 236             me.ct_handle = request->u.bcast.trig_ct_h;
 237             me.uid = mca_coll_portals4_component.uid;
 238             me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE |
 239                     PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE |
 240                     PTL_ME_USE_ONCE |
 241                     PTL_ME_EVENT_CT_COMM;
 242             me.match_id.phys.nid = PTL_NID_ANY;
 243             me.match_id.phys.pid = PTL_PID_ANY;
 244             me.match_bits = match_bits;
 245             me.ignore_bits = 0;
 246             if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h,
 247                     mca_coll_portals4_component.pt_idx,
 248                     &me,
 249                     PTL_PRIORITY_LIST,
 250                     NULL,
 251                     &me_h)) != 0) {
 252                 return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret);
 253             }
 254         }
 255 
 256         /*
 257          * Send RTR to parent
 258          *
 259          * the root does not to have to do it, since it does not have parent.
 260          * WE can do such an operation by now, since we are able to receive data,
 261          * even if we are not able to receive the others.
 262          *
 263          */
 264 
 265         /* and there, we only send the RTR when all the MEs are ready */
 266         if ((ret = PtlPut(zero_md_h, 0, 0, PTL_NO_ACK_REQ,
 267                 ompi_coll_portals4_get_peer(comm, parent),
 268                 mca_coll_portals4_component.pt_idx, match_bits_rtr,
 269                 0, NULL, 0)) != PTL_OK) {
 270             return opal_stderr("Put RTR failed %d", __FILE__, __LINE__, ret);
 271         }
 272 
 273         /*
 274          * Prepare Triggered Put to ACK Data to parent
 275          *
 276          */
 277 
 278         trig_thr = child_nb ? (segment_nb * 2) :
 279                 segment_nb;
 280 
 281         if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ,
 282                 ompi_coll_portals4_get_peer(comm, parent),
 283                 mca_coll_portals4_component.pt_idx,
 284                 match_bits_ack, 0, NULL, 0,
 285                 request->u.bcast.trig_ct_h, trig_thr)) != 0) {
 286             return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret);
 287         }
 288     }
 289 
 290     if (child_nb) {
 291         if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.bcast.rtr_ct_h)) != 0) {
 292             return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret);
 293         }
 294         ct_inc.success = segment_nb;
 295         ct_inc.failure = 0;
 296 
 297         if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
 298                 request->u.bcast.rtr_ct_h, child_nb)) != 0) {
 299             return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
 300         }
 301 
 302         if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.bcast.ack_ct_h)) != 0) {
 303             return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret);
 304         }
 305 
 306         /*
 307          ** Prepare ME for receiving data ACK Put
 308          ** Priority List
 309          */
 310 
 311         memset(&me, 0, sizeof(ptl_me_t));
 312         me.start = NULL;
 313         me.length = 0;
 314         me.min_free = 0;
 315         me.uid = mca_coll_portals4_component.uid;
 316         me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE |
 317                 PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE |
 318                 PTL_ME_USE_ONCE |
 319                 PTL_ME_EVENT_CT_COMM;
 320         me.match_id.phys.nid = PTL_NID_ANY;
 321         me.match_id.phys.pid = PTL_PID_ANY;
 322         me.match_bits = match_bits_ack;
 323         me.ignore_bits = 0;
 324         me.ct_handle = request->u.bcast.ack_ct_h;
 325 
 326         for (i = 0 ; i < child_nb ; i++) {
 327             if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h,
 328                     mca_coll_portals4_component.pt_idx,
 329                     &me, PTL_PRIORITY_LIST,  NULL,
 330                     &me_h)) != 0) {
 331                 return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret);
 332             }
 333         }
 334 
 335         /*
 336          ** Prepare ME for sending RTR Put
 337          ** Priority List, match also with "Overflow list Me" in coll_portals4_component
 338          */
 339 
 340         memset(&me, 0, sizeof(ptl_me_t));
 341         me.start = NULL;
 342         me.length = 0;
 343         me.min_free = 0;
 344         me.uid = mca_coll_portals4_component.uid;
 345         me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE |
 346                 PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE |
 347                 PTL_ME_USE_ONCE |
 348                 PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW;
 349         me.match_id.phys.nid = PTL_NID_ANY;
 350         me.match_id.phys.pid = PTL_PID_ANY;
 351         me.match_bits = match_bits_rtr;
 352         me.ignore_bits = 0;
 353         me.ct_handle = request->u.bcast.rtr_ct_h;
 354 
 355         for (i = 0 ; i < child_nb ; i++) {
 356             if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h,
 357                     mca_coll_portals4_component.pt_idx,
 358                     &me, PTL_PRIORITY_LIST,
 359                     NULL,
 360                     &me_h)) != 0) {
 361                 return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret);
 362             }
 363         }
 364 
 365         for (seg = 1, offset = 0, length = 0 ;
 366                 seg <= segment_nb ;
 367                 seg++, offset += length) {
 368 
 369             /* Divide buffer into segments */
 370             if (seg <= nb_long) length = seg_size + 1;
 371             else length = seg_size;
 372             opal_output_verbose(10, ompi_coll_base_framework.framework_output,
 373                 "bcast with k-ary tree : segment of size %ld", length);
 374 
 375             /* compute the triggering threshold to send data to the children */
 376             trig_thr = segment_nb + seg - 1; /* To be sure the set of PtlTriggeredPut of DATA will be executed in order */
 377             if (rank != root) trig_thr ++;
 378 
 379             /*
 380              ** Send Data to children
 381              */
 382 
 383             for (i = 0 ; i < COLL_PORTALS4_BCAST_MAX_CHILDREN ; i++) {
 384                 if (child[i] != PTL_INVALID_RANK) {
 385 
 386                     if ((ret = PtlTriggeredPut (data_md_h,
 387                             (uint64_t) request->u.bcast.tmpbuf + offset,
 388                             length, PTL_NO_ACK_REQ,
 389                             ompi_coll_portals4_get_peer(comm, child[i]),
 390                             mca_coll_portals4_component.pt_idx,
 391                             match_bits, 0,
 392                             NULL,
 393                             0, request->u.bcast.trig_ct_h, trig_thr)) != 0) {
 394                         return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret);
 395                     }
 396                 }
 397             }
 398         }
 399 
 400         if (rank == root) {
 401             trig_thr = segment_nb;
 402             ct_inc.success = segment_nb;
 403             ct_inc.failure = 0;
 404 
 405             if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
 406                    request->u.bcast.trig_ct_h, trig_thr)) != 0) {
 407                 return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
 408             }
 409         }
 410 
 411         ack_thr = child_nb;
 412 
 413         if (is_sync) {
 414             if ((ret = PtlCTWait(request->u.bcast.ack_ct_h, ack_thr, &ct)) != 0)
 415                 opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret);
 416         }
 417         else {
 418             if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ,
 419                     ompi_coll_portals4_get_peer(comm, rank),
 420                     mca_coll_portals4_component.finish_pt_idx,
 421                     0, 0, NULL, (uintptr_t) request,
 422                     request->u.bcast.ack_ct_h,
 423                     ack_thr)) != 0) {
 424                 return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret);
 425             }
 426         }
 427     }
 428     else {
 429         /* A leaf of the tree does not need to send data to its children */
 430         request->u.bcast.rtr_ct_h = PTL_INVALID_HANDLE;
 431         request->u.bcast.ack_ct_h = PTL_INVALID_HANDLE;
 432 
 433         /* a leaf does not expect counting events from its children,
 434          * the threshold is computed using the number of segments received
 435          * from the parent
 436          */
 437 
 438         if (rank != root) {
 439             trig_thr = segment_nb;
 440             if (is_sync) {
 441                 /* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice.
 442                    Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered.
 443 
 444                    This is necessary because portals4 does not insure the order in the triggered operations associated
 445                    with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */
 446 
 447                 ct_inc.success = 1;
 448                 ct_inc.failure = 0;
 449 
 450                 if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
 451                         request->u.bcast.trig_ct_h, trig_thr)) != 0) {
 452                     return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
 453                 }
 454 
 455                 if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
 456                         request->u.bcast.trig_ct_h, trig_thr + 1)) != 0) {
 457                     return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
 458                 }
 459 
 460                 if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr + 2, &ct)) != 0) {
 461                     opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret);
 462                 }
 463             }
 464             else {
 465                 if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ,
 466                         ompi_coll_portals4_get_peer(comm, rank),
 467                         mca_coll_portals4_component.finish_pt_idx,
 468                         0, 0, NULL, (uintptr_t) request,
 469                         request->u.bcast.trig_ct_h,
 470                         trig_thr)) != 0) {
 471                     return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret);
 472                 }
 473 
 474             }
 475         }
 476     }
 477     return (OMPI_SUCCESS);
 478 }
 479 
 480 
 481 static int
 482 bcast_pipeline_top(void *buff, int count,
 483         struct ompi_datatype_t *datatype, int root,
 484         struct ompi_communicator_t *comm,
 485         ompi_coll_portals4_request_t *request,
 486         mca_coll_portals4_module_t *portals4_module)
 487 {
 488     bool is_sync = request->is_sync;
 489     int ret;
 490     unsigned int seg, seg_size, nb_long;
 491     unsigned int segment_nb = request->u.bcast.segment_nb;
 492     int size = ompi_comm_size(comm);
 493     int rank = ompi_comm_rank(comm);
 494     ptl_rank_t parent, child;
 495     size_t internal_count, length, offset;
 496     ptl_handle_md_t zero_md_h, data_md_h;
 497     ptl_handle_me_t me_h;
 498     ptl_ct_event_t ct_inc;
 499     ptl_me_t me;
 500     ptl_match_bits_t match_bits_ack, match_bits_rtr, match_bits;
 501     ptl_ct_event_t ct;
 502     ptl_size_t trig_thr;
 503 
 504     /*
 505      ** Initialization
 506      */
 507 
 508     request->type = OMPI_COLL_PORTALS4_TYPE_BCAST;
 509 
 510     child = PTL_INVALID_RANK;
 511     parent = PTL_INVALID_RANK;
 512 
 513     zero_md_h = mca_coll_portals4_component.zero_md_h;
 514     data_md_h = mca_coll_portals4_component.data_md_h;
 515 
 516     internal_count = opal_atomic_add_fetch_size_t(&portals4_module->coll_count, 1);
 517 
 518     /*
 519      ** DATATYPE and SIZES
 520      */
 521 
 522     get_pipeline(rank, size, root, &parent, &child);
 523     request->u.bcast.u.child = child;
 524 
 525     /*
 526      * PORTALS4 RESOURCE ALLOCATION
 527      */
 528 
 529     if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.bcast.trig_ct_h)) != 0) {
 530         return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret);
 531     }
 532 
 533     /* Compute match bits */
 534     COLL_PORTALS4_SET_BITS(match_bits_ack, ompi_comm_get_cid(comm), 1, 0,
 535             COLL_PORTALS4_BCAST, 0, internal_count);
 536 
 537     COLL_PORTALS4_SET_BITS(match_bits_rtr, ompi_comm_get_cid(comm), 0, 1,
 538             COLL_PORTALS4_BCAST, 0, internal_count);
 539 
 540     COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), 0, 0,
 541             COLL_PORTALS4_BCAST, 0, internal_count);
 542     /* The data will be cut in segment_nb segments.
 543      * nb_long segments will have a size of (seg_size + 1)
 544      * and (segment_nb - nb_long) segments will have a size of seg_size
 545      */
 546     seg_size = request->u.bcast.tmpsize / segment_nb;
 547     nb_long = request->u.bcast.tmpsize % segment_nb;
 548     opal_output_verbose(10, ompi_coll_base_framework.framework_output, "seg_size=%d nb_long=%d", seg_size, nb_long);
 549 
 550     if (rank != root) {
 551         for (seg = 1, offset = 0, length = 0 ;
 552                 seg <= segment_nb ;
 553                 seg++, offset += length) {
 554 
 555             /* Divide buffer into segments */
 556             if (seg <= nb_long) length = seg_size + 1;
 557             else length = seg_size;
 558 
 559             /*
 560              ** Prepare Data ME
 561              */
 562 
 563             memset(&me, 0, sizeof(ptl_me_t));
 564             me.start = ((uint8_t*) request->u.bcast.tmpbuf) + offset;
 565             me.length = length;
 566             me.ct_handle = request->u.bcast.trig_ct_h;
 567             me.uid = mca_coll_portals4_component.uid;
 568             me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE |
 569                     PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE |
 570                     PTL_ME_USE_ONCE |
 571                     PTL_ME_EVENT_CT_COMM;
 572             me.match_id.phys.nid = PTL_NID_ANY;
 573             me.match_id.phys.pid = PTL_PID_ANY;
 574             me.match_bits = match_bits;
 575             me.ignore_bits = 0;
 576             if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h,
 577                     mca_coll_portals4_component.pt_idx,
 578                     &me,
 579                     PTL_PRIORITY_LIST,
 580                     NULL,
 581                     &me_h)) != 0) {
 582                 return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret);
 583             }
 584         }
 585 
 586         /*
 587          * Send RTR to parent
 588          *
 589          * the root does not to have to do it, since it does not have parent.
 590          * WE can do such an operation by now, since we are able to receive data,
 591          * even if we are not able to receive the others.
 592          *
 593          */
 594 
 595         /* and there, we only send the RTR when all the MEs are ready */
 596         if ((ret = PtlPut(zero_md_h, 0, 0, PTL_NO_ACK_REQ,
 597                 ompi_coll_portals4_get_peer(comm, parent),
 598                 mca_coll_portals4_component.pt_idx, match_bits_rtr,
 599                 0, NULL, 0)) != PTL_OK) {
 600             return opal_stderr("Put RTR failed %d", __FILE__, __LINE__, ret);
 601         }
 602 
 603         /*
 604          * Prepare Triggered Put to ACK Data to parent
 605          *
 606          */
 607 
 608         trig_thr = (child != PTL_INVALID_RANK) ?
 609                 (segment_nb * 2) :
 610                 segment_nb;
 611 
 612         if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ,
 613                 ompi_coll_portals4_get_peer(comm, parent),
 614                 mca_coll_portals4_component.pt_idx,
 615                 match_bits_ack, 0, NULL, 0,
 616                 request->u.bcast.trig_ct_h, trig_thr)) != 0) {
 617             return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret);
 618         }
 619     }
 620 
 621     if (child != PTL_INVALID_RANK) {
 622         if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.bcast.rtr_ct_h)) != 0) {
 623             return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret);
 624         }
 625 
 626         ct_inc.success = segment_nb;
 627         ct_inc.failure = 0;
 628 
 629         if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
 630                 request->u.bcast.rtr_ct_h, 1)) != 0) {
 631             return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
 632         }
 633 
 634         if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.bcast.ack_ct_h)) != 0) {
 635             return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret);
 636         }
 637 
 638         /*
 639          ** Prepare ME for receiving data ACK Put
 640          ** Priority List
 641          */
 642 
 643         memset(&me, 0, sizeof(ptl_me_t));
 644         me.start = NULL;
 645         me.length = 0;
 646         me.min_free = 0;
 647         me.uid = mca_coll_portals4_component.uid;
 648         me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE |
 649                 PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE |
 650                 PTL_ME_USE_ONCE |
 651                 PTL_ME_EVENT_CT_COMM;
 652         me.match_id.phys.nid = PTL_NID_ANY;
 653         me.match_id.phys.pid = PTL_PID_ANY;
 654         me.match_bits = match_bits_ack;
 655         me.ignore_bits = 0;
 656         me.ct_handle = request->u.bcast.ack_ct_h;
 657 
 658         if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h,
 659                 mca_coll_portals4_component.pt_idx,
 660                 &me, PTL_PRIORITY_LIST,  NULL,
 661                 &me_h)) != 0) {
 662             return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret);
 663         }
 664 
 665         /*
 666          ** Prepare ME for sending RTR Put
 667          ** Priority List, match also with "Overflow list Me" in coll_portals4_component
 668          */
 669 
 670         memset(&me, 0, sizeof(ptl_me_t));
 671         me.start = NULL;
 672         me.length = 0;
 673         me.min_free = 0;
 674         me.uid = mca_coll_portals4_component.uid;
 675         me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE |
 676                 PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE |
 677                 PTL_ME_USE_ONCE |
 678                 PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW;
 679         me.match_id.phys.nid = PTL_NID_ANY;
 680         me.match_id.phys.pid = PTL_PID_ANY;
 681         me.match_bits = match_bits_rtr;
 682         me.ignore_bits = 0;
 683         me.ct_handle = request->u.bcast.rtr_ct_h;
 684 
 685         if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h,
 686                 mca_coll_portals4_component.pt_idx,
 687                 &me, PTL_PRIORITY_LIST,
 688                 NULL,
 689                 &me_h)) != 0) {
 690             return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret);
 691         }
 692 
 693         for (seg = 1, offset = 0, length = 0 ;
 694                 seg <= segment_nb ;
 695                 seg++, offset += length) {
 696 
 697             /* Divide buffer into segments */
 698             if (seg <= nb_long) length = seg_size + 1;
 699             else length = seg_size;
 700             opal_output_verbose(10, ompi_coll_base_framework.framework_output,
 701                 "bcast with pipeline  :  segment of size %ld \n", length);
 702 
 703             /* compute the triggering threshold to send data to the children */
 704             trig_thr = segment_nb + seg - 1; /* To be sure the PtlTriggeredPut will be executed in order */
 705             if (rank != root) trig_thr ++;
 706 
 707             /*
 708              ** Send Data to children
 709              */
 710 
 711             if (child != PTL_INVALID_RANK) {
 712 
 713                 if ((ret = PtlTriggeredPut (data_md_h,
 714                         (uint64_t) request->u.bcast.tmpbuf + offset,
 715                         length, PTL_NO_ACK_REQ,
 716                         ompi_coll_portals4_get_peer(comm, child),
 717                         mca_coll_portals4_component.pt_idx,
 718                         match_bits, 0,
 719                         NULL,
 720                         0, request->u.bcast.trig_ct_h, trig_thr)) != 0) {
 721                     return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret);
 722                 }
 723             }
 724         }
 725         if (rank == root) {
 726             trig_thr = segment_nb;
 727             ct_inc.success = segment_nb;
 728             ct_inc.failure = 0;
 729 
 730             if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
 731                    request->u.bcast.trig_ct_h, trig_thr)) != 0) {
 732                 return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
 733             }
 734         }
 735 
 736         if (is_sync) {
 737             if ((ret = PtlCTWait(request->u.bcast.ack_ct_h, 1, &ct)) != 0) {
 738                 opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret);
 739             }
 740         }
 741         else {
 742             if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ,
 743                     ompi_coll_portals4_get_peer(comm, rank),
 744                     mca_coll_portals4_component.finish_pt_idx,
 745                     0, 0, NULL, (uintptr_t) request,
 746                     request->u.bcast.ack_ct_h,
 747                     1)) != 0) {
 748                 return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret);
 749             }
 750         }
 751     }
 752     else {
 753         /* A leaf of the tree does not need to send data to its children */
 754         request->u.bcast.rtr_ct_h = PTL_INVALID_HANDLE;
 755         request->u.bcast.ack_ct_h = PTL_INVALID_HANDLE;
 756 
 757         /* a leaf does not expect counting events from its children,
 758          * the threshold is computed using the number of segments received
 759          * from the parent
 760          */
 761 
 762         if (rank != root) {
 763             trig_thr = segment_nb;
 764 
 765             if (is_sync) {
 766                 /* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice.
 767                    Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered.
 768 
 769                    This is necessary because portals4 does not insure the order in the triggered operations associated
 770                    with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */
 771 
 772                 ct_inc.success = 1;
 773                 ct_inc.failure = 0;
 774 
 775                 if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
 776                         request->u.bcast.trig_ct_h, trig_thr)) != 0) {
 777                     return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
 778                 }
 779 
 780                 if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
 781                         request->u.bcast.trig_ct_h, trig_thr + 1)) != 0) {
 782                     return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
 783                 }
 784 
 785                 if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr + 2, &ct)) != 0) {
 786                     opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret);
 787                 }
 788             }
 789             else {
 790                 if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ,
 791                         ompi_coll_portals4_get_peer(comm, rank),
 792                         mca_coll_portals4_component.finish_pt_idx,
 793                         0, 0, NULL, (uintptr_t) request,
 794                         request->u.bcast.trig_ct_h,
 795                         trig_thr)) != 0) {
 796                     return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret);
 797                 }
 798             }
 799         }
 800     }
 801 
 802     return (OMPI_SUCCESS);
 803 }
 804 
 805 
 806 
 807 static int
 808 bcast_kary_tree_bottom(ompi_coll_portals4_request_t *request)
 809 {
 810     /* release all Portals4 resources for this request */
 811     if (request->u.bcast.u.child_nb) {
 812         PtlCTFree(request->u.bcast.rtr_ct_h);
 813         PtlCTFree(request->u.bcast.ack_ct_h);
 814     }
 815 
 816     PtlCTFree(request->u.bcast.trig_ct_h);
 817 
 818     return (OMPI_SUCCESS);
 819 }
 820 
 821 
 822 static int
 823 bcast_pipeline_bottom(ompi_coll_portals4_request_t *request)
 824 {
 825     /* release all Portals4 resources for this request */
 826     if (request->u.bcast.u.child != PTL_INVALID_RANK) {
 827         PtlCTFree(request->u.bcast.rtr_ct_h);
 828         PtlCTFree(request->u.bcast.ack_ct_h);
 829     }
 830 
 831     PtlCTFree(request->u.bcast.trig_ct_h);
 832     return (OMPI_SUCCESS);
 833 }
 834 
 835 
 836 int
 837 ompi_coll_portals4_bcast_intra(void *buff, int count,
 838         struct ompi_datatype_t *datatype, int root,
 839         struct ompi_communicator_t *comm,
 840         mca_coll_base_module_t *module)
 841 {
 842     mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module;
 843     ompi_coll_portals4_request_t *request;
 844 
 845     OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, request);
 846     if (NULL == request) {
 847         opal_output_verbose(1, ompi_coll_base_framework.framework_output,
 848                 "%s:%d: request alloc failed\n",
 849                 __FILE__, __LINE__);
 850         return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
 851     }
 852     request->is_sync = true;
 853 
 854     prepare_bcast_data(comm, buff, count, datatype, root, request);
 855 
 856     switch (request->u.bcast.algo) {
 857     case OMPI_COLL_PORTALS4_BCAST_KARY_TREE_ALGO:
 858         bcast_kary_tree_top(buff, count, datatype, root,
 859                 comm, request, portals4_module);
 860         bcast_kary_tree_bottom(request);
 861         break;
 862     case OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO:
 863         bcast_pipeline_top(buff, count, datatype, root,
 864                 comm, request, portals4_module);
 865         bcast_pipeline_bottom(request);
 866         break;
 867     default:
 868         opal_output_verbose(1, ompi_coll_base_framework.framework_output,
 869                 "%s:%d: unknown bcast algorithm %d\n",
 870                 __FILE__, __LINE__, request->u.bcast.algo);
 871         return OMPI_ERROR;
 872     }
 873     post_bcast_data(request);
 874 
 875     OMPI_COLL_PORTALS4_REQUEST_RETURN(request);
 876     return (OMPI_SUCCESS);
 877 }
 878 
 879 
 880 int
 881 ompi_coll_portals4_ibcast_intra(void *buff, int count,
 882         struct ompi_datatype_t *datatype, int root,
 883         struct ompi_communicator_t *comm,
 884         ompi_request_t **ompi_request,
 885         mca_coll_base_module_t *module)
 886 {
 887 
 888     mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module;
 889     ompi_coll_portals4_request_t *request;
 890 
 891     OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, request);
 892     if (NULL == request) {
 893         opal_output_verbose(1, ompi_coll_base_framework.framework_output,
 894                 "%s:%d: request alloc failed\n",
 895                 __FILE__, __LINE__);
 896         return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
 897     }
 898     *ompi_request = &request->super;
 899     request->is_sync = false;
 900 
 901     prepare_bcast_data(comm, buff, count, datatype, root, request);
 902 
 903     switch (request->u.bcast.algo) {
 904     case OMPI_COLL_PORTALS4_BCAST_KARY_TREE_ALGO:
 905         bcast_kary_tree_top(buff, count, datatype, root,
 906                 comm, request, portals4_module);
 907         break;
 908     case OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO:
 909         bcast_pipeline_top(buff, count, datatype, root,
 910                 comm, request, portals4_module);
 911         break;
 912     default:
 913         opal_output_verbose(1, ompi_coll_base_framework.framework_output,
 914                 "%s:%d: unknown bcast algorithm %d\n",
 915                 __FILE__, __LINE__, request->u.bcast.algo);
 916         return OMPI_ERROR;
 917     }
 918 
 919     opal_output_verbose(10, ompi_coll_base_framework.framework_output, "ibcast_intra");
 920     return (OMPI_SUCCESS);
 921 }
 922 
 923 
 924 int
 925 ompi_coll_portals4_ibcast_intra_fini(ompi_coll_portals4_request_t *request)
 926 {
 927 
 928     switch (request->u.bcast.algo) {
 929     case OMPI_COLL_PORTALS4_BCAST_KARY_TREE_ALGO:
 930         bcast_kary_tree_bottom(request);
 931         break;
 932     case OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO:
 933         bcast_pipeline_bottom(request);
 934         break;
 935     default:
 936         opal_output_verbose(1, ompi_coll_base_framework.framework_output,
 937                 "%s:%d: unknown bcast algorithm %d\n",
 938                 __FILE__, __LINE__, request->u.bcast.algo);
 939         return OMPI_ERROR;
 940     }
 941 
 942     post_bcast_data(request);
 943 
 944     ompi_request_complete(&request->super, true);
 945 
 946     opal_output_verbose(10, ompi_coll_base_framework.framework_output, "ibcast_intra_fini");
 947     return (OMPI_SUCCESS);
 948 }

/* [<][>][^][v][top][bottom][index][help] */