root/ompi/mca/pml/ob1/pml_ob1_recvfrag.c


DEFINITIONS

This source file includes the following definitions:
  1. append_frag_to_list
  2. append_frag_to_umq
  3. append_frag_to_ordered_list
  4. remove_head_from_ordered_list
  5. check_cantmatch_for_match
  6. mca_pml_ob1_recv_frag_callback_match
  7. mca_pml_ob1_recv_frag_callback_rndv
  8. mca_pml_ob1_recv_frag_callback_rget
  9. mca_pml_ob1_recv_frag_callback_ack
  10. mca_pml_ob1_recv_frag_callback_frag
  11. mca_pml_ob1_recv_frag_callback_put
  12. mca_pml_ob1_recv_frag_callback_fin
  13. get_posted_recv
  14. get_next_posted_recv
  15. match_incomming
  16. match_incomming_no_any_source
  17. match_one
  18. mca_pml_ob1_recv_frag_match
  19. mca_pml_ob1_recv_frag_match_proc

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2019 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
 * Copyright (c) 2006-2008 University of Houston.  All rights reserved.
 * Copyright (c) 2009-2010 Oracle and/or its affiliates.  All rights reserved.
 * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2018      Sandia National Laboratories
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file
 */

#include "ompi_config.h"

#include "opal/class/opal_list.h"
#include "opal/threads/mutex.h"
#include "opal/prefetch.h"

#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/peruse/peruse-internal.h"
#include "ompi/memchecker.h"
#include "ompi/runtime/ompi_spc.h"

#include "pml_ob1.h"
#include "pml_ob1_comm.h"
#include "pml_ob1_recvfrag.h"
#include "pml_ob1_recvreq.h"
#include "pml_ob1_sendreq.h"
#include "pml_ob1_hdr.h"
#if OPAL_CUDA_SUPPORT
#include "opal/datatype/opal_datatype_cuda.h"
#include "opal/mca/common/cuda/common_cuda.h"
#endif /* OPAL_CUDA_SUPPORT */

OBJ_CLASS_INSTANCE( mca_pml_ob1_buffer_t,
                    opal_free_list_item_t,
                    NULL,
                    NULL );

OBJ_CLASS_INSTANCE( mca_pml_ob1_recv_frag_t,
                    opal_list_item_t,
                    NULL,
                    NULL );

/**
 * Static functions.
 */

/**
 * Append an unexpected descriptor to a queue. This function will allocate and
 * initialize the fragment (if necessary) and then add it to the specified
 * queue. The allocated fragment is not returned to the caller.
 */

static void
append_frag_to_list(opal_list_t *queue, mca_btl_base_module_t *btl,
                    mca_pml_ob1_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
                    size_t num_segments, mca_pml_ob1_recv_frag_t* frag)
{
    if(NULL == frag) {
        MCA_PML_OB1_RECV_FRAG_ALLOC(frag);
        MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl);
    }
    opal_list_append(queue, (opal_list_item_t*)frag);
}

#if MCA_PML_OB1_CUSTOM_MATCH

static void
append_frag_to_umq(custom_match_umq *queue, mca_btl_base_module_t *btl,
                   mca_pml_ob1_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
                   size_t num_segments, mca_pml_ob1_recv_frag_t* frag)
{
  if(NULL == frag) {
    MCA_PML_OB1_RECV_FRAG_ALLOC(frag);
    MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl);
  }
  custom_match_umq_append(queue, hdr->hdr_tag, hdr->hdr_src, frag);
}

#endif


/**
 * Append an unexpected descriptor to an ordered queue.
 *
 * Fragments use their opal_list_item_t to maintain themselves on an ordered
 * list according to their hdr_seq. Special care has been taken to cope with
 * overflowing the uint16_t we use for the hdr_seq. The current algorithm
 * works as long as no two elements with the same hdr_seq are in the list
 * at the same time (i.e., no more than 2^16-1 outstanding out-of-sequence
 * messages). On the vertical layer, messages with contiguous sequence
 * numbers organize themselves in a way to minimize the search space.
 */
void
append_frag_to_ordered_list(mca_pml_ob1_recv_frag_t** queue,
                            mca_pml_ob1_recv_frag_t *frag,
                            uint16_t seq)
{
    mca_pml_ob1_recv_frag_t  *prior, *next;
    mca_pml_ob1_match_hdr_t *hdr;

    frag->super.super.opal_list_next = (opal_list_item_t*)frag;
    frag->super.super.opal_list_prev = (opal_list_item_t*)frag;
    frag->range = NULL;
    hdr = &frag->hdr.hdr_match;

    if( NULL == *queue ) {  /* no pending fragments yet */
        *queue = frag;
        return;
    }

    prior = *queue;
    assert(hdr->hdr_seq != prior->hdr.hdr_match.hdr_seq);

    /* The hdr_seq is only 16 bits long, so it can roll over rather quickly.
     * We need to account for this rollover or the matching will fail.
     * Extract the items from the list to order them safely */
    if( hdr->hdr_seq < prior->hdr.hdr_match.hdr_seq ) {
        uint16_t d1, d2 = prior->hdr.hdr_match.hdr_seq - hdr->hdr_seq;
        do {
            d1 = d2;
            prior = (mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_prev);
            d2 = prior->hdr.hdr_match.hdr_seq - hdr->hdr_seq;
        } while( (hdr->hdr_seq < prior->hdr.hdr_match.hdr_seq) &&
                 (d1 > d2) && (prior != *queue) );
    } else {
        uint16_t prior_seq = prior->hdr.hdr_match.hdr_seq,
        next_seq = ((mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_next))->hdr.hdr_match.hdr_seq;
        /* prevent rollover */
        while( (hdr->hdr_seq > prior_seq) && (hdr->hdr_seq > next_seq) && (prior_seq < next_seq) ) {
            prior_seq = next_seq;
            prior = (mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_next);
            next_seq = ((mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_next))->hdr.hdr_match.hdr_seq;
        }
    }

    /* prior is the fragment with the closest hdr_seq less than the current hdr_seq */
    mca_pml_ob1_recv_frag_t* parent = prior;

    /* Is this fragment the next in range? */
    if( NULL == parent->range ) {
        if( (parent->hdr.hdr_match.hdr_seq + 1) == hdr->hdr_seq ) {
            parent->range = (mca_pml_ob1_recv_frag_t*)frag;
            goto merge_ranges;
        }
        /* all other cases fall back and add the frag after the parent */
    } else {
        /* can we add the frag to the range of the previous fragment? */
        mca_pml_ob1_recv_frag_t* largest = (mca_pml_ob1_recv_frag_t*)parent->range->super.super.opal_list_prev;
        if( (largest->hdr.hdr_match.hdr_seq + 1) == hdr->hdr_seq ) {
            /* the frag belongs to this range */
            frag->super.super.opal_list_prev = (opal_list_item_t*)largest;
            frag->super.super.opal_list_next = largest->super.super.opal_list_next;
            frag->super.super.opal_list_prev->opal_list_next = (opal_list_item_t*)frag;
            frag->super.super.opal_list_next->opal_list_prev = (opal_list_item_t*)frag;
            goto merge_ranges;
        }
        /* all other cases fall back and add the frag after the parent */
    }

    frag->super.super.opal_list_prev = (opal_list_item_t*)prior;
    frag->super.super.opal_list_next = (opal_list_item_t*)prior->super.super.opal_list_next;
    frag->super.super.opal_list_prev->opal_list_next = (opal_list_item_t*)frag;
    frag->super.super.opal_list_next->opal_list_prev = (opal_list_item_t*)frag;
    parent = frag;  /* the frag is not part of a range yet */

    /* if the newly added element is closer to the next expected sequence, mark it so */
    if( parent->hdr.hdr_match.hdr_seq >= seq )
        if( abs(parent->hdr.hdr_match.hdr_seq - seq) < abs((*queue)->hdr.hdr_match.hdr_seq - seq))
            *queue = parent;

 merge_ranges:
    /* is the next element the next hdr_seq in increasing order? */
    next = (mca_pml_ob1_recv_frag_t*)parent->super.super.opal_list_next;
    uint16_t upper = parent->hdr.hdr_match.hdr_seq;
    if( NULL != parent->range ) {
        upper = ((mca_pml_ob1_recv_frag_t*)parent->range->super.super.opal_list_prev)->hdr.hdr_match.hdr_seq;
    }
    if( (upper + 1) == next->hdr.hdr_match.hdr_seq ) {
        /* remove next from the horizontal chain */
        next->super.super.opal_list_next->opal_list_prev = (opal_list_item_t*)parent;
        parent->super.super.opal_list_next = next->super.super.opal_list_next;
        /* merge next with its own range */
        if( NULL != next->range ) {
            next->super.super.opal_list_next = (opal_list_item_t*)next->range;
            next->super.super.opal_list_prev = next->range->super.super.opal_list_prev;
            next->super.super.opal_list_next->opal_list_prev = (opal_list_item_t*)next;
            next->super.super.opal_list_prev->opal_list_next = (opal_list_item_t*)next;
            next->range = NULL;
        } else {
            next->super.super.opal_list_prev = (opal_list_item_t*)next;
            next->super.super.opal_list_next = (opal_list_item_t*)next;
        }
        if( NULL == parent->range ) {
            parent->range = next;
        } else {
            /* we have access to parent->range so make frag its predecessor */
            frag = (mca_pml_ob1_recv_frag_t*)parent->range->super.super.opal_list_prev;
            /* merge the 2 rings such that frag is right before next */
            frag->super.super.opal_list_next = (opal_list_item_t*)next;
            parent->range->super.super.opal_list_prev = next->super.super.opal_list_prev;
            next->super.super.opal_list_prev->opal_list_next = (opal_list_item_t*)parent->range;
            next->super.super.opal_list_prev = (opal_list_item_t*)frag;
        }
        if( next == *queue )
            *queue = parent;
    }
}
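
/*
 * Illustrative sketch (not part of the original source, compiled out): the
 * rollover handling above relies on unsigned 16-bit subtraction, which
 * yields the forward distance between two sequence numbers modulo 2^16.
 * The hypothetical helper below shows the idea in isolation.
 */
#if 0
/* forward distance from 'from' to 'to', wrapping naturally at 2^16 */
static uint16_t seq_distance(uint16_t from, uint16_t to)
{
    return (uint16_t)(to - from);
}

static void seq_distance_demo(void)
{
    assert(seq_distance(10, 12) == 2);      /* no rollover: 2 steps */
    assert(seq_distance(65534, 1) == 3);    /* rollover: 3 steps, not -65533 */
    /* a plain '<' would order 1 before 65534, which is why the walk above
     * tracks the distances (d1, d2) instead of comparing raw values */
}
#endif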

/*
 * Remove the head of the ordered list and restructure the list.
 */
static mca_pml_ob1_recv_frag_t*
remove_head_from_ordered_list(mca_pml_ob1_recv_frag_t** queue)
{
    mca_pml_ob1_recv_frag_t* frag = *queue;
    /* queue is empty, nothing to see. */
    if( NULL == *queue )
        return NULL;
    if( NULL == frag->range ) {
        /* head has no range */
        if( frag->super.super.opal_list_next == (opal_list_item_t*)frag ) {
            /* head points to itself, meaning it is the only
             * one in this queue. We set the new head to NULL */
            *queue = NULL;
        } else {
            /* make the next one the new head. */
            *queue = (mca_pml_ob1_recv_frag_t*)frag->super.super.opal_list_next;
            frag->super.super.opal_list_next->opal_list_prev = frag->super.super.opal_list_prev;
            frag->super.super.opal_list_prev->opal_list_next = frag->super.super.opal_list_next;
        }
    } else {
        /* head has a range */
        mca_pml_ob1_recv_frag_t* range = frag->range;
        frag->range = NULL;
        *queue = (mca_pml_ob1_recv_frag_t*)range;
        if( range->super.super.opal_list_next == (opal_list_item_t*)range ) {
            /* the range has no next element */
            assert( range->super.super.opal_list_prev == (opal_list_item_t*)range );
            range->range = NULL;
        } else {
            range->range = (mca_pml_ob1_recv_frag_t*)range->super.super.opal_list_next;
            /* remove the range from the vertical chain */
            range->super.super.opal_list_next->opal_list_prev = range->super.super.opal_list_prev;
            range->super.super.opal_list_prev->opal_list_next = range->super.super.opal_list_next;
        }
        /* replace frag with range in the horizontal chain if not the only element */
        if( frag->super.super.opal_list_next == (opal_list_item_t*)frag ) {
            range->super.super.opal_list_next = (opal_list_item_t*)range;
            range->super.super.opal_list_prev = (opal_list_item_t*)range;
        } else {
            range->super.super.opal_list_next = frag->super.super.opal_list_next;
            range->super.super.opal_list_prev = frag->super.super.opal_list_prev;
            range->super.super.opal_list_next->opal_list_prev = (opal_list_item_t*)range;
            range->super.super.opal_list_prev->opal_list_next = (opal_list_item_t*)range;
        }
    }
    frag->super.super.opal_list_next = NULL;
    frag->super.super.opal_list_prev = NULL;
    return frag;
}
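
/*
 * Illustrative sketch (not part of the original source, compiled out): the
 * ordered queue is a two-level structure. A horizontal ring links the first
 * fragment of each run of consecutive sequence numbers, and each of those
 * fragments points through 'range' to a vertical ring holding the rest of
 * its run. Assuming f3, f4 and f7 are freshly initialized fragments carrying
 * hdr_seq 3, 4 and 7, with sequence 3 expected next:
 */
#if 0
static void ordered_list_demo(mca_pml_ob1_recv_frag_t *f3,
                              mca_pml_ob1_recv_frag_t *f4,
                              mca_pml_ob1_recv_frag_t *f7)
{
    mca_pml_ob1_recv_frag_t *queue = NULL;
    append_frag_to_ordered_list(&queue, f7, 3);  /* queue: [7] */
    append_frag_to_ordered_list(&queue, f3, 3);  /* queue: [3] -> [7] */
    append_frag_to_ordered_list(&queue, f4, 3);  /* 4 joins the range of 3 */
    /* draining the head yields the fragments in sequence order */
    assert(f3 == remove_head_from_ordered_list(&queue));
    assert(f4 == remove_head_from_ordered_list(&queue));
    assert(f7 == remove_head_from_ordered_list(&queue));
}
#endif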

/**
 * Match incoming recv_frags against posted receives.
 * Supports out of order delivery.
 *
 * @param btl (IN)                  BTL the fragment was received on.
 * @param hdr (IN)                  Header of received recv_frag.
 * @param segments (IN)             Received recv_frag descriptor.
 * @param num_segments (IN)         Number of segments in the descriptor.
 * @param type (IN)                 Type of the message header.
 * @return                          OMPI_SUCCESS or error status on failure.
 */
static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl,
                                        mca_pml_ob1_match_hdr_t *hdr,
                                        mca_btl_base_segment_t* segments,
                                        size_t num_segments,
                                        int type );

/**
 * Match incoming frags against posted receives. If frag is not NULL then we
 * assume it is already local and that it can be released upon completion.
 * Supports out of order delivery.
 *
 * @param btl (IN)                  BTL the fragment was received on.
 * @param comm_ptr (IN)             Communicator where the message has been received.
 * @param proc (IN)                 Proc for which we have received the message.
 * @param hdr (IN)                  Header of received recv_frag.
 * @param segments (IN)             Received recv_frag descriptor.
 * @param num_segments (IN)         Number of segments in the descriptor.
 * @param type (IN)                 Type of the message header.
 * @param frag (IN)                 Fragment already allocated for this message, or NULL.
 * @return                          OMPI_SUCCESS or error status on failure.
 */
static int
mca_pml_ob1_recv_frag_match_proc( mca_btl_base_module_t *btl,
                                  ompi_communicator_t* comm_ptr,
                                  mca_pml_ob1_comm_proc_t *proc,
                                  mca_pml_ob1_match_hdr_t *hdr,
                                  mca_btl_base_segment_t* segments,
                                  size_t num_segments,
                                  int type,
                                  mca_pml_ob1_recv_frag_t* frag );

static mca_pml_ob1_recv_request_t*
match_one(mca_btl_base_module_t *btl,
          mca_pml_ob1_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
          size_t num_segments, ompi_communicator_t *comm_ptr,
          mca_pml_ob1_comm_proc_t *proc,
          mca_pml_ob1_recv_frag_t* frag);

mca_pml_ob1_recv_frag_t*
check_cantmatch_for_match(mca_pml_ob1_comm_proc_t *proc)
{
    mca_pml_ob1_recv_frag_t *frag = proc->frags_cant_match;

    if( (NULL != frag) && (frag->hdr.hdr_match.hdr_seq == proc->expected_sequence) ) {
        return remove_head_from_ordered_list(&proc->frags_cant_match);
    }
    return NULL;
}

void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
                                          mca_btl_base_tag_t tag,
                                          mca_btl_base_descriptor_t* des,
                                          void* cbdata )
{
    mca_btl_base_segment_t* segments = des->des_segments;
    mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segments->seg_addr.pval;
    ompi_communicator_t *comm_ptr;
    mca_pml_ob1_recv_request_t *match = NULL;
    mca_pml_ob1_comm_t *comm;
    mca_pml_ob1_comm_proc_t *proc;
    size_t num_segments = des->des_segment_count;
    size_t bytes_received = 0;

    assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS);

    if( OPAL_UNLIKELY(segments->seg_len < OMPI_PML_OB1_MATCH_HDR_LEN) ) {
        return;
    }
    ob1_hdr_ntoh(((mca_pml_ob1_hdr_t*) hdr), MCA_PML_OB1_HDR_TYPE_MATCH);

    /* communicator pointer */
    comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
    if(OPAL_UNLIKELY(NULL == comm_ptr)) {
        /* This is a special case. A message for a not yet existing
         * communicator can happen. Instead of doing a matching we
         * will temporarily add it to a pending queue in the PML.
         * Later on, when the communicator is completely instantiated,
         * this pending queue will be searched and all matching fragments
         * moved to the right communicator.
         */
        append_frag_to_list( &mca_pml_ob1.non_existing_communicator_pending,
                             btl, hdr, segments, num_segments, NULL );
        return;
    }
    comm = (mca_pml_ob1_comm_t *)comm_ptr->c_pml_comm;

    /* source sequence number */
    proc = mca_pml_ob1_peer_lookup (comm_ptr, hdr->hdr_src);

    /* We generate the MSG_ARRIVED event as soon as the PML is aware
     * of a matching fragment arrival, independent of whether it is
     * received in the correct order or not. This will allow the tools
     * to figure out if messages are not received in the correct
     * order (e.g., when multiple network interfaces are in use).
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

    /* get next expected message sequence number - in a threaded
     * run, lock to make sure that if another thread is processing
     * a frag from the same message a match is made only once.
     * Also, this prevents other posted receives (for a pair of
     * end points) from being processed, and potentially "losing"
     * the fragment.
     */
    OB1_MATCHING_LOCK(&comm->matching_lock);

    if (!OMPI_COMM_CHECK_ASSERT_ALLOW_OVERTAKE(comm_ptr)) {
        /* get sequence number of next message that can be processed.
         * If this frag is out of sequence, queue it up in the list
         * now as we still have the lock.
         */
        if(OPAL_UNLIKELY(((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence))) {
            mca_pml_ob1_recv_frag_t* frag;
            MCA_PML_OB1_RECV_FRAG_ALLOC(frag);
            MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl);
            append_frag_to_ordered_list(&proc->frags_cant_match, frag, proc->expected_sequence);
            SPC_RECORD(OMPI_SPC_OUT_OF_SEQUENCE, 1);
            OB1_MATCHING_UNLOCK(&comm->matching_lock);
            return;
        }

        /* We're now expecting the next sequence number. */
        proc->expected_sequence++;
    }

    /* We generate the SEARCH_POSTED_QUEUE only when the message is
     * received in the correct sequence. Otherwise, we delay the event
     * generation until we reach the correct sequence number.
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

    match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, NULL);

    /* The match is over. We generate the SEARCH_POSTED_Q_END here,
     * before going into check_cantmatch_for_match, so we can
     * distinguish the posted-queue search time for each message.
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

    /* release matching lock before processing fragment */
    OB1_MATCHING_UNLOCK(&comm->matching_lock);

    if(OPAL_LIKELY(match)) {
        bytes_received = segments->seg_len - OMPI_PML_OB1_MATCH_HDR_LEN;
        /* We don't need to know the total amount of bytes we just received,
         * but we need to know if there is any data in this message. The
         * simplest way is to get the extra length from the first segment,
         * and then add the number of remaining segments.
         */
        match->req_recv.req_bytes_packed = bytes_received + (num_segments-1);

        MCA_PML_OB1_RECV_REQUEST_MATCHED(match, hdr);
        if(match->req_bytes_expected > 0) {
            struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS];
            uint32_t iov_count = 1;

            /*
             *  Make the user buffer accessible (defined) before unpacking.
             */
            MEMCHECKER(
                       memchecker_call(&opal_memchecker_base_mem_defined,
                                       match->req_recv.req_base.req_addr,
                                       match->req_recv.req_base.req_count,
                                       match->req_recv.req_base.req_datatype);
                       );

            iov[0].iov_len = bytes_received;
            iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments->seg_addr.pval +
                                              OMPI_PML_OB1_MATCH_HDR_LEN);
            while (iov_count < num_segments) {
                bytes_received += segments[iov_count].seg_len;
                iov[iov_count].iov_len = segments[iov_count].seg_len;
                iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments[iov_count].seg_addr.pval);
                iov_count++;
            }
            opal_convertor_unpack( &match->req_recv.req_base.req_convertor,
                                   iov,
                                   &iov_count,
                                   &bytes_received );
            match->req_bytes_received = bytes_received;
            SPC_USER_OR_MPI(match->req_recv.req_base.req_ompi.req_status.MPI_TAG, (ompi_spc_value_t)bytes_received,
                            OMPI_SPC_BYTES_RECEIVED_USER, OMPI_SPC_BYTES_RECEIVED_MPI);
            /*
             *  Unpacking finished, make the user buffer inaccessible again.
             */
            MEMCHECKER(
                       memchecker_call(&opal_memchecker_base_mem_noaccess,
                                       match->req_recv.req_base.req_addr,
                                       match->req_recv.req_base.req_count,
                                       match->req_recv.req_base.req_datatype);
                       );
        }

        /* no need to check if complete, we know we are.. */
        /* don't need an rmb as that is for checking */
        recv_request_pml_complete(match);
    }

    /* We matched the frag. Now see if we already have the next sequence in
     * our OOS list. If yes, try to match it.
     *
     * NOTE:
     * To optimize the number of locks used, mca_pml_ob1_recv_frag_match_proc()
     * MUST be called with the communicator lock held and will RELEASE the
     * lock. This is not ideal but it is better for performance.
     */
    if(NULL != proc->frags_cant_match) {
        mca_pml_ob1_recv_frag_t* frag;

        OB1_MATCHING_LOCK(&comm->matching_lock);
        if((frag = check_cantmatch_for_match(proc))) {
            /* mca_pml_ob1_recv_frag_match_proc() will release the lock. */
            mca_pml_ob1_recv_frag_match_proc(frag->btl, comm_ptr, proc,
                                             &frag->hdr.hdr_match,
                                             frag->segments, frag->num_segments,
                                             frag->hdr.hdr_match.hdr_common.hdr_type, frag);
        } else {
            OB1_MATCHING_UNLOCK(&comm->matching_lock);
        }
    }

    return;
}
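
/*
 * Illustrative sketch (not part of the original source, compiled out): the
 * OMPI_COMM_CHECK_ASSERT_ALLOW_OVERTAKE test above corresponds to the MPI-4
 * "mpi_assert_allow_overtaking" info assertion; when set, the per-peer
 * sequence gate is skipped entirely. A hypothetical user-level setup:
 */
#if 0
#include <mpi.h>

static MPI_Comm make_overtaking_comm(MPI_Comm base)
{
    MPI_Info info;
    MPI_Comm unordered;
    MPI_Info_create(&info);
    /* promise that the application does not depend on message ordering */
    MPI_Info_set(info, "mpi_assert_allow_overtaking", "true");
    MPI_Comm_dup_with_info(base, info, &unordered);
    MPI_Info_free(&info);
    return unordered;
}
#endif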


void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
                                         mca_btl_base_tag_t tag,
                                         mca_btl_base_descriptor_t* des,
                                         void* cbdata )
{
    mca_btl_base_segment_t* segments = des->des_segments;
    mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;

    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
        return;
    }
    ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RNDV);
    mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
                                des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RNDV);
    return;
}

void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
                                         mca_btl_base_tag_t tag,
                                         mca_btl_base_descriptor_t* des,
                                         void* cbdata )
{
    mca_btl_base_segment_t* segments = des->des_segments;
    mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;

    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
        return;
    }
    ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RGET);
    mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
                                des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RGET);
    return;
}


void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
                                        mca_btl_base_tag_t tag,
                                        mca_btl_base_descriptor_t* des,
                                        void* cbdata )
{
    mca_btl_base_segment_t* segments = des->des_segments;
    mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
    mca_pml_ob1_send_request_t* sendreq;
    size_t size;

    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
        return;
    }

    ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_ACK);
    sendreq = (mca_pml_ob1_send_request_t*)hdr->hdr_ack.hdr_src_req.pval;
    sendreq->req_recv = hdr->hdr_ack.hdr_dst_req;

    /* if the request should be delivered entirely by copy in/out
     * then throttle sends */
    if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
        if (NULL != sendreq->rdma_frag) {
            MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag);
            sendreq->rdma_frag = NULL;
        }

        sendreq->req_throttle_sends = true;
    }

    if (hdr->hdr_ack.hdr_send_size) {
        size = hdr->hdr_ack.hdr_send_size;
    } else {
        size = sendreq->req_send.req_bytes_packed - hdr->hdr_ack.hdr_send_offset;
    }

    mca_pml_ob1_send_request_copy_in_out(sendreq, hdr->hdr_ack.hdr_send_offset, size);

    if (sendreq->req_state != 0) {
        /* Typical receipt of an ACK message causes req_state to be
         * decremented. However, a send request that started as an
         * RGET request can become a RNDV. For example, when the
         * receiver determines that its receive buffer is not
         * contiguous and therefore cannot support the RGET
         * protocol. A send request that started with the RGET
         * protocol has req_state == 0 and as such should not be
         * decremented.
         */
        OPAL_THREAD_ADD_FETCH32(&sendreq->req_state, -1);
    }

#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
    if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
        (btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND)) {
        /* The user's buffer is GPU and this BTL can support asynchronous copies,
         * so adjust the convertor accordingly.  All the subsequent fragments will
         * use the asynchronous copy. */
        void *strm = mca_common_cuda_get_dtoh_stream();
        opal_cuda_set_copy_function_async(&sendreq->req_send.req_base.req_convertor, strm);
    }
#endif /* OPAL_CUDA_SUPPORT */

    if(send_request_pml_complete_check(sendreq) == false)
        mca_pml_ob1_send_request_schedule(sendreq);

    return;
}

void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
                                         mca_btl_base_tag_t tag,
                                         mca_btl_base_descriptor_t* des,
                                         void* cbdata ) {
    mca_btl_base_segment_t* segments = des->des_segments;
    mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
    mca_pml_ob1_recv_request_t* recvreq;

    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
        return;
    }

    ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
    recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
    /* If data is destined for a GPU buffer and the convertor was set up for
     * asynchronous copies, then start the copy and return.  The copy completion
     * will trigger the next phase. */
    if (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA_ASYNC) {
        assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV);

        /* This will trigger the opal_convertor_pack to start the asynchronous copy. */
        mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_segment_count,des);

        /* Let the BTL know that it CANNOT free the frag */
        des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;

        return;
    }
#endif /* OPAL_CUDA_SUPPORT */

    mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_segment_count);

    return;
}


void mca_pml_ob1_recv_frag_callback_put(mca_btl_base_module_t* btl,
                                        mca_btl_base_tag_t tag,
                                        mca_btl_base_descriptor_t* des,
                                        void* cbdata ) {
    mca_btl_base_segment_t* segments = des->des_segments;
    mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
    mca_pml_ob1_send_request_t* sendreq;

    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
        return;
    }

    ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_PUT);
    sendreq = (mca_pml_ob1_send_request_t*)hdr->hdr_rdma.hdr_req.pval;
    mca_pml_ob1_send_request_put(sendreq,btl,&hdr->hdr_rdma);

    return;
}


void mca_pml_ob1_recv_frag_callback_fin(mca_btl_base_module_t* btl,
                                        mca_btl_base_tag_t tag,
                                        mca_btl_base_descriptor_t* des,
                                        void* cbdata ) {
    mca_btl_base_segment_t* segments = des->des_segments;
    mca_pml_ob1_fin_hdr_t* hdr = (mca_pml_ob1_fin_hdr_t *) segments->seg_addr.pval;
    mca_pml_ob1_rdma_frag_t *frag;

    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_fin_hdr_t)) ) {
        return;
    }

    ob1_hdr_ntoh((union mca_pml_ob1_hdr_t *)hdr, MCA_PML_OB1_HDR_TYPE_FIN);
    frag = (mca_pml_ob1_rdma_frag_t *) hdr->hdr_frag.pval;
    frag->cbfunc (frag, hdr->hdr_size);
}


#define PML_MAX_SEQ (~((mca_pml_sequence_t)0))

static inline mca_pml_ob1_recv_request_t* get_posted_recv(opal_list_t *queue)
{
    if(opal_list_get_size(queue) == 0)
        return NULL;

    return (mca_pml_ob1_recv_request_t*)opal_list_get_first(queue);
}

static inline mca_pml_ob1_recv_request_t* get_next_posted_recv(
        opal_list_t *queue,
        mca_pml_ob1_recv_request_t* req)
{
    opal_list_item_t *i = opal_list_get_next((opal_list_item_t*)req);

    if(opal_list_get_end(queue) == i)
        return NULL;

    return (mca_pml_ob1_recv_request_t*)i;
}

static mca_pml_ob1_recv_request_t *match_incomming(
        mca_pml_ob1_match_hdr_t *hdr, mca_pml_ob1_comm_t *comm,
        mca_pml_ob1_comm_proc_t *proc)
{
#if !MCA_PML_OB1_CUSTOM_MATCH
    mca_pml_ob1_recv_request_t *specific_recv, *wild_recv;
    mca_pml_sequence_t wild_recv_seq, specific_recv_seq;
    int tag = hdr->hdr_tag;

    specific_recv = get_posted_recv(&proc->specific_receives);
    wild_recv = get_posted_recv(&comm->wild_receives);

    wild_recv_seq = wild_recv ?
        wild_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ;
    specific_recv_seq = specific_recv ?
        specific_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ;

    /* they are equal only if both are PML_MAX_SEQ */
    while(wild_recv_seq != specific_recv_seq) {
        mca_pml_ob1_recv_request_t **match;
        opal_list_t *queue;
        int req_tag;
        mca_pml_sequence_t *seq;

        if (OPAL_UNLIKELY(wild_recv_seq < specific_recv_seq)) {
            match = &wild_recv;
            queue = &comm->wild_receives;
            seq = &wild_recv_seq;
        } else {
            match = &specific_recv;
            queue = &proc->specific_receives;
            seq = &specific_recv_seq;
        }

        req_tag = (*match)->req_recv.req_base.req_tag;
        if(req_tag == tag || (req_tag == OMPI_ANY_TAG && tag >= 0)) {
            opal_list_remove_item(queue, (opal_list_item_t*)(*match));
            PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q,
                    &((*match)->req_recv.req_base), PERUSE_RECV);
            return *match;
        }

        *match = get_next_posted_recv(queue, *match);
        *seq = (*match) ? (*match)->req_recv.req_base.req_sequence : PML_MAX_SEQ;
    }

    return NULL;
#else
    return custom_match_prq_find_dequeue_verify(comm->prq, hdr->hdr_tag, hdr->hdr_src);
#endif
}
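
/*
 * Illustrative note (not part of the original source, compiled out): the loop
 * above merges the per-peer "specific" queue with the communicator-wide
 * "wild" (MPI_ANY_SOURCE) queue by comparing req_sequence, the post-order
 * stamp of each receive, so an incoming message always matches the oldest
 * compatible posted receive. The tag test also keeps negative (internal)
 * tags from ever matching MPI_ANY_TAG. From the user's perspective:
 */
#if 0
#include <mpi.h>

static void matching_order_demo(void)
{
    int a = 0, b = 0;
    MPI_Request reqs[2];
    /* posted first: a tag-5 message from rank 0 must match this receive,
     * because its req_sequence is lower than the wild receive's */
    MPI_Irecv(&a, 1, MPI_INT, 0, 5, MPI_COMM_WORLD, &reqs[0]);
    /* posted second: matches only after the specific receive is consumed */
    MPI_Irecv(&b, 1, MPI_INT, MPI_ANY_SOURCE, 5, MPI_COMM_WORLD, &reqs[1]);
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
}
#endif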

#if !MCA_PML_OB1_CUSTOM_MATCH
static mca_pml_ob1_recv_request_t *match_incomming_no_any_source (
        mca_pml_ob1_match_hdr_t *hdr, mca_pml_ob1_comm_t *comm,
        mca_pml_ob1_comm_proc_t *proc)
{
    mca_pml_ob1_recv_request_t *recv_req;
    int tag = hdr->hdr_tag;

    OPAL_LIST_FOREACH(recv_req, &proc->specific_receives, mca_pml_ob1_recv_request_t) {
        int req_tag = recv_req->req_recv.req_base.req_tag;

        if (req_tag == tag || (req_tag == OMPI_ANY_TAG && tag >= 0)) {
            opal_list_remove_item (&proc->specific_receives, (opal_list_item_t *) recv_req);
            PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q,
                    &(recv_req->req_recv.req_base), PERUSE_RECV);
            return recv_req;
        }
    }

    return NULL;
}
#endif

static mca_pml_ob1_recv_request_t*
match_one(mca_btl_base_module_t *btl,
          mca_pml_ob1_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
          size_t num_segments, ompi_communicator_t *comm_ptr,
          mca_pml_ob1_comm_proc_t *proc,
          mca_pml_ob1_recv_frag_t* frag)
{
#if SPC_ENABLE == 1
    opal_timer_t timer = 0;
#endif
    SPC_TIMER_START(OMPI_SPC_MATCH_TIME, &timer);

    mca_pml_ob1_recv_request_t *match;
    mca_pml_ob1_comm_t *comm = (mca_pml_ob1_comm_t *)comm_ptr->c_pml_comm;

    do {
#if MCA_PML_OB1_CUSTOM_MATCH
        match = match_incomming(hdr, comm, proc);
#else
        if (!OMPI_COMM_CHECK_ASSERT_NO_ANY_SOURCE (comm_ptr)) {
            match = match_incomming(hdr, comm, proc);
        } else {
            match = match_incomming_no_any_source (hdr, comm, proc);
        }
#endif

        /* if a match was found, process the data */
        if(OPAL_LIKELY(NULL != match)) {
            match->req_recv.req_base.req_proc = proc->ompi_proc;

            if(OPAL_UNLIKELY(MCA_PML_REQUEST_PROBE == match->req_recv.req_base.req_type)) {
                /* complete the probe */
                mca_pml_ob1_recv_request_matched_probe(match, btl, segments,
                                                       num_segments);
                /* attempt to match the actual request */
                continue;
            } else if (MCA_PML_REQUEST_MPROBE == match->req_recv.req_base.req_type) {
                /* create a receive frag and associate it with the
                   request, which is then completed so that it can be
                   restarted later during mrecv */
                mca_pml_ob1_recv_frag_t *tmp;
                if(NULL == frag) {
                    MCA_PML_OB1_RECV_FRAG_ALLOC(tmp);
                    MCA_PML_OB1_RECV_FRAG_INIT(tmp, hdr, segments, num_segments, btl);
                } else {
                    tmp = frag;
                }

                match->req_recv.req_base.req_addr = tmp;
                mca_pml_ob1_recv_request_matched_probe(match, btl, segments,
                                                       num_segments);
                /* this frag is already processed, so we want to break out
                   of the loop and not end up back on the unexpected queue. */
                SPC_TIMER_STOP(OMPI_SPC_MATCH_TIME, &timer);
                return NULL;
            }

            PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_MSG_MATCH_POSTED_REQ,
                                    &(match->req_recv.req_base), PERUSE_RECV);
            SPC_TIMER_STOP(OMPI_SPC_MATCH_TIME, &timer);
            return match;
        }

        /* if no match was found, place on the unexpected queue */
#if MCA_PML_OB1_CUSTOM_MATCH
        append_frag_to_umq(comm->umq, btl, hdr, segments,
                            num_segments, frag);
#else
        append_frag_to_list(&proc->unexpected_frags, btl, hdr, segments,
                            num_segments, frag);
#endif
        SPC_RECORD(OMPI_SPC_UNEXPECTED, 1);
        SPC_RECORD(OMPI_SPC_UNEXPECTED_IN_QUEUE, 1);
        SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, OMPI_SPC_UNEXPECTED_IN_QUEUE);
        PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm_ptr,
                               hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
        SPC_TIMER_STOP(OMPI_SPC_MATCH_TIME, &timer);
        return NULL;
    } while(true);
}
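
/*
 * Illustrative sketch (not part of the original source, compiled out): the
 * MCA_PML_REQUEST_MPROBE branch above stashes the matched fragment on the
 * request so that a later MPI_Mrecv consumes exactly that message. The
 * user-level counterpart of that path:
 */
#if 0
#include <mpi.h>

static void mprobe_demo(void)
{
    MPI_Message msg;
    MPI_Status status;
    int payload;
    /* matches one message and removes it from further matching; in ob1
     * this drives the MPROBE branch of match_one() */
    MPI_Mprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &msg, &status);
    /* receives the very message that was matched by the probe */
    MPI_Mrecv(&payload, 1, MPI_INT, &msg, &status);
}
#endif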

/**
 * RTS/CTS receive side matching
 *
 * @param btl (IN)          BTL the fragment was received on.
 * @param hdr (IN)          List of parameters needed for matching. This list
 *                          is also embedded in the fragment, but passing it
 *                          separately saves a memory copy when a match is
 *                          made in this routine.
 * @param segments (IN)     Received fragment descriptor.
 * @param num_segments (IN) Number of segments in the descriptor.
 * @param type (IN)         Type of the message header.
 *
 * @return OMPI error code
 *
 * This routine is used to try to match a newly arrived message fragment
 *   to pre-posted receives.  The following assumptions are made
 *   - fragments are received out of order
 *   - for long messages, e.g. more than one fragment, a RTS/CTS algorithm
 *       is used.
 *   - 2nd and greater fragments include a receive descriptor pointer
 *   - fragments may be dropped
 *   - fragments may be corrupt
 *   - this routine may be called simultaneously by more than one thread
 */
static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl,
                                        mca_pml_ob1_match_hdr_t *hdr,
                                        mca_btl_base_segment_t* segments,
                                        size_t num_segments,
                                        int type)
{
    /* local variables */
    uint16_t frag_msg_seq;
    uint16_t next_msg_seq_expected;
    ompi_communicator_t *comm_ptr;
    mca_pml_ob1_comm_t *comm;
    mca_pml_ob1_comm_proc_t *proc;

    /* communicator pointer */
    comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
    if(OPAL_UNLIKELY(NULL == comm_ptr)) {
        /* This is a special case. A message for a not yet existing
         * communicator can happen. Instead of doing a matching we
         * will temporarily add it to a pending queue in the PML.
         * Later on, when the communicator is completely instantiated,
         * this pending queue will be searched and all matching fragments
         * moved to the right communicator.
         */
        append_frag_to_list( &mca_pml_ob1.non_existing_communicator_pending,
                             btl, hdr, segments, num_segments, NULL );
        return OMPI_SUCCESS;
    }
    comm = (mca_pml_ob1_comm_t *)comm_ptr->c_pml_comm;

    /* source sequence number */
    proc = mca_pml_ob1_peer_lookup (comm_ptr, hdr->hdr_src);

    /* We generate the MSG_ARRIVED event as soon as the PML is aware
     * of a matching fragment arrival, independent of whether it is
     * received in the correct order or not. This will allow the tools
     * to figure out if messages are not received in the correct
     * order (e.g., when multiple network interfaces are in use).
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

    /* get next expected message sequence number - in a threaded
     * run, lock to make sure that if another thread is processing
     * a frag from the same message a match is made only once.
     * Also, this prevents other posted receives (for a pair of
     * end points) from being processed, and potentially "losing"
     * the fragment.
     */
    OB1_MATCHING_LOCK(&comm->matching_lock);

    frag_msg_seq = hdr->hdr_seq;
    next_msg_seq_expected = (uint16_t)proc->expected_sequence;

    if (!OMPI_COMM_CHECK_ASSERT_ALLOW_OVERTAKE(comm_ptr)) {
        /* If the sequence number is wrong, queue it up for later. */
        if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected)) {
            mca_pml_ob1_recv_frag_t* frag;
            MCA_PML_OB1_RECV_FRAG_ALLOC(frag);
            MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl);
            append_frag_to_ordered_list(&proc->frags_cant_match, frag, next_msg_seq_expected);

            SPC_RECORD(OMPI_SPC_OUT_OF_SEQUENCE, 1);
            SPC_RECORD(OMPI_SPC_OOS_IN_QUEUE, 1);
            SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE);

            OB1_MATCHING_UNLOCK(&comm->matching_lock);
            return OMPI_SUCCESS;
        }
    }

    /* mca_pml_ob1_recv_frag_match_proc() will release the lock. */
    return mca_pml_ob1_recv_frag_match_proc(btl, comm_ptr, proc, hdr,
                                            segments, num_segments,
                                            type, NULL);
}
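
/*
 * Illustrative sketch (not part of the original source, compiled out): the
 * tail call above hands the matching lock over to
 * mca_pml_ob1_recv_frag_match_proc(), which drops it as soon as the matching
 * state is consistent. The general shape of this lock-handoff pattern, with
 * hypothetical names:
 */
#if 0
static void callee_releases(opal_mutex_t *lock)
{
    /* ... short critical section: update matching state ... */
    OPAL_THREAD_UNLOCK(lock);   /* released here, not by the caller */
    /* ... long non-critical section: progress the fragment ... */
}

static void caller_acquires(opal_mutex_t *lock)
{
    OPAL_THREAD_LOCK(lock);
    callee_releases(lock);      /* must NOT unlock again afterwards */
}
#endif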


/* mca_pml_ob1_recv_frag_match_proc() will match the given frag and
 * then try to match the next frag in sequence by looking into arrived
 * out-of-order frags in the frags_cant_match list until it can't find one.
 *
 * ATTENTION: THIS FUNCTION MUST BE CALLED WITH THE COMMUNICATOR LOCK HELD.
 * THE LOCK WILL BE RELEASED UPON RETURN. USE WITH CARE. */
static int
mca_pml_ob1_recv_frag_match_proc( mca_btl_base_module_t *btl,
                                  ompi_communicator_t* comm_ptr,
                                  mca_pml_ob1_comm_proc_t *proc,
                                  mca_pml_ob1_match_hdr_t *hdr,
                                  mca_btl_base_segment_t* segments,
                                  size_t num_segments,
                                  int type,
                                  mca_pml_ob1_recv_frag_t* frag )
{
    /* local variables */
    mca_pml_ob1_comm_t* comm = (mca_pml_ob1_comm_t *)comm_ptr->c_pml_comm;
    mca_pml_ob1_recv_request_t *match = NULL;

    /* If we are here, this is the sequence number we were expecting,
     * so we can try matching it to already posted receives.
     */

 match_this_frag:
    /* We're now expecting the next sequence number. */
    /* NOTE: We should have checked for the ALLOW_OVERTAKE comm flag here,
     * but adding a branch in this critical path is not ideal for performance.
     * We decided to let it increment the sequence number even when we are
     * not doing anything with it. */
    proc->expected_sequence++;

    /* We generate the SEARCH_POSTED_QUEUE only when the message is
     * received in the correct sequence. Otherwise, we delay the event
     * generation until we reach the correct sequence number.
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

    match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag);

    /* The match is over. We generate the SEARCH_POSTED_Q_END here,
     * before going into check_cantmatch_for_match, so we can
     * distinguish the posted-queue search time for each message.
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

    /* release matching lock before processing fragment */
    OB1_MATCHING_UNLOCK(&comm->matching_lock);

    if(OPAL_LIKELY(match)) {
        switch(type) {
        case MCA_PML_OB1_HDR_TYPE_MATCH:
            mca_pml_ob1_recv_request_progress_match(match, btl, segments, num_segments);
            break;
        case MCA_PML_OB1_HDR_TYPE_RNDV:
            mca_pml_ob1_recv_request_progress_rndv(match, btl, segments, num_segments);
            break;
        case MCA_PML_OB1_HDR_TYPE_RGET:
            mca_pml_ob1_recv_request_progress_rget(match, btl, segments, num_segments);
            break;
        }

        if(OPAL_UNLIKELY(frag))
            MCA_PML_OB1_RECV_FRAG_RETURN(frag);
    }

    /*
     * Now that a new message has arrived, check to see if
     * any fragments on the frags_cant_match list
     * may now be used to form new matches
     */
    if(OPAL_UNLIKELY(NULL != proc->frags_cant_match)) {
        OB1_MATCHING_LOCK(&comm->matching_lock);
        if((frag = check_cantmatch_for_match(proc))) {
            hdr = &frag->hdr.hdr_match;
            segments = frag->segments;
            num_segments = frag->num_segments;
            btl = frag->btl;
            type = hdr->hdr_common.hdr_type;
            goto match_this_frag;
        }
        OB1_MATCHING_UNLOCK(&comm->matching_lock);
    }

    return OMPI_SUCCESS;
}

