root/ompi/communicator/comm.c


DEFINITIONS

This source file includes the following definitions.
  1. ompi_comm_set
  2. ompi_comm_set_nb
  3. ompi_comm_group
  4. ompi_comm_create
  5. ompi_comm_split
  6. ompi_comm_split_type_get_part
  7. ompi_comm_split_verify
  8. ompi_comm_split_type
  9. ompi_comm_dup
  10. ompi_comm_dup_with_info
  11. ompi_comm_idup
  12. ompi_comm_idup_with_info
  13. ompi_comm_idup_internal
  14. ompi_comm_idup_getcid
  15. ompi_comm_idup_with_info_activate
  16. ompi_comm_idup_with_info_finish
  17. ompi_comm_create_group
  18. ompi_comm_compare
  19. ompi_comm_set_name
  20. ompi_comm_allgather_emulate_intra
  21. ompi_comm_free
  22. ompi_comm_get_rprocs
  23. ompi_comm_overlapping_groups
  24. ompi_comm_determine_first
  25. ompi_comm_dump
  26. rankkeycompare
  27. ompi_comm_enable
  28. ompi_comm_fill_rest
  29. ompi_comm_copy_topo

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2017 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2007-2011 University of Houston. All rights reserved.
  14  * Copyright (c) 2007-2018 Cisco Systems, Inc.  All rights reserved
  15  * Copyright (c) 2009      Sun Microsystems, Inc.  All rights reserved.
  16  * Copyright (c) 2011-2013 Inria.  All rights reserved.
  17  * Copyright (c) 2011-2013 Universite Bordeaux 1
  18  * Copyright (c) 2012      Oak Ridge National Labs.  All rights reserved.
  19  * Copyright (c) 2012-2016 Los Alamos National Security, LLC.
  20  *                         All rights reserved.
  21  * Copyright (c) 2014-2017 Research Organization for Information Science
  22  *                         and Technology (RIST). All rights reserved.
  23  * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
  24  * Copyright (c) 2015      Mellanox Technologies. All rights reserved.
  25  * Copyright (c) 2017      IBM Corporation. All rights reserved.
  26  * $COPYRIGHT$
  27  *
  28  * Additional copyrights may follow
  29  *
  30  * $HEADER$
  31  */
  32 
  33 #include "ompi_config.h"
  34 #include <string.h>
  35 #include <stdio.h>
  36 
  37 #include "ompi/constants.h"
  38 #include "opal/mca/hwloc/base/base.h"
  39 #include "opal/dss/dss.h"
  40 #include "opal/mca/pmix/pmix.h"
  41 #include "opal/util/string_copy.h"
  42 
  43 #include "ompi/proc/proc.h"
  44 #include "opal/threads/mutex.h"
  45 #include "opal/util/bit_ops.h"
  46 #include "opal/util/output.h"
  47 #include "ompi/mca/topo/topo.h"
  48 #include "ompi/mca/topo/base/base.h"
  49 #include "ompi/dpm/dpm.h"
  50 
  51 #include "ompi/attribute/attribute.h"
  52 #include "ompi/communicator/communicator.h"
  53 #include "ompi/mca/pml/pml.h"
  54 #include "ompi/request/request.h"
  55 
  56 /*
  57 ** sort-function for MPI_Comm_split
  58 */
  59 static int rankkeycompare(const void *, const void *);
  60 
  61 /**
  62  * fill in the remaining fields of a communicator (proc pointers, rank, error handler)
  63  */
  64 static int ompi_comm_fill_rest (ompi_communicator_t *comm,
  65                                 int num_procs,
  66                                 ompi_proc_t **proc_pointers,
  67                                 int my_rank,
  68                                 ompi_errhandler_t *errh );
  69 /*
  70 ** typedef for the allgather function required in comm_split.
  71 ** The reason for introducing this abstraction is that for
  72 ** Comm_split on inter-communicators we do not have this
  73 ** function, so we need to emulate it.
  74 */
  75 typedef int ompi_comm_allgatherfct (void* inbuf, int incount, MPI_Datatype intype,
  76                                     void* outbuf, int outcount, MPI_Datatype outtype,
  77                                     ompi_communicator_t *comm,
  78                                     mca_coll_base_module_t *data);
  79 
  80 static int ompi_comm_allgather_emulate_intra (void* inbuf, int incount, MPI_Datatype intype,
  81                                               void* outbuf, int outcount,
  82                                               MPI_Datatype outtype,
  83                                               ompi_communicator_t *comm,
  84                                               mca_coll_base_module_t *data);
  85 
  86 static int ompi_comm_copy_topo (ompi_communicator_t *oldcomm,
  87                                 ompi_communicator_t *newcomm);
  88 
  89 /* idup with local group and info. the local group support is provided to support ompi_comm_set_nb */
  90 static int ompi_comm_idup_internal (ompi_communicator_t *comm, ompi_group_t *group, ompi_group_t *remote_group,
  91                                     opal_info_t *info, ompi_communicator_t **newcomm, ompi_request_t **req);
  92 
  93 
  94 /**********************************************************************/
  95 /**********************************************************************/
  96 /**********************************************************************/
  97 /*
  98  * This is the function that sets all elements of a communicator.
  99  * All other routines are just used to determine these elements.
 100  */
 101 
 102 int ompi_comm_set ( ompi_communicator_t **ncomm,
 103                     ompi_communicator_t *oldcomm,
 104                     int local_size,
 105                     int *local_ranks,
 106                     int remote_size,
 107                     int *remote_ranks,
 108                     opal_hash_table_t *attr,
 109                     ompi_errhandler_t *errh,
 110                     bool copy_topocomponent,
 111                     ompi_group_t *local_group,
 112                     ompi_group_t *remote_group )
 113 {
 114     ompi_request_t *req;
 115     int rc;
 116 
 117     rc = ompi_comm_set_nb (ncomm, oldcomm, local_size, local_ranks, remote_size, remote_ranks,
 118                            attr, errh, copy_topocomponent, local_group, remote_group, &req);
 119     if (OMPI_SUCCESS != rc) {
 120         return rc;
 121     }
 122 
 123     if (NULL != req) {
 124         rc = ompi_request_wait( &req, MPI_STATUS_IGNORE);
 125     }
 126 
 127     return rc;
 128 }
 129 
 130 /*
 131  * if remote_group == &ompi_mpi_group_null, then the new communicator
 132  * is forced to be an inter communicator.
 133  */
 134 int ompi_comm_set_nb ( ompi_communicator_t **ncomm,
 135                        ompi_communicator_t *oldcomm,
 136                        int local_size,
 137                        int *local_ranks,
 138                        int remote_size,
 139                        int *remote_ranks,
 140                        opal_hash_table_t *attr,
 141                        ompi_errhandler_t *errh,
 142                        bool copy_topocomponent,
 143                        ompi_group_t *local_group,
 144                        ompi_group_t *remote_group,
 145                        ompi_request_t **req )
 146 {
 147     ompi_communicator_t *newcomm = NULL;
 148     int ret;
 149 
 150     if (NULL != local_group) {
 151         local_size = ompi_group_size (local_group);
 152     }
 153 
 154     if ( (NULL != remote_group) && (&ompi_mpi_group_null.group != remote_group) ) {
 155         remote_size = ompi_group_size (remote_group);
 156     }
 157 
 158     *req = NULL;
 159 
 160     /* ompi_comm_allocate */
 161     newcomm = OBJ_NEW(ompi_communicator_t);
 162     if (NULL == newcomm) {
 163         return OMPI_ERR_OUT_OF_RESOURCE;
 164     }
 165     newcomm->super.s_info = NULL;
 166     /* fill in the inscribing hyper-cube dimensions */
 167     newcomm->c_cube_dim = opal_cube_dim(local_size);
 168     newcomm->c_id_available   = MPI_UNDEFINED;
 169     newcomm->c_id_start_index = MPI_UNDEFINED;
 170 
 171     if (NULL == local_group) {
 172         /* determine how the list of local_rank can be stored most
 173            efficiently */
 174         ret = ompi_group_incl(oldcomm->c_local_group, local_size,
 175                               local_ranks, &newcomm->c_local_group);
 176         if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 177             return ret;
 178         }
 179     } else {
 180         newcomm->c_local_group = local_group;
 181         OBJ_RETAIN(newcomm->c_local_group);
 182     }
 183     newcomm->c_my_rank = newcomm->c_local_group->grp_my_rank;
 184 
 185     /* Set remote group and duplicate the local comm, if applicable */
 186     if ( NULL != remote_group ) {
 187         ompi_communicator_t *old_localcomm;
 188 
 189         if (&ompi_mpi_group_null.group == remote_group) {
 190             ret = ompi_group_incl(oldcomm->c_remote_group, remote_size,
 191                                   remote_ranks, &newcomm->c_remote_group);
 192             if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 193                 return ret;
 194             }
 195         } else {
 196             newcomm->c_remote_group = remote_group;
 197             OBJ_RETAIN(newcomm->c_remote_group);
 198         }
 199 
 200         newcomm->c_flags |= OMPI_COMM_INTER;
 201 
 202         old_localcomm = OMPI_COMM_IS_INTRA(oldcomm) ? oldcomm : oldcomm->c_local_comm;
 203 
 204         /* NTH: use internal idup function that takes a local group argument */
 205         ompi_comm_idup_internal (old_localcomm, newcomm->c_local_group, NULL, NULL,
 206                                  &newcomm->c_local_comm, req);
 207     } else {
 208         newcomm->c_remote_group = newcomm->c_local_group;
 209         OBJ_RETAIN(newcomm->c_remote_group);
 210     }
 211 
 212     /* Check how many different jobids are represented in this communicator.
 213        Necessary for the disconnect of dynamic communicators. */
 214 
 215     if ( 0 < local_size && (OMPI_COMM_IS_INTRA(newcomm) || 0 < remote_size) ) {
 216         ompi_dpm_mark_dyncomm (newcomm);
 217     }
 218 
 219     /* Set error handler */
 220     newcomm->error_handler = errh;
 221     OBJ_RETAIN ( newcomm->error_handler );
 222 
 223     /* Set Topology, if required and if available */
 224     if ( copy_topocomponent && (NULL != oldcomm->c_topo) ) {
 225         /**
 226          * The MPI standard is pretty clear on this: the topology information
 227          * behaves like info keys, and is copied only on MPI_Comm_dup.
 228          */
 229         if (OMPI_SUCCESS != (ret = ompi_comm_copy_topo(oldcomm, newcomm))) {
 230             ompi_comm_free(&newcomm);
 231             return ret;
 232         }
 233     }
 234 
 235     /* Copy attributes and call according copy functions, if required */
 236     if (NULL != oldcomm->c_keyhash) {
 237         if (NULL != attr) {
 238             ompi_attr_hash_init(&newcomm->c_keyhash);
 239             if (OMPI_SUCCESS != (ret = ompi_attr_copy_all (COMM_ATTR, oldcomm,
 240                                                            newcomm, attr,
 241                                                            newcomm->c_keyhash))) {
 242                 ompi_comm_free(&newcomm);
 243                 return ret;
 244             }
 245         }
 246     }
 247 
 248     *ncomm = newcomm;
 249     return (OMPI_SUCCESS);
 250 }
 251 
 252 
 253 /**********************************************************************/
 254 /**********************************************************************/
 255 /**********************************************************************/
 256 /*
 257 ** Counterpart to MPI_Comm_group. To be used within OMPI functions.
 258 */
 259 int ompi_comm_group ( ompi_communicator_t* comm, ompi_group_t **group )
 260 {
 261     /* increment reference counters for the group */
 262     OBJ_RETAIN(comm->c_local_group);
 263 
 264     *group = comm->c_local_group;
 265     return OMPI_SUCCESS;
 266 }
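
For reference, this routine is the internal counterpart of MPI_Comm_group. A minimal user-level sketch (illustrative function and variable names, not part of this file) pairs the call with a matching free of the group handle:

/* Hedged illustration: obtain and release the local group of a communicator. */
#include <mpi.h>

static int example_comm_group(MPI_Comm comm)
{
    MPI_Group grp;
    int rc = MPI_Comm_group(comm, &grp);   /* takes a reference on the local group */
    if (MPI_SUCCESS != rc) {
        return rc;
    }
    /* ... use the group, e.g. with MPI_Group_incl ... */
    return MPI_Group_free(&grp);           /* drops the reference taken above */
}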
 267 
 268 /**********************************************************************/
 269 /**********************************************************************/
 270 /**********************************************************************/
 271 /*
 272 ** Counterpart to MPI_Comm_create. To be used within OMPI.
 273 */
 274 int ompi_comm_create ( ompi_communicator_t *comm, ompi_group_t *group,
 275                        ompi_communicator_t **newcomm )
 276 {
 277     ompi_communicator_t *newcomp = NULL;
 278     int rsize;
 279     int mode,i,j;
 280     int *allranks=NULL;
 281     int *rranks=NULL;
 282     int rc = OMPI_SUCCESS;
 283     ompi_group_t *remote_group = NULL;
 284 
 285     /* silence clang warning. newcomm should never be NULL */
 286     if (OPAL_UNLIKELY(NULL == newcomm)) {
 287         return OMPI_ERR_BAD_PARAM;
 288     }
 289 
 290     if ( OMPI_COMM_IS_INTER(comm) ) {
 291         int tsize;
 292         remote_group = &ompi_mpi_group_null.group;
 293 
 294         tsize = ompi_comm_remote_size(comm);
 295         allranks = (int *) malloc ( tsize * sizeof(int));
 296         if ( NULL == allranks ) {
 297             rc = OMPI_ERR_OUT_OF_RESOURCE;
 298             goto exit;
 299         }
 300 
 301         rc = comm->c_coll->coll_allgather ( &(group->grp_my_rank),
 302                                            1, MPI_INT, allranks,
 303                                            1, MPI_INT, comm,
 304                                            comm->c_coll->coll_allgather_module);
 305         if ( OMPI_SUCCESS != rc ) {
 306             goto exit;
 307         }
 308 
 309         /* Count number of procs in future remote group */
 310         for (rsize=0, i = 0; i < tsize; i++) {
 311             if ( MPI_UNDEFINED != allranks[i] ) {
 312                 rsize++;
 313             }
 314         }
 315 
 316         /* If any of those groups is empty, we have to return
 317            MPI_COMM_NULL */
 318         if ( 0 == rsize || 0 == group->grp_proc_count ) {
 319             newcomp = MPI_COMM_NULL;
 320             rc = OMPI_SUCCESS;
 321             goto exit;
 322         }
 323 
 324         /* Set proc-pointers for remote group */
 325         rranks = (int *) malloc ( rsize * sizeof(int));
 326         if ( NULL == rranks ) {
 327             rc = OMPI_ERR_OUT_OF_RESOURCE;
 328             goto exit;
 329         }
 330 
 331         for ( j = 0, i = 0; i < tsize; i++ ) {
 332             if ( MPI_UNDEFINED != allranks[i] ) {
 333                 rranks[j] = i;
 334                 j++;
 335             }
 336         }
 337         mode = OMPI_COMM_CID_INTER;
 338 
 339     } else {
 340         rsize  = 0;
 341         rranks = NULL;
 342         mode   = OMPI_COMM_CID_INTRA;
 343     }
 344 
 345     rc = ompi_comm_set ( &newcomp,                 /* new comm */
 346                          comm,                     /* old comm */
 347                          0,                        /* local array size */
 348                          NULL,                     /* local_ranks */
 349                          rsize,                    /* remote_size */
 350                          rranks,                   /* remote_ranks */
 351                          NULL,                     /* attrs */
 352                          comm->error_handler,      /* error handler */
 353                          false,                    /* dont copy the topo */
 354                          group,                    /* local group */
 355                          remote_group);            /* remote group */
 356 
 357     if ( OMPI_SUCCESS != rc ) {
 358         goto exit;
 359     }
 360 
 361     /* Determine context id. It is identical to f_2_c_handle */
 362     rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode);
 363     if ( OMPI_SUCCESS != rc ) {
 364         goto exit;
 365     }
 366 
 367     /* Set name for debugging purposes */
 368     snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d CREATE FROM %d",
 369              newcomp->c_contextid, comm->c_contextid );
 370 
 371     /* Activate the communicator and init coll-component */
 372     rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
 373     if ( OMPI_SUCCESS != rc ) {
 374         goto exit;
 375     }
 376 
 377 
 378     /* Check whether we are part of the new comm.
 379        If not, we have to free the structure again.
 380        However, we cannot avoid the comm_nextcid step, since
 381        all processes of the original comm have to participate in
 382        that function call. Additionally, the error handler and
 383        related fields have to be set to keep ompi_comm_free happy. */
 384     if ( MPI_UNDEFINED == newcomp->c_local_group->grp_my_rank ) {
 385         ompi_comm_free ( &newcomp );
 386     }
 387 
 388  exit:
 389     if ( NULL != allranks ) {
 390         free ( allranks );
 391     }
 392     if ( NULL != rranks ) {
 393         free ( rranks );
 394     }
 395 
 396     *newcomm = newcomp;
 397     return ( rc );
 398 }
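
At the MPI level, ompi_comm_create is reached through MPI_Comm_create. A hedged user-level sketch (illustrative helper and variable names) that builds a communicator over roughly the first half of MPI_COMM_WORLD:

/* Hedged illustration: every process of MPI_COMM_WORLD must make this call;
 * processes outside the group receive MPI_COMM_NULL, matching the
 * MPI_UNDEFINED handling in ompi_comm_create above. */
#include <mpi.h>

static int example_comm_create(MPI_Comm *halfcomm)
{
    MPI_Group world_grp, half_grp;
    int size, rc;

    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_group(MPI_COMM_WORLD, &world_grp);

    int half = (size + 1) / 2;
    int range[1][3] = { { 0, half - 1, 1 } };   /* ranks 0 .. half-1, stride 1 */
    MPI_Group_range_incl(world_grp, 1, range, &half_grp);

    rc = MPI_Comm_create(MPI_COMM_WORLD, half_grp, halfcomm);

    MPI_Group_free(&half_grp);
    MPI_Group_free(&world_grp);
    return rc;
}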
 399 
 400 
 401 /**********************************************************************/
 402 /**********************************************************************/
 403 /**********************************************************************/
 404 /*
 405 ** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub).
 406 */
 407 int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
 408                      ompi_communicator_t **newcomm, bool pass_on_topo )
 409 {
 410     int myinfo[2];
 411     int size, my_size;
 412     int my_rsize=0;
 413     int mode;
 414     int rsize;
 415     int i, loc;
 416     int inter;
 417     int *results=NULL, *sorted=NULL;
 418     int *rresults=NULL, *rsorted=NULL;
 419     int rc=OMPI_SUCCESS;
 420     ompi_communicator_t *newcomp = NULL;
 421     int *lranks=NULL, *rranks=NULL;
 422     ompi_group_t * local_group=NULL, *remote_group=NULL;
 423 
 424     ompi_comm_allgatherfct *allgatherfct=NULL;
 425 
 426     /* Step 1: determine all the information for the local group */
 427     /* --------------------------------------------------------- */
 428 
 429     /* sort according to color and rank. Gather information from everyone */
 430     myinfo[0] = color;
 431     myinfo[1] = key;
 432 
 433     size     = ompi_comm_size ( comm );
 434     inter    = OMPI_COMM_IS_INTER(comm);
 435     if ( inter ) {
 436         allgatherfct = (ompi_comm_allgatherfct *)ompi_comm_allgather_emulate_intra;
 437     } else {
 438         allgatherfct = (ompi_comm_allgatherfct *)comm->c_coll->coll_allgather;
 439     }
 440 
 441     results  = (int*) malloc ( 2 * size * sizeof(int));
 442     if ( NULL == results ) {
 443         return OMPI_ERR_OUT_OF_RESOURCE;
 444     }
 445 
 446     rc = allgatherfct( myinfo, 2, MPI_INT, results, 2, MPI_INT, comm, comm->c_coll->coll_allgather_module );
 447     if ( OMPI_SUCCESS != rc ) {
 448         goto exit;
 449     }
 450 
 451     /* how many have the same color as me */
 452     for ( my_size = 0, i=0; i < size; i++) {
 453         if ( results[(2*i)+0] == color) {
 454             my_size++;
 455         }
 456     }
 457 
 458     /* silence clang warning. my_size should never be 0 here */
 459     if (OPAL_UNLIKELY(0 == my_size)) {
 460         rc = OMPI_ERR_BAD_PARAM;
 461         goto exit;
 462     }
 463 
 464     sorted = (int *) calloc (my_size * 2, sizeof (int));
 465     if ( NULL == sorted) {
 466         rc =  OMPI_ERR_OUT_OF_RESOURCE;
 467         goto exit;
 468     }
 469 
 470     /* ok we can now fill this info */
 471     for( loc = 0, i = 0; i < size; i++ ) {
 472         if ( results[(2*i)+0] == color) {
 473             sorted[(2*loc)+0] = i;                 /* copy org rank */
 474             sorted[(2*loc)+1] = results[(2*i)+1];  /* copy key */
 475             loc++;
 476         }
 477     }
 478 
 479     /* the new array needs to be sorted so that it is in 'key' order */
 480     /* if two keys are equal then it is sorted in original rank order! */
 481     if(my_size>1){
 482         qsort ((int*)sorted, my_size, sizeof(int)*2, rankkeycompare);
 483     }
 484 
 485     /* put group elements in a list */
 486     lranks = (int *) malloc ( my_size * sizeof(int));
 487     if ( NULL == lranks ) {
 488         rc = OMPI_ERR_OUT_OF_RESOURCE;
 489         goto exit;
 490     }
 491     for (i = 0; i < my_size; i++) {
 492         lranks[i] = sorted[i*2];
 493     }
 494 
 495     /* Step 2: determine all the information for the remote group */
 496     /* --------------------------------------------------------- */
 497     if ( inter ) {
 498         remote_group = &ompi_mpi_group_null.group;
 499         rsize    = comm->c_remote_group->grp_proc_count;
 500         rresults = (int *) malloc ( rsize * 2 * sizeof(int));
 501         if ( NULL == rresults ) {
 502             rc = OMPI_ERR_OUT_OF_RESOURCE;
 503             goto exit;
 504         }
 505 
 506         /* this is an allgather on an inter-communicator */
 507         rc = comm->c_coll->coll_allgather( myinfo, 2, MPI_INT, rresults, 2,
 508                                           MPI_INT, comm,
 509                                           comm->c_coll->coll_allgather_module);
 510         if ( OMPI_SUCCESS != rc ) {
 511             goto exit;
 512         }
 513 
 514         /* how many have the same color as me */
 515         for ( my_rsize = 0, i=0; i < rsize; i++) {
 516             if ( rresults[(2*i)+0] == color) {
 517                 my_rsize++;
 518             }
 519         }
 520 
 521         if (my_rsize > 0) {
 522             rsorted = (int *) calloc (my_rsize * 2, sizeof (int));
 523             if ( NULL == rsorted) {
 524                 rc = OMPI_ERR_OUT_OF_RESOURCE;
 525                 goto exit;
 526             }
 527 
 528             /* ok we can now fill this info */
 529             for( loc = 0, i = 0; i < rsize; i++ ) {
 530                 if ( rresults[(2*i)+0] == color) {
 531                     rsorted[(2*loc)+0] = i;                  /* org rank */
 532                     rsorted[(2*loc)+1] = rresults[(2*i)+1];  /* key */
 533                     loc++;
 534                 }
 535             }
 536 
 537             /* the new array needs to be sorted so that it is in 'key' order */
 538             /* if two keys are equal then it is sorted in original rank order! */
 539             if (my_rsize > 1) {
 540                 qsort ((int*)rsorted, my_rsize, sizeof(int)*2, rankkeycompare);
 541             }
 542 
 543             /* put group elements in a list */
 544             rranks = (int *) malloc ( my_rsize * sizeof(int));
 545             if ( NULL ==  rranks) {
 546                 rc = OMPI_ERR_OUT_OF_RESOURCE;
 547                 goto exit;
 548             }
 549 
 550             for (i = 0; i < my_rsize; i++) {
 551                 rranks[i] = rsorted[i*2];
 552             }
 553         }
 554 
 555         rc = ompi_group_incl(comm->c_local_group, my_size, lranks, &local_group);
 556         if (OMPI_SUCCESS != rc) {
 557             goto exit;
 558         }
 559 
 560         mode = OMPI_COMM_CID_INTER;
 561     } else {
 562         rranks = NULL;
 563         mode      = OMPI_COMM_CID_INTRA;
 564     }
 565 
 566     /* Step 3: set up the communicator                           */
 567     /* --------------------------------------------------------- */
 568     /* Create the communicator finally */
 569 
 570     rc = ompi_comm_set ( &newcomp,           /* new comm */
 571                          comm,               /* old comm */
 572                          my_size,            /* local_size */
 573                          lranks,             /* local_ranks */
 574                          my_rsize,           /* remote_size */
 575                          rranks,             /* remote_ranks */
 576                          NULL,               /* attrs */
 577                          comm->error_handler,/* error handler */
 578                          pass_on_topo,
 579                          local_group,       /* local group */
 580                          remote_group);     /* remote group */
 581 
 582     if ( OMPI_SUCCESS != rc  ) {
 583         goto exit;
 584     }
 585 
 586     if ( inter ) {
 587         OBJ_RELEASE(local_group);
 588         if (NULL != newcomp->c_local_comm) {
 589             snprintf(newcomp->c_local_comm->c_name, MPI_MAX_OBJECT_NAME,
 590                      "MPI COMMUNICATOR %d SPLIT FROM %d",
 591                      newcomp->c_local_comm->c_contextid,
 592                      comm->c_local_comm->c_contextid );
 593         }
 594     }
 595 
 596     /* set the rank to MPI_UNDEFINED. This prevents this process from interfering
 597      * in ompi_comm_nextcid() and the collective module selection in ompi_comm_activate()
 598      * for a communicator that will be freed anyway.
 599      */
 600     if ( MPI_UNDEFINED == color || (inter && my_rsize==0)) {
 601         newcomp->c_local_group->grp_my_rank = MPI_UNDEFINED;
 602     }
 603 
 604     /* Determine context id. It is identical to f_2_c_handle */
 605     rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode);
 606     if ( OMPI_SUCCESS != rc ) {
 607         goto exit;
 608     }
 609 
 610     /* Set name for debugging purposes */
 611     snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d SPLIT FROM %d",
 612              newcomp->c_contextid, comm->c_contextid );
 613 
 614 
 615 
 616     /* Activate the communicator and init coll-component */
 617     rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
 618 
 619  exit:
 620     free ( results );
 621     free ( sorted );
 622     free ( rresults );
 623     free ( rsorted );
 624     free ( lranks );
 625     free ( rranks );
 626 
 627     /* Step 4: if we are not part of the comm, free the struct   */
 628     /* --------------------------------------------------------- */
 629     if (inter && my_rsize == 0) {
 630         color = MPI_UNDEFINED;
 631     }
 632     if ( NULL != newcomp && MPI_UNDEFINED == color ) {
 633         ompi_comm_free ( &newcomp );
 634     }
 635 
 636     *newcomm = newcomp;
 637     return rc;
 638 }
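
The qsort calls above rely on rankkeycompare, which is defined further down in this file. According to the comments, it orders the (original rank, key) pairs by key and breaks ties by original rank; a minimal comparator consistent with that description (an illustrative sketch, not the actual definition) would be:

/* Hedged sketch of a key-then-rank comparator for the {rank, key} pairs
 * laid out by ompi_comm_split above. */
static int example_rankkeycompare(const void *p, const void *q)
{
    const int *a = (const int *) p;   /* a[0] = original rank, a[1] = key */
    const int *b = (const int *) q;

    if (a[1] < b[1]) return -1;       /* primary order: key */
    if (a[1] > b[1]) return  1;
    if (a[0] < b[0]) return -1;       /* tie-break: original rank */
    if (a[0] > b[0]) return  1;
    return 0;
}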
 639 
 640 
 641 /**********************************************************************/
 642 /**********************************************************************/
 643 /**********************************************************************/
 644 /*
 645  * Produces an array of ranks that will be part of the local/remote group in the
 646  * new communicator. The returned rank array is allocated here and must be freed by the caller.
 647  */
 648 static int ompi_comm_split_type_get_part (ompi_group_t *group, const int split_type, int **ranks_out, int *rank_size) {
 649     int size = ompi_group_size (group);
 650     int my_size = 0;
 651     int *ranks;
 652     int ret;
 653 
 654     ranks = malloc (size * sizeof (int));
 655     if (OPAL_UNLIKELY(NULL == ranks)) {
 656         return OMPI_ERR_OUT_OF_RESOURCE;
 657     }
 658 
 659     for (int i = 0 ; i < size ; ++i) {
 660         ompi_proc_t *proc = ompi_group_get_proc_ptr_raw (group, i);
 661         uint16_t locality, *u16ptr;
 662         int include = false;
 663 
 664         if (ompi_proc_is_sentinel (proc)) {
 665             opal_process_name_t proc_name = ompi_proc_sentinel_to_name ((uintptr_t) proc);
 666 
 667             if (split_type <= OMPI_COMM_TYPE_HOST) {
 668                 /* local ranks should never be represented by sentinel procs. ideally we
 669                  * should be able to use OPAL_MODEX_RECV_VALUE_OPTIONAL but it does have
 670                  * some overhead. update this to use the optional recv if that is ever fixed. */
 671                 continue;
 672             }
 673 
 674             u16ptr = &locality;
 675 
 676             OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc_name, &u16ptr, OPAL_UINT16);
 677             if (OPAL_SUCCESS != ret) {
 678                 continue;
 679             }
 680         } else {
 681             locality = proc->super.proc_flags;
 682         }
 683 
 684         switch (split_type) {
 685         case OMPI_COMM_TYPE_HWTHREAD:
 686             include = OPAL_PROC_ON_LOCAL_HWTHREAD(locality);
 687             break;
 688         case OMPI_COMM_TYPE_CORE:
 689             include = OPAL_PROC_ON_LOCAL_CORE(locality);
 690             break;
 691         case OMPI_COMM_TYPE_L1CACHE:
 692             include = OPAL_PROC_ON_LOCAL_L1CACHE(locality);
 693             break;
 694         case OMPI_COMM_TYPE_L2CACHE:
 695             include = OPAL_PROC_ON_LOCAL_L2CACHE(locality);
 696             break;
 697         case OMPI_COMM_TYPE_L3CACHE:
 698             include = OPAL_PROC_ON_LOCAL_L3CACHE(locality);
 699             break;
 700         case OMPI_COMM_TYPE_SOCKET:
 701             include = OPAL_PROC_ON_LOCAL_SOCKET(locality);
 702             break;
 703         case OMPI_COMM_TYPE_NUMA:
 704             include = OPAL_PROC_ON_LOCAL_NUMA(locality);
 705             break;
 706         case MPI_COMM_TYPE_SHARED:
 707             include = OPAL_PROC_ON_LOCAL_NODE(locality);
 708             break;
 709         case OMPI_COMM_TYPE_BOARD:
 710             include = OPAL_PROC_ON_LOCAL_BOARD(locality);
 711             break;
 712         case OMPI_COMM_TYPE_HOST:
 713             include = OPAL_PROC_ON_LOCAL_HOST(locality);
 714             break;
 715         case OMPI_COMM_TYPE_CU:
 716             include = OPAL_PROC_ON_LOCAL_CU(locality);
 717             break;
 718         case OMPI_COMM_TYPE_CLUSTER:
 719             include = OPAL_PROC_ON_LOCAL_CLUSTER(locality);
 720             break;
 721         }
 722 
 723         if (include) {
 724             ranks[my_size++] = i;
 725         }
 726     }
 727 
 728     *rank_size = my_size;
 729 
 730     /* silence a clang warning about a 0-byte malloc. my_size will never be 0 here */
 731     if (OPAL_UNLIKELY(0 == my_size)) {
 732         free (ranks);
 733         return OMPI_SUCCESS;
 734     }
 735 
 736     /* shrink the rank array */
 737     int *tmp = realloc (ranks, my_size * sizeof (int));
 738     if (OPAL_LIKELY(NULL != tmp)) {
 739         ranks = tmp;
 740     }
 741 
 742     *ranks_out = ranks;
 743 
 744     return OMPI_SUCCESS;
 745 }
 746 
 747 static int ompi_comm_split_verify (ompi_communicator_t *comm, int split_type, int key, bool *need_split)
 748 {
 749     int rank = ompi_comm_rank (comm);
 750     int size = ompi_comm_size (comm);
 751     int *results;
 752     int rc;
 753 
 754     if (*need_split) {
 755         return OMPI_SUCCESS;
 756     }
 757 
 758     results = malloc (2 * sizeof (int) * size);
 759     if (OPAL_UNLIKELY(NULL == results)) {
 760         return OMPI_ERR_OUT_OF_RESOURCE;
 761     }
 762 
 763     *need_split = false;
 764 
 765     results[rank * 2] = split_type;
 766     results[rank * 2 + 1] = key;
 767 
 768     rc = comm->c_coll->coll_allgather (MPI_IN_PLACE, 2, MPI_INT, results, 2, MPI_INT, comm,
 769                                       comm->c_coll->coll_allgather_module);
 770     if (OMPI_SUCCESS != rc) {
 771         free (results);
 772         return rc;
 773     }
 774 
 775     for (int i = 0 ; i < size ; ++i) {
 776         if (MPI_UNDEFINED == results[i * 2] || (i > 1 && results[i * 2 + 1] < results[i * 2 - 1])) {
 777             *need_split = true;
 778             break;
 779         }
 780     }
 781 
 782     free (results);
 783 
 784     return OMPI_SUCCESS;
 785 }
 786 
 787 int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
 788                           opal_info_t *info, ompi_communicator_t **newcomm)
 789 {
 790     bool need_split = false, no_reorder = false, no_undefined = false;
 791     ompi_communicator_t *newcomp = MPI_COMM_NULL;
 792     int my_size, my_rsize = 0, mode, inter;
 793     int *lranks = NULL, *rranks = NULL;
 794     int global_split_type, ok, tmp[4];
 795     int rc;
 796 
 797     /* silence clang warning. newcomm should never be NULL */
 798     if (OPAL_UNLIKELY(NULL == newcomm)) {
 799         return OMPI_ERR_BAD_PARAM;
 800     }
 801 
 802     inter = OMPI_COMM_IS_INTER(comm);
 803 
 804     /* Step 1: verify all ranks have supplied the same value for split type. All split types
 805      * must be the same or MPI_UNDEFINED (which is negative). */
 806     tmp[0] = split_type;
 807     tmp[1] = -split_type;
 808     tmp[2] = key;
 809     tmp[3] = -key;
 810 
 811     rc = comm->c_coll->coll_allreduce (MPI_IN_PLACE, &tmp, 4, MPI_INT, MPI_MAX, comm,
 812                                       comm->c_coll->coll_allreduce_module);
 813     if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
 814         return rc;
 815     }
 816 
 817     global_split_type = tmp[0];
 818 
 819     if (tmp[0] != -tmp[1] || inter) {
 820         /* at least one rank supplied a different split type; check if our split_type is ok */
 821         ok = (MPI_UNDEFINED == split_type) || global_split_type == split_type;
 822 
 823         rc = comm->c_coll->coll_allreduce (MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, comm,
 824                                           comm->c_coll->coll_allreduce_module);
 825         if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
 826             return rc;
 827         }
 828 
 829         if (inter) {
 830             /* need an extra allreduce to ensure that all ranks have the same result */
 831             rc = comm->c_coll->coll_allreduce (MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, comm,
 832                                               comm->c_coll->coll_allreduce_module);
 833             if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
 834                 return rc;
 835             }
 836         }
 837 
 838         if (OPAL_UNLIKELY(!ok)) {
 839             return OMPI_ERR_BAD_PARAM;
 840         }
 841 
 842         need_split = tmp[0] == -tmp[1];
 843     } else {
 844         /* intracommunicator and all ranks specified the same split type */
 845         no_undefined = true;
 846         /* check if all ranks specified the same key */
 847         no_reorder = tmp[2] == -tmp[3];
 848     }
 849 
 850     if (MPI_UNDEFINED == global_split_type) {
 851         /* short-circuit: every rank provided MPI_UNDEFINED */
 852         *newcomm = MPI_COMM_NULL;
 853         return OMPI_SUCCESS;
 854     }
 855 
 856     /* Step 2: Build potential communicator groups. If any ranks will not be part of
 857      * the ultimate communicator we will drop them later. This saves doing an extra
 858      * allgather on the whole communicator. By using ompi_comm_split() later only
 859      * if needed we 1) optimize the common case (no MPI_UNDEFINED and no reorder),
 860      * and 2) limit the allgather to a smaller set of peers in the uncommon case. */
 861     /* --------------------------------------------------------- */
 862 
 863     /* allowed splitting types:
 864        CLUSTER
 865        CU
 866        HOST
 867        BOARD
 868        NODE
 869        NUMA
 870        SOCKET
 871        L3CACHE
 872        L2CACHE
 873        L1CACHE
 874        CORE
 875        HWTHREAD
 876        Even though HWTHREAD/CORE etc. are overkill, they are here for consistency.
 877        They will most likely return a communicator equivalent to MPI_COMM_SELF,
 878        unless the node is oversubscribed.
 879     */
 880 
 881     /* how many ranks are potentially participating and on my node? */
 882     rc = ompi_comm_split_type_get_part (comm->c_local_group, global_split_type, &lranks, &my_size);
 883     if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
 884         return rc;
 885     }
 886 
 887     /* Step 3: determine all the information for the remote group */
 888     /* --------------------------------------------------------- */
 889     if (inter) {
 890         rc = ompi_comm_split_type_get_part (comm->c_remote_group, global_split_type, &rranks, &my_rsize);
 891         if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
 892             free (lranks);
 893             return rc;
 894         }
 895     }
 896 
 897     /* set the CID allgather mode to the appropriate one for the communicator */
 898     mode = inter ? OMPI_COMM_CID_INTER : OMPI_COMM_CID_INTRA;
 899 
 900     /* Step 4: set up the communicator                           */
 901     /* --------------------------------------------------------- */
 902     /* Create the communicator finally */
 903 
 904     do {
 905         rc = ompi_comm_set (&newcomp, comm, my_size, lranks, my_rsize,
 906                             rranks, NULL, comm->error_handler, false,
 907                             NULL, NULL);
 908         if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
 909             break;
 910         }
 911 
 912         /* Determine context id. It is identical to f_2_c_handle */
 913         rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode);
 914         if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
 915             break;
 916         }
 917 
 918         // Copy info if there is one.
 919         newcomp->super.s_info = OBJ_NEW(opal_info_t);
 920         if (info) {
 921             opal_info_dup(info, &(newcomp->super.s_info));
 922         }
 923 
 924         /* Activate the communicator and init coll-component */
 925         rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
 926         if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
 927             break;
 928         }
 929 
 930         /* Step 5: Check if we need to remove or reorder ranks in the communicator */
 931         if (!(no_reorder && no_undefined)) {
 932             rc = ompi_comm_split_verify (newcomp, split_type, key, &need_split);
 933 
 934             if (inter) {
 935                 /* verify that no local ranks need to be removed or reordered */
 936                 rc = ompi_comm_split_verify (newcomp->c_local_comm, split_type, key, &need_split);
 937             }
 938         }
 939 
 940         if (!need_split) {
 941             /* common case. no reordering and no MPI_UNDEFINED */
 942             *newcomm = newcomp;
 943 
 944             /* Set name for debugging purposes */
 945             snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d SPLIT_TYPE FROM %d",
 946                      newcomp->c_contextid, comm->c_contextid );
 947             break;
 948         }
 949 
 950         /* TODO: there is probably a better way to handle this case without throwing away the
 951          * intermediate communicator. */
 952         rc = ompi_comm_split (newcomp, split_type, key, newcomm, false);
 953         /* get rid of the intermediate communicator */
 954         ompi_comm_free (&newcomp);
 955     } while (0);
 956 
 957     if (OPAL_UNLIKELY(OMPI_SUCCESS != rc && MPI_COMM_NULL != newcomp)) {
 958         ompi_comm_free (&newcomp);
 959         *newcomm = MPI_COMM_NULL;
 960     }
 961 
 962     free (lranks);
 963     free (rranks);
 964 
 965     return rc;
 966 }
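
At the MPI level this presumably backs MPI_Comm_split_type. A hedged usage sketch (illustrative names) for the common case of splitting by shared-memory node:

/* Hedged illustration: create one sub-communicator per shared-memory node. */
#include <mpi.h>

static int example_split_shared(MPI_Comm comm, MPI_Comm *nodecomm)
{
    int rank;
    MPI_Comm_rank(comm, &rank);

    /* key = rank keeps the original relative ordering within each node,
     * i.e. the no-reorder fast path handled above. */
    return MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, rank,
                               MPI_INFO_NULL, nodecomm);
}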
 967 
 968 /**********************************************************************/
 969 /**********************************************************************/
 970 /**********************************************************************/
 971 int ompi_comm_dup ( ompi_communicator_t * comm, ompi_communicator_t **newcomm )
 972 {
 973     return ompi_comm_dup_with_info (comm, NULL, newcomm);
 974 }
 975 
 976 /**********************************************************************/
 977 /**********************************************************************/
 978 /**********************************************************************/
 979 int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, ompi_communicator_t **newcomm )
 980 {
 981     ompi_communicator_t *newcomp = NULL;
 982     ompi_group_t *remote_group = NULL;
 983     int mode = OMPI_COMM_CID_INTRA, rc = OMPI_SUCCESS;
 984 
 985     if ( OMPI_COMM_IS_INTER ( comm ) ){
 986         mode   = OMPI_COMM_CID_INTER;
 987         remote_group = comm->c_remote_group;
 988     }
 989 
 990     *newcomm = MPI_COMM_NULL;
 991 
 992     rc =  ompi_comm_set ( &newcomp,                               /* new comm */
 993                           comm,                                   /* old comm */
 994                           0,                                      /* local array size */
 995                           NULL,                                   /* local_procs*/
 996                           0,                                      /* remote array size */
 997                           NULL,                                   /* remote_procs */
 998                           comm->c_keyhash,                        /* attrs */
 999                           comm->error_handler,                    /* error handler */
1000                           true,                                   /* copy the topo */
1001                           comm->c_local_group,                    /* local group */
1002                           remote_group );                         /* remote group */
1003     if ( OMPI_SUCCESS != rc) {
1004         return rc;
1005     }
1006 
1007     /* Determine context id. It is identical to f_2_c_handle */
1008     rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode);
1009     if ( OMPI_SUCCESS != rc ) {
1010         OBJ_RELEASE(newcomp);
1011         return rc;
1012     }
1013 
1014     /* Set name for debugging purposes */
1015     snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d DUP FROM %d",
1016              newcomp->c_contextid, comm->c_contextid );
1017 
1018     // Copy info if there is one.
1019     newcomp->super.s_info = OBJ_NEW(opal_info_t);
1020     if (info) {
1021         opal_info_dup(info, &(newcomp->super.s_info));
1022     }
1023 
1024     /* activate communicator and init coll-module */
1025     rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
1026     if ( OMPI_SUCCESS != rc ) {
1027         OBJ_RELEASE(newcomp);
1028         return rc;
1029     }
1030 
1031     *newcomm = newcomp;
1032     return MPI_SUCCESS;
1033 }
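
This is the internal path behind MPI_Comm_dup_with_info. A hedged user-level sketch; the info key/value shown is purely illustrative:

/* Hedged illustration: duplicate a communicator while attaching an info hint. */
#include <mpi.h>

static int example_dup_with_info(MPI_Comm comm, MPI_Comm *dup)
{
    MPI_Info info;
    int rc;

    MPI_Info_create(&info);
    MPI_Info_set(info, "example_key", "example_value");   /* hypothetical hint */

    rc = MPI_Comm_dup_with_info(comm, info, dup);

    MPI_Info_free(&info);
    return rc;
}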
1034 
1035 struct ompi_comm_idup_with_info_context_t {
1036     opal_object_t super;
1037     ompi_communicator_t *comm;
1038     ompi_communicator_t *newcomp;
1039 };
1040 
1041 typedef struct ompi_comm_idup_with_info_context_t ompi_comm_idup_with_info_context_t;
1042 OBJ_CLASS_INSTANCE(ompi_comm_idup_with_info_context_t, opal_object_t, NULL, NULL);
1043 
1044 static int ompi_comm_idup_with_info_activate (ompi_comm_request_t *request);
1045 static int ompi_comm_idup_with_info_finish (ompi_comm_request_t *request);
1046 static int ompi_comm_idup_getcid (ompi_comm_request_t *request);
1047 
1048 int ompi_comm_idup (ompi_communicator_t *comm, ompi_communicator_t **newcomm, ompi_request_t **req)
1049 {
1050     return ompi_comm_idup_with_info (comm, NULL, newcomm, req);
1051 }
1052 
1053 int ompi_comm_idup_with_info (ompi_communicator_t *comm, opal_info_t *info, ompi_communicator_t **newcomm, ompi_request_t **req)
1054 {
1055     return ompi_comm_idup_internal (comm, comm->c_local_group, comm->c_remote_group, info, newcomm, req);
1056 }
1057 
1058 /* NTH: we need a way to idup with a smaller local group so this function takes a local group */
1059 static int ompi_comm_idup_internal (ompi_communicator_t *comm, ompi_group_t *group, ompi_group_t *remote_group,
1060                                     opal_info_t *info, ompi_communicator_t **newcomm, ompi_request_t **req)
1061 {
1062     ompi_comm_idup_with_info_context_t *context;
1063     ompi_comm_request_t *request;
1064     ompi_request_t *subreq[1];
1065     int rc;
1066 
1067     *newcomm = MPI_COMM_NULL;
1068 
1069     if (!OMPI_COMM_IS_INTER (comm)){
1070         remote_group = NULL;
1071     }
1072 
1073     request = ompi_comm_request_get ();
1074     if (NULL == request) {
1075         return OMPI_ERR_OUT_OF_RESOURCE;
1076     }
1077 
1078     context = OBJ_NEW(ompi_comm_idup_with_info_context_t);
1079     if (NULL == context) {
1080         ompi_comm_request_return (request);
1081         return OMPI_ERR_OUT_OF_RESOURCE;
1082     }
1083 
1084     context->comm    = comm;
1085 
1086     request->context = &context->super;
1087 
1088     rc =  ompi_comm_set_nb (&context->newcomp,                      /* new comm */
1089                             comm,                                   /* old comm */
1090                             0,                                      /* local array size */
1091                             NULL,                                   /* local_procs */
1092                             0,                                      /* remote array size */
1093                             NULL,                                   /* remote_procs */
1094                             comm->c_keyhash,                        /* attrs */
1095                             comm->error_handler,                    /* error handler */
1096                             true,                                   /* copy the topo */
1097                             group,                                  /* local group */
1098                             remote_group,                           /* remote group */
1099                             subreq);                                /* new subrequest */
1100     if (OMPI_SUCCESS != rc) {
1101         ompi_comm_request_return (request);
1102         return rc;
1103     }
1104 
1105     // Copy info if there is one.
1106     {
1107         ompi_communicator_t *newcomp = context->newcomp;
1108         newcomp->super.s_info = OBJ_NEW(opal_info_t);
1109         if (info) {
1110             opal_info_dup(info, &(newcomp->super.s_info));
1111         }
1112     }
1113 
1114     ompi_comm_request_schedule_append (request, ompi_comm_idup_getcid, subreq, subreq[0] ? 1 : 0);
1115 
1116     /* assign the newcomm now */
1117     *newcomm = context->newcomp;
1118 
1119     /* kick off the request */
1120     ompi_comm_request_start (request);
1121     *req = &request->super;
1122 
1123     return OMPI_SUCCESS;
1124 }
1125 
1126 static int ompi_comm_idup_getcid (ompi_comm_request_t *request)
1127 {
1128     ompi_comm_idup_with_info_context_t *context =
1129         (ompi_comm_idup_with_info_context_t *) request->context;
1130     ompi_request_t *subreq[1];
1131     int rc, mode;
1132 
1133     if (OMPI_COMM_IS_INTER(context->comm)){
1134         mode  = OMPI_COMM_CID_INTER;
1135     } else {
1136         mode  = OMPI_COMM_CID_INTRA;
1137     }
1138 
1139     /* Determine context id. It is identical to f_2_c_handle */
1140     rc = ompi_comm_nextcid_nb (context->newcomp, context->comm, NULL, NULL,
1141                                NULL, false, mode, subreq);
1142     if (OMPI_SUCCESS != rc) {
1143         ompi_comm_request_return (request);
1144         OBJ_RELEASE(context->newcomp);
1145         return rc;
1146     }
1147 
1148     ompi_comm_request_schedule_append (request, ompi_comm_idup_with_info_activate, subreq, 1);
1149 
1150     return OMPI_SUCCESS;
1151 }
1152 
1153 static int ompi_comm_idup_with_info_activate (ompi_comm_request_t *request)
1154 {
1155     ompi_comm_idup_with_info_context_t *context =
1156         (ompi_comm_idup_with_info_context_t *) request->context;
1157     ompi_request_t *subreq[1];
1158     int rc, mode;
1159 
1160     if (OMPI_COMM_IS_INTER(context->comm)){
1161         mode  = OMPI_COMM_CID_INTER;
1162     } else {
1163         mode  = OMPI_COMM_CID_INTRA;
1164     }
1165 
1166     /* Set name for debugging purposes */
1167     snprintf(context->newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d DUP FROM %d",
1168              context->newcomp->c_contextid, context->comm->c_contextid );
1169 
1170     /* activate communicator and init coll-module */
1171     rc = ompi_comm_activate_nb (&context->newcomp, context->comm, NULL, NULL, NULL, false, mode, subreq);
1172     if ( OMPI_SUCCESS != rc ) {
1173         OBJ_RELEASE(context->newcomp);
1174         return rc;
1175     }
1176 
1177     ompi_comm_request_schedule_append (request, ompi_comm_idup_with_info_finish, subreq, 1);
1178 
1179     return OMPI_SUCCESS;
1180 }
1181 
1182 static int ompi_comm_idup_with_info_finish (ompi_comm_request_t *request)
1183 {
1184     /* done */
1185     return MPI_SUCCESS;
1186 }
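
The request chain above implements the nonblocking duplication exposed as MPI_Comm_idup. A hedged usage sketch (illustrative names):

/* Hedged illustration: nonblocking duplication overlapped with local work. */
#include <mpi.h>

static int example_idup(MPI_Comm comm, MPI_Comm *dup)
{
    MPI_Request req;
    int rc = MPI_Comm_idup(comm, dup, &req);
    if (MPI_SUCCESS != rc) {
        return rc;
    }
    /* ... unrelated local work may proceed here; the new communicator must
     * not be used for communication until the request completes ... */
    return MPI_Wait(&req, MPI_STATUS_IGNORE);
}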
1187 
1188 /**********************************************************************/
1189 /**********************************************************************/
1190 /**********************************************************************/
1191 int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int tag, ompi_communicator_t **newcomm)
1192 {
1193     ompi_communicator_t *newcomp = NULL;
1194     int mode = OMPI_COMM_CID_GROUP, rc = OMPI_SUCCESS;
1195 
1196     *newcomm = MPI_COMM_NULL;
1197 
1198     rc =  ompi_comm_set ( &newcomp,                               /* new comm */
1199                           comm,                                   /* old comm */
1200                           group->grp_proc_count,                  /* local_size */
1201                           NULL,                                   /* local_procs*/
1202                           0,                                      /* remote_size */
1203                           NULL,                                   /* remote_procs */
1204                           comm->c_keyhash,                        /* attrs */
1205                           comm->error_handler,                    /* error handler */
1206                           true,                                   /* copy the topo */
1207                           group,                                  /* local group */
1208                           NULL);                                  /* remote group */
1209     if ( OMPI_SUCCESS != rc) {
1210         return rc;
1211     }
1212 
1213     /* Determine context id. It is identical to f_2_c_handle */
1214     rc = ompi_comm_nextcid (newcomp, comm, NULL, &tag, NULL, false, mode);
1215     if ( OMPI_SUCCESS != rc ) {
1216         OBJ_RELEASE(newcomp);
1217         return rc;
1218     }
1219 
1220     /* Set name for debugging purposes */
1221     snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d GROUP FROM %d",
1222              newcomp->c_contextid, comm->c_contextid );
1223 
1224     /* activate communicator and init coll-module */
1225     rc = ompi_comm_activate (&newcomp, comm, NULL, &tag, NULL, false, mode);
1226     if ( OMPI_SUCCESS != rc ) {
1227         OBJ_RELEASE(newcomp);
1228         return rc;
1229     }
1230 
1231     *newcomm = newcomp;
1232     return MPI_SUCCESS;
1233 }
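
This presumably backs MPI_Comm_create_group, which only the members of the supplied group need to call. A hedged sketch with an illustrative tag value:

/* Hedged illustration: only processes in 'grp' participate; the tag is
 * arbitrary but must match across the participating callers. */
#include <mpi.h>

static int example_create_group(MPI_Comm comm, MPI_Group grp, MPI_Comm *newcomm)
{
    const int tag = 1234;   /* illustrative tag value */
    return MPI_Comm_create_group(comm, grp, tag, newcomm);
}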
1234 
1235 /**********************************************************************/
1236 /**********************************************************************/
1237 /**********************************************************************/
1238 int ompi_comm_compare(ompi_communicator_t *comm1, ompi_communicator_t *comm2, int *result) {
1239     /* local variables */
1240     ompi_communicator_t *comp1, *comp2;
1241     int size1, size2, rsize1, rsize2;
1242     int lresult, rresult=MPI_CONGRUENT;
1243     int cmp_result;
1244 
1245     comp1 = (ompi_communicator_t *) comm1;
1246     comp2 = (ompi_communicator_t *) comm2;
1247 
1248     if ( comp1->c_contextid == comp2->c_contextid ) {
1249         *result = MPI_IDENT;
1250         return MPI_SUCCESS;
1251     }
1252 
1253     if ( MPI_COMM_NULL == comm1 || MPI_COMM_NULL == comm2 ) {
1254         *result = MPI_UNEQUAL;
1255         return MPI_SUCCESS;
1256     }
1257 
1258     /* compare sizes of local and remote groups */
1259     size1 = ompi_comm_size (comp1);
1260     size2 = ompi_comm_size (comp2);
1261     rsize1 = ompi_comm_remote_size (comp1);
1262     rsize2 = ompi_comm_remote_size (comp2);
1263 
1264     if ( size1 != size2 || rsize1 != rsize2 ) {
1265         *result = MPI_UNEQUAL;
1266         return MPI_SUCCESS;
1267     }
1268 
1269     /* Compare local groups */
1270     ompi_group_compare((ompi_group_t *)comp1->c_local_group,
1271                        (ompi_group_t *)comp2->c_local_group,
1272                        &cmp_result);
1273 
1274     /* MPI_IDENT resulting from the group comparison is
1275      * MPI_CONGRUENT for communicators.
1276      * All other results are the same.
1277      */
1278     if( MPI_IDENT == cmp_result ) {
1279         lresult = MPI_CONGRUENT;
1280     } else {
1281         lresult = cmp_result;
1282     }
1283 
1284 
1285     if ( rsize1 > 0 ) {
1286         /* Compare remote groups for inter-communicators */
1287         ompi_group_compare((ompi_group_t *)comp1->c_remote_group,
1288                            (ompi_group_t *)comp2->c_remote_group,
1289                            &cmp_result);
1290 
1291         /* MPI_IDENT resulting from the group comparison is
1292          * MPI_CONGRUENT for communicators.
1293          * All other results are the same.
1294          */
1295         if( MPI_IDENT == cmp_result ) {
1296             rresult = MPI_CONGRUENT;
1297         } else {
1298             rresult = cmp_result;
1299         }
1300     }
1301 
1302     /* determine final results */
1303     if ( MPI_CONGRUENT == rresult ) {
1304         *result = lresult;
1305     }
1306     else if ( MPI_SIMILAR == rresult ) {
1307         if ( MPI_SIMILAR == lresult || MPI_CONGRUENT == lresult ) {
1308             *result = MPI_SIMILAR;
1309         }
1310         else {
1311             *result = MPI_UNEQUAL;
1312         }
1313     }
1314     else if ( MPI_UNEQUAL == rresult ) {
1315         *result = MPI_UNEQUAL;
1316     }
1317 
1318     return OMPI_SUCCESS;
1319 }
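
As a quick illustration of the result lattice computed above: identical handles compare as MPI_IDENT, while a duplicate (same groups, new context id) compares as MPI_CONGRUENT. A hedged sketch:

/* Hedged illustration of MPI_Comm_compare results. */
#include <assert.h>
#include <mpi.h>

static void example_compare(MPI_Comm comm)
{
    MPI_Comm dup;
    int result;

    MPI_Comm_compare(comm, comm, &result);
    assert(MPI_IDENT == result);          /* same context id */

    MPI_Comm_dup(comm, &dup);
    MPI_Comm_compare(comm, dup, &result);
    assert(MPI_CONGRUENT == result);      /* same groups, different context id */

    MPI_Comm_free(&dup);
}
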
1320 /**********************************************************************/
1321 /**********************************************************************/
1322 /**********************************************************************/
1323 int ompi_comm_set_name (ompi_communicator_t *comm, const char *name )
1324 {
1325 
1326     OPAL_THREAD_LOCK(&(comm->c_lock));
1327     opal_string_copy(comm->c_name, name, MPI_MAX_OBJECT_NAME);
1328     comm->c_flags |= OMPI_COMM_NAMEISSET;
1329     OPAL_THREAD_UNLOCK(&(comm->c_lock));
1330 
1331     return OMPI_SUCCESS;
1332 }
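
MPI_Comm_set_name and MPI_Comm_get_name are the user-facing wrappers for this. A hedged sketch with an illustrative name:

/* Hedged illustration: name a communicator so debuggers/tools can display it. */
#include <mpi.h>

static void example_set_name(MPI_Comm comm)
{
    char name[MPI_MAX_OBJECT_NAME];
    int len;

    MPI_Comm_set_name(comm, "example_comm");   /* illustrative name */
    MPI_Comm_get_name(comm, name, &len);       /* retrieves the stored name */
}
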
1333 /**********************************************************************/
1334 /**********************************************************************/
1335 /**********************************************************************/
1336 /*
1337  * Implementation of MPI_Allgather for the local_group in an inter-comm.
1338  * The algorithm consists of two steps:
1339  * 1. an inter-gather to rank 0 in remote group
1340  * 2. an inter-bcast from rank 0 in remote_group.
1341  */
1342 
1343 static int ompi_comm_allgather_emulate_intra( void *inbuf, int incount,
1344                                               MPI_Datatype intype, void* outbuf,
1345                                               int outcount, MPI_Datatype outtype,
1346                                               ompi_communicator_t *comm,
1347                                               mca_coll_base_module_t *data)
1348 {
1349     int rank, size, rsize, i, rc;
1350     int *tmpbuf=NULL;
1351     MPI_Request *req=NULL, sendreq;
1352 
1353     rsize = ompi_comm_remote_size(comm);
1354     size  = ompi_comm_size(comm);
1355     rank  = ompi_comm_rank(comm);
1356 
1357     /* silence clang warning about 0-byte malloc. neither of these values can
1358      * be 0 here */
1359     if (OPAL_UNLIKELY(0 == rsize || 0 == outcount)) {
1360         return OMPI_ERR_BAD_PARAM;
1361     }
1362 
1363     /* Step 1: the gather-step */
1364     if ( 0 == rank ) {
1365         tmpbuf = (int *) malloc (rsize*outcount*sizeof(int));
1366         if ( NULL == tmpbuf ) {
1367             return (OMPI_ERR_OUT_OF_RESOURCE);
1368         }
1369         req = (MPI_Request *)malloc (rsize*outcount*sizeof(MPI_Request));
1370         if ( NULL == req ) {
1371             free ( tmpbuf );
1372             return (OMPI_ERR_OUT_OF_RESOURCE);
1373         }
1374 
1375         for ( i=0; i<rsize; i++) {
1376             rc = MCA_PML_CALL(irecv( &tmpbuf[outcount*i], outcount, outtype, i,
1377                                      OMPI_COMM_ALLGATHER_TAG, comm, &req[i] ));
1378             if ( OMPI_SUCCESS != rc ) {
1379                 goto exit;
1380             }
1381         }
1382     }
1383     rc = MCA_PML_CALL(isend( inbuf, incount, intype, 0, OMPI_COMM_ALLGATHER_TAG,
1384                              MCA_PML_BASE_SEND_STANDARD, comm, &sendreq ));
1385     if ( OMPI_SUCCESS != rc ) {
1386         goto exit;
1387     }
1388 
1389     if ( 0 == rank ) {
1390         rc = ompi_request_wait_all( rsize, req, MPI_STATUSES_IGNORE);
1391         if ( OMPI_SUCCESS != rc ) {
1392             goto exit;
1393         }
1394     }
1395 
1396     rc = ompi_request_wait( &sendreq, MPI_STATUS_IGNORE);
1397     if ( OMPI_SUCCESS != rc ) {
1398         goto exit;
1399     }
1400 
1401     /* Step 2: the inter-bcast step */
1402     rc = MCA_PML_CALL(irecv (outbuf, size*outcount, outtype, 0,
1403                              OMPI_COMM_ALLGATHER_TAG, comm, &sendreq));
1404     if ( OMPI_SUCCESS != rc ) {
1405         goto exit;
1406     }
1407 
1408     if ( 0 == rank ) {
1409         for ( i=0; i < rsize; i++ ){
1410             rc = MCA_PML_CALL(send (tmpbuf, rsize*outcount, outtype, i,
1411                                     OMPI_COMM_ALLGATHER_TAG,
1412                                     MCA_PML_BASE_SEND_STANDARD, comm));
1413             if ( OMPI_SUCCESS != rc ) {
1414                 goto exit;
1415             }
1416         }
1417     }
1418 
1419     rc = ompi_request_wait( &sendreq, MPI_STATUS_IGNORE );
1420 
1421  exit:
1422     if ( NULL != req ) {
1423         free ( req );
1424     }
1425     if ( NULL != tmpbuf ) {
1426         free ( tmpbuf );
1427     }
1428 
1429     return (rc);
1430 }
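/*
 * A minimal usage sketch (illustrative only, kept out of the build):
 * gather one int per local-group process over an inter-communicator via
 * the emulation above.  The coll module argument is not referenced by
 * the point-to-point based emulation, so NULL is passed here; whether a
 * caller may rely on that is an assumption of this sketch.
 */
#if 0
static int example_emulated_allgather (ompi_communicator_t *intercomm)
{
    int myval = ompi_comm_rank (intercomm);
    int size  = ompi_comm_size (intercomm);
    int rc;
    int *vals = (int *) malloc (size * sizeof(int));

    if (NULL == vals) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /* each process contributes one MPI_INT and receives one per local rank */
    rc = ompi_comm_allgather_emulate_intra (&myval, 1, MPI_INT, vals, 1,
                                            MPI_INT, intercomm, NULL);
    free (vals);
    return rc;
}
#endif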
1431 /**********************************************************************/
1432 /**********************************************************************/
1433 /**********************************************************************/
1434 /*
1435 ** Counterpart to MPI_Comm_free. To be used within OMPI.
1436 ** The freeing of all attached objects (groups, errhandlers,
1437 ** etc.) has moved to the destructor.
1438 */
1439 int ompi_comm_free( ompi_communicator_t **comm )
1440 {
1441     int ret;
1442     int cid = (*comm)->c_contextid;
1443     int is_extra_retain = OMPI_COMM_IS_EXTRA_RETAIN(*comm);
1444 
1445     /* Release attributes.  We do this now instead of during the
1446        communicator destructor for 2 reasons:
1447 
1448        1. The only case in which the destructor is NOT called right
1449        away from ompi_comm_free() is when the reference count is still
1450        greater than zero at that point, meaning that there are ongoing
1451        communications.  However, pending communications will never
1452        need attributes, so it's safe to release them directly here.
1453 
1454        2. Releasing attributes in ompi_comm_free() enables us to check
1455        the return status of the attribute delete functions.  At
1456        least one interpretation of the MPI standard (i.e., the one
1457        of the Intel test suite) is that if any of the attribute
1458        deletion functions fail, then MPI_COMM_FREE /
1459        MPI_COMM_DISCONNECT should also fail.  We can't do that if
1460        we delay releasing the attributes -- we need to release the
1461        attributes right away so that we can report the error right
1462        away. */
1463     if (NULL != (*comm)->c_keyhash) {
1464         ret = ompi_attr_delete_all(COMM_ATTR, *comm, (*comm)->c_keyhash);
1465         if (OMPI_SUCCESS != ret) {
1466             return ret;
1467         }
1468         OBJ_RELEASE((*comm)->c_keyhash);
1469     }
1470 
1471     if ( OMPI_COMM_IS_INTER(*comm) ) {
1472         if ( ! OMPI_COMM_IS_INTRINSIC((*comm)->c_local_comm)) {
1473             ompi_comm_free (&(*comm)->c_local_comm);
1474         }
1475     }
1476 
1477     /* Special case: if we are freeing the parent handle, then we need
1478        to set our internal handle to the parent to be equal to
1479        COMM_NULL.  This is according to MPI-2:88-89. */
1480 
1481     if (*comm == ompi_mpi_comm_parent && comm != &ompi_mpi_comm_parent) {
1482         ompi_mpi_comm_parent = &ompi_mpi_comm_null.comm;
1483     }
1484 
1485     if (NULL != ((*comm)->super.s_info)) {
1486         OBJ_RELEASE((*comm)->super.s_info);
1487     }
1488 
1489     /* Release the communicator */
1490     if ( OMPI_COMM_IS_DYNAMIC (*comm) ) {
1491         ompi_comm_num_dyncomm --;
1492     }
1493     OBJ_RELEASE( (*comm) );
1494 
1495     if ( is_extra_retain) {
1496         /* This communicator has been marked as an "extra retain"
1497          * communicator. This can happen when a communicator creates
1498          * 'dependent' subcommunicators (e.g. for inter-communicators,
1499          * or when using the hierarch collective module) *and* the cid
1500          * of the dependent communicator turned out to be lower than
1501          * that of the parent.
1502          * In that case the reference count has been increased by one
1503          * extra, in order to handle the scenario in which the user
1504          * never frees the dependent communicator.
1505          * Therefore, once we enter this routine, we may release that
1506          * extra reference here as well. Note that
1507          * ompi_comm_finalize only uses OBJ_RELEASE instead
1508          * of ompi_comm_free(), and the extra reference
1509          * makes sure that the pointer to the dependent communicator
1510          * still contains a valid object.
1511          */
1512         ompi_communicator_t *tmpcomm = (ompi_communicator_t *) opal_pointer_array_get_item(&ompi_mpi_communicators, cid);
1513         if ( NULL != tmpcomm ){
1514             ompi_comm_free(&tmpcomm);
1515         }
1516     }
1517 
1518     *comm = MPI_COMM_NULL;
1519     return OMPI_SUCCESS;
1520 }
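/*
 * A minimal usage sketch (illustrative only, kept out of the build):
 * duplicate a communicator and release it again through the internal
 * counterpart of MPI_Comm_free.  The (comm, &newcomm) signature of
 * ompi_comm_dup() is assumed here from its use elsewhere in OMPI.
 */
#if 0
static int example_dup_and_free (ompi_communicator_t *comm)
{
    ompi_communicator_t *tmp = NULL;
    int rc = ompi_comm_dup (comm, &tmp);

    if (OMPI_SUCCESS != rc) {
        return rc;
    }
    /* on success the handle is reset to MPI_COMM_NULL, so it must not
     * be used afterwards */
    return ompi_comm_free (&tmp);
}
#endif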
1521 
1522 /**********************************************************************/
1523 /**********************************************************************/
1524 /**********************************************************************/
1525 int ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
1526                            ompi_communicator_t *bridge_comm,
1527                            int local_leader,
1528                            int remote_leader,
1529                            int tag,
1530                            int rsize,
1531                            ompi_proc_t ***prprocs )
1532 {
1533     MPI_Request req;
1534     int rc = OMPI_SUCCESS;
1535     int local_rank, local_size;
1536     ompi_proc_t **rprocs=NULL;
1537     int32_t size_len;
1538     int int_len=0, rlen;
1539     opal_buffer_t *sbuf=NULL, *rbuf=NULL;
1540     void *sendbuf=NULL;
1541     char *recvbuf;
1542     ompi_proc_t **proc_list=NULL;
1543     int i;
1544 
1545     local_rank = ompi_comm_rank (local_comm);
1546     local_size = ompi_comm_size (local_comm);
1547 
1548     if (local_rank == local_leader) {
1549         sbuf = OBJ_NEW(opal_buffer_t);
1550         if (NULL == sbuf) {
1551             rc = OMPI_ERR_OUT_OF_RESOURCE;
1552             goto err_exit;
1553         }
1554         if(OMPI_GROUP_IS_DENSE(local_comm->c_local_group)) {
1555             rc = ompi_proc_pack(local_comm->c_local_group->grp_proc_pointers,
1556                                 local_size, sbuf);
1557         }
1558         /* get the proc list for the sparse implementations */
1559         else {
1560             proc_list = (ompi_proc_t **) calloc (local_comm->c_local_group->grp_proc_count,
1561                                                  sizeof (ompi_proc_t *));
1562             for(i=0 ; i<local_comm->c_local_group->grp_proc_count ; i++)
1563                 proc_list[i] = ompi_group_peer_lookup(local_comm->c_local_group,i);
1564             rc = ompi_proc_pack (proc_list, local_size, sbuf);
1565         }
1566         if ( OMPI_SUCCESS != rc ) {
1567             goto err_exit;
1568         }
1569         if (OPAL_SUCCESS != (rc = opal_dss.unload(sbuf, &sendbuf, &size_len))) {
1570             goto err_exit;
1571         }
1572 
1573         /* exchange the buffer lengths with the remote_leader */
1574         rc = MCA_PML_CALL(irecv (&rlen, 1, MPI_INT, remote_leader, tag,
1575                                  bridge_comm, &req ));
1576         if ( OMPI_SUCCESS != rc ) {
1577             goto err_exit;
1578         }
1579         int_len = (int)size_len;
1580 
1581         rc = MCA_PML_CALL(send (&int_len, 1, MPI_INT, remote_leader, tag,
1582                                 MCA_PML_BASE_SEND_STANDARD, bridge_comm ));
1583         if ( OMPI_SUCCESS != rc ) {
1584             goto err_exit;
1585         }
1586         rc = ompi_request_wait( &req, MPI_STATUS_IGNORE );
1587         if ( OMPI_SUCCESS != rc ) {
1588             goto err_exit;
1589         }
1590     }
1591 
1592     /* broadcast buffer length to all processes in local_comm */
1593     rc = local_comm->c_coll->coll_bcast( &rlen, 1, MPI_INT,
1594                                         local_leader, local_comm,
1595                                         local_comm->c_coll->coll_bcast_module );
1596     if ( OMPI_SUCCESS != rc ) {
1597         goto err_exit;
1598     }
1599 
1600     /* Allocate temporary buffer */
1601     recvbuf = (char *)malloc(rlen);
1602     if ( NULL == recvbuf ) {
1603         rc = OMPI_ERR_OUT_OF_RESOURCE;
1604         goto err_exit;
1605     }
1606 
1607     if ( local_rank == local_leader ) {
1608         /* the local leaders exchange the packed proc name lists */
1609         rc = MCA_PML_CALL(irecv (recvbuf, rlen, MPI_BYTE, remote_leader, tag,
1610                                  bridge_comm, &req ));
1611         if ( OMPI_SUCCESS != rc ) {
1612             goto err_exit;
1613         }
1614         rc = MCA_PML_CALL(send(sendbuf, int_len, MPI_BYTE, remote_leader, tag,
1615                                MCA_PML_BASE_SEND_STANDARD, bridge_comm ));
1616         if ( OMPI_SUCCESS != rc ) {
1617             goto err_exit;
1618         }
1619         rc = ompi_request_wait( &req, MPI_STATUS_IGNORE );
1620         if ( OMPI_SUCCESS != rc ) {
1621             goto err_exit;
1622         }
1623     }
1624 
1625     /* broadcast the name list to all processes in local_comm */
1626     rc = local_comm->c_coll->coll_bcast( recvbuf, rlen, MPI_BYTE,
1627                                         local_leader, local_comm,
1628                                         local_comm->c_coll->coll_bcast_module);
1629     if ( OMPI_SUCCESS != rc ) {
1630         goto err_exit;
1631     }
1632 
1633     rbuf = OBJ_NEW(opal_buffer_t);
1634     if (NULL == rbuf) {
1635         rc = OMPI_ERR_OUT_OF_RESOURCE;
1636         goto err_exit;
1637     }
1638 
1639     if (OMPI_SUCCESS != (rc = opal_dss.load(rbuf, recvbuf, rlen))) {
1640         goto err_exit;
1641     }
1642 
1643     /* decode the names into a proc-list -- this will never add a new
1644        proc as the result of this operation, so there is no need to
1645        request the newprocs list from ompi_proc_unpack(). */
1646     rc = ompi_proc_unpack(rbuf, rsize, &rprocs, NULL, NULL);
1647     OBJ_RELEASE(rbuf);
1648     if (OMPI_SUCCESS != rc) {
1649         goto err_exit;
1650     }
1651 
1652     /* set the locality of the remote procs */
1653     for (i=0; i < rsize; i++) {
1654         /* get the locality information - all RTEs are required
1655          * to provide this information at startup */
1656         uint16_t *u16ptr, u16;
1657         u16ptr = &u16;
1658         OPAL_MODEX_RECV_VALUE(rc, OPAL_PMIX_LOCALITY, &rprocs[i]->super.proc_name, &u16ptr, OPAL_UINT16);
1659         if (OPAL_SUCCESS == rc) {
1660             rprocs[i]->super.proc_flags = u16;
1661         } else {
1662             rprocs[i]->super.proc_flags = OPAL_PROC_NON_LOCAL;
1663         }
1664     }
1665 
1666     /* And now add the information into the database */
1667     if (OMPI_SUCCESS != (rc = MCA_PML_CALL(add_procs(rprocs, rsize)))) {
1668         goto err_exit;
1669     }
1670 
1671  err_exit:
1672     /* rprocs isn't freed unless we have an error,
1673        since it is used in the communicator */
1674     if ( OMPI_SUCCESS != rc ) {
1675         OMPI_ERROR_LOG(rc);
1676             opal_output(0, "%d: Error in ompi_comm_get_rprocs\n", local_rank);
1677         if ( NULL != rprocs ) {
1678             free ( rprocs );
1679             rprocs=NULL;
1680         }
1681     }
1682     /* make sure the buffers have been released */
1683     if (NULL != sbuf) {
1684         OBJ_RELEASE(sbuf);
1685     }
1686     if (NULL != rbuf) {
1687         OBJ_RELEASE(rbuf);
1688     }
1689     if ( NULL != proc_list ) {
1690         free ( proc_list );
1691     }
1692     if (NULL != sendbuf) {
1693         free ( sendbuf );
1694     }
1695 
1696     *prprocs = rprocs;
1697     return rc;
1698 }
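/*
 * A minimal caller sketch (illustrative only, kept out of the build):
 * an intercomm-create style caller fetching the remote proc list.  The
 * remote group size and the point-to-point tag would normally have been
 * agreed on by the two leaders beforehand; both are taken as parameters
 * here rather than assumed.
 */
#if 0
static int example_fetch_rprocs (ompi_communicator_t *local_comm,
                                 ompi_communicator_t *bridge_comm,
                                 int local_leader, int remote_leader,
                                 int tag, int rsize)
{
    ompi_proc_t **rprocs = NULL;
    int rc;

    rc = ompi_comm_get_rprocs (local_comm, bridge_comm, local_leader,
                               remote_leader, tag, rsize, &rprocs);
    if (OMPI_SUCCESS != rc) {
        return rc;        /* rprocs is NULL on error */
    }
    /* rprocs is now owned by the caller; it is typically handed to
     * ompi_comm_set() when building the new inter-communicator */
    free (rprocs);
    return OMPI_SUCCESS;
}
#endif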
1699 /**********************************************************************/
1700 /**********************************************************************/
1701 /**********************************************************************/
1702 /**
1703  * This routine verifies whether the local group and the remote group
1704  * overlap in intercomm_create
1705  */
1706 int ompi_comm_overlapping_groups (int size, ompi_proc_t **lprocs,
1707                                   int rsize, ompi_proc_t ** rprocs)
1708 
1709 {
1710     int rc=OMPI_SUCCESS;
1711     int i,j;
1712 
1713     for (i=0; i<size; i++) {
1714         for ( j=0; j<rsize; j++) {
1715             if ( lprocs[i] == rprocs[j] ) {
1716                 rc = MPI_ERR_COMM;
1717                 return rc;
1718             }
1719         }
1720     }
1721 
1722     return rc;
1723 }
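/*
 * A minimal usage sketch (illustrative only, kept out of the build):
 * run the disjointness check on the local group and a remote proc list
 * such as the one returned by ompi_comm_get_rprocs() above.
 */
#if 0
static int example_check_disjoint (ompi_communicator_t *local_comm,
                                   ompi_proc_t **rprocs, int rsize)
{
    int i, rc;
    int lsize = ompi_comm_size (local_comm);
    ompi_proc_t **lprocs = (ompi_proc_t **) calloc (lsize, sizeof(ompi_proc_t *));

    if (NULL == lprocs) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    for (i = 0; i < lsize; i++) {
        lprocs[i] = ompi_group_peer_lookup (local_comm->c_local_group, i);
    }
    /* returns MPI_ERR_COMM if any process appears in both groups */
    rc = ompi_comm_overlapping_groups (lsize, lprocs, rsize, rprocs);
    free (lprocs);
    return rc;
}
#endif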
1724 /**********************************************************************/
1725 /**********************************************************************/
1726 /**********************************************************************/
1727 int ompi_comm_determine_first ( ompi_communicator_t *intercomm, int high )
1728 {
1729     int flag, rhigh;
1730     int rank, rsize;
1731     int *rcounts;
1732     int *rdisps;
1733     int scount=0;
1734     int rc;
1735     ompi_proc_t *ourproc, *theirproc;
1736     ompi_rte_cmp_bitmask_t mask;
1737 
1738     rank = ompi_comm_rank        (intercomm);
1739     rsize= ompi_comm_remote_size (intercomm);
1740 
1741     /* silence clang warnings. rsize cannot be 0 here */
1742     if (OPAL_UNLIKELY(0 == rsize)) {
1743         return OMPI_ERR_BAD_PARAM;
1744     }
1745 
1746     rdisps  = (int *) calloc ( rsize, sizeof(int));
1747     if ( NULL == rdisps ){
1748         return OMPI_ERR_OUT_OF_RESOURCE;
1749     }
1750 
1751     rcounts = (int *) calloc ( rsize, sizeof(int));
1752     if ( NULL == rcounts ){
1753         free (rdisps);
1754         return OMPI_ERR_OUT_OF_RESOURCE;
1755     }
1756 
1757     rcounts[0] = 1;
1758     if ( 0 == rank ) {
1759         scount = 1;
1760     }
1761 
1762     rc = intercomm->c_coll->coll_allgatherv(&high, scount, MPI_INT,
1763                                            &rhigh, rcounts, rdisps,
1764                                            MPI_INT, intercomm,
1765                                            intercomm->c_coll->coll_allgatherv_module);
1766     if ( NULL != rdisps ) {
1767         free ( rdisps );
1768     }
1769     if ( NULL != rcounts ) {
1770         free ( rcounts );
1771     }
1772 
1773     if ( rc != OMPI_SUCCESS ) {
1774         return rc;
1775     }
1776 
1777     /* This is the logic for determining who is first, who is second */
1778     if ( high && !rhigh ) {
1779         flag = false;
1780     }
1781     else if ( !high && rhigh ) {
1782         flag = true;
1783     }
1784     else {
1785         ourproc   = ompi_group_peer_lookup(intercomm->c_local_group,0);
1786         theirproc = ompi_group_peer_lookup(intercomm->c_remote_group,0);
1787 
1788         mask = OMPI_RTE_CMP_JOBID | OMPI_RTE_CMP_VPID;
1789         rc = ompi_rte_compare_name_fields(mask, (const ompi_process_name_t*)&(ourproc->super.proc_name),
1790                                                 (const ompi_process_name_t*)&(theirproc->super.proc_name));
1791         if ( 0 > rc ) {
1792             flag = true;
1793         }
1794         else {
1795             flag = false;
1796         }
1797     }
1798 
1799     return flag;
1800 }
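/*
 * A minimal usage sketch (illustrative only, kept out of the build):
 * an intercomm-merge style caller deciding which group comes first in
 * the merged communicator.  When both sides pass the same 'high' value,
 * the routine above breaks the tie by comparing the two leaders'
 * process names.
 */
#if 0
static void example_merge_order (ompi_communicator_t *intercomm, int high)
{
    /* note: negative OMPI error codes may also be returned; error
     * handling is omitted in this sketch */
    int first = ompi_comm_determine_first (intercomm, high);

    if (first) {
        /* ranks of the local group precede those of the remote group */
    } else {
        /* ranks of the remote group come first */
    }
}
#endif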
1801 /********************************************************************************/
1802 /********************************************************************************/
1803 /********************************************************************************/
1804 int ompi_comm_dump ( ompi_communicator_t *comm )
1805 {
1806     opal_output(0, "Dumping information for comm_cid %d\n", comm->c_contextid);
1807     opal_output(0,"  f2c index:%d cube_dim: %d\n", comm->c_f_to_c_index,
1808                 comm->c_cube_dim);
1809     opal_output(0,"  Local group: size = %d my_rank = %d\n",
1810                 comm->c_local_group->grp_proc_count,
1811                 comm->c_local_group->grp_my_rank );
1812 
1813     opal_output(0,"  Communicator is:");
1814     /* Display flags */
1815     if ( OMPI_COMM_IS_INTER(comm) )
1816         opal_output(0," inter-comm,");
1817     if ( OMPI_COMM_IS_CART(comm))
1818         opal_output(0," topo-cart");
1819     else if ( OMPI_COMM_IS_GRAPH(comm))
1820         opal_output(0," topo-graph");
1821     else if ( OMPI_COMM_IS_DIST_GRAPH(comm))
1822         opal_output(0," topo-dist-graph");
1823     opal_output(0,"\n");
1824 
1825     if (OMPI_COMM_IS_INTER(comm)) {
1826         opal_output(0,"  Remote group size:%d\n", comm->c_remote_group->grp_proc_count);
1827     }
1828     return OMPI_SUCCESS;
1829 }
1830 /********************************************************************************/
1831 /********************************************************************************/
1832 /********************************************************************************/
1833 /* static functions */
1834 /*
1835 ** rankkeycompare() compares (rank, key) tuples, producing the
1836 ** sorted ordering that the rules of MPI_Comm_split require
1837 */
1838 static int rankkeycompare (const void *p, const void *q)
1839 {
1840     int *a, *b;
1841 
1842     /* rank is at [0], key is at [1] */
1843     /* i.e. we cast and compare the keys first, then the original ranks */
1844     a = (int*)p;
1845     b = (int*)q;
1846 
1847     /* simple tests are those where the keys are different */
1848     if (a[1] < b[1]) {
1849         return (-1);
1850     }
1851     if (a[1] > b[1]) {
1852         return (1);
1853     }
1854 
1855     /* ok, if the keys are the same then we check the original ranks */
1856     if (a[1] == b[1]) {
1857         if (a[0] < b[0]) {
1858             return (-1);
1859         }
1860         if (a[0] == b[0]) {
1861             return (0);
1862         }
1863         if (a[0] > b[0]) {
1864             return (1);
1865         }
1866     }
1867     return ( 0 );
1868 }
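/*
 * A minimal usage sketch (illustrative only, kept out of the build):
 * rankkeycompare() is intended as a qsort(3) comparator over an array
 * of (rank, key) int pairs, yielding ascending keys with ties broken
 * by the original rank, as MPI_Comm_split requires.
 */
#if 0
#include <stdlib.h>   /* for qsort */

static void example_sort_rank_key (int *pairs, int nprocs)
{
    /* pairs[2*i] holds the original rank, pairs[2*i+1] the split key */
    qsort (pairs, nprocs, 2 * sizeof(int), rankkeycompare);
}
#endif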
1869 
1870 
1871 /***********************************************************************
1872  * Counterpart of MPI_Cart/Graph_create. This will be called from the
1873  * top-level MPI layer. The check for an INTER communicator has
1874  * already been done by the time this is invoked. This function does
1875  * largely the same things as ompi_comm_create. It will,
1876  * however, select a component for the topology and then call
1877  * cart_create on that component so that it can re-arrange the proc
1878  * structure as required (if the reorder flag is true). It will then
1879  * use this proc structure to create the communicator using
1880  * ompi_comm_set.
1881  */
1882 
1883 /**
1884  * Take an almost complete communicator and reserve the CID as well
1885  * as activate it (initialize the collective and the topologies).
1886  */
1887 int ompi_comm_enable(ompi_communicator_t *old_comm,
1888                      ompi_communicator_t *new_comm,
1889                      int new_rank,
1890                      int num_procs,
1891                      ompi_proc_t** topo_procs)
1892 {
1893     int ret = OMPI_SUCCESS;
1894 
1895     /* set the rank information before calling nextcid */
1896     new_comm->c_local_group->grp_my_rank = new_rank;
1897     new_comm->c_my_rank = new_rank;
1898 
1899     /* Determine context id. It is identical to f_2_c_handle */
1900     ret = ompi_comm_nextcid (new_comm, old_comm, NULL, NULL, NULL, false,
1901                              OMPI_COMM_CID_INTRA);
1902     if (OMPI_SUCCESS != ret) {
1903         /* something wrong happened while setting the communicator */
1904         goto complete_and_return;
1905     }
1906 
1907     /* At this point the topology module has been selected and the
1908      * group that carries the topology information has been created.
1909      * All we need to do now is fill the rest of the information into
1910      * the communicator. The following steps are not just similar to
1911      * ompi_comm_set, but are actually the same */
1912 
1913     ret = ompi_comm_fill_rest(new_comm,                /* the communicator */
1914                               num_procs,               /* local size */
1915                               topo_procs,              /* process structure */
1916                               new_rank,                /* rank of the process */
1917                               old_comm->error_handler); /* error handler */
1918 
1919     if (OMPI_SUCCESS != ret) {
1920         /* something wrong happened while setting the communicator */
1921         goto complete_and_return;
1922     }
1923 
1924     ret = ompi_comm_activate (&new_comm, old_comm, NULL, NULL, NULL, false,
1925                               OMPI_COMM_CID_INTRA);
1926     if (OMPI_SUCCESS != ret) {
1927         /* something wrong happened while setting the communicator */
1928         goto complete_and_return;
1929     }
1930 
1931  complete_and_return:
1932     return ret;
1933 }
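/*
 * A minimal caller sketch (illustrative only, kept out of the build):
 * the shape of a topo-component caller.  After the component has
 * (possibly) reordered the proc array, the nearly complete communicator
 * is finalized through ompi_comm_enable().  'reordered_procs' and
 * 'new_rank' are assumed to come from the component's mapping step.
 */
#if 0
static int example_finish_topo_comm (ompi_communicator_t *old_comm,
                                     ompi_communicator_t *new_comm,
                                     ompi_proc_t **reordered_procs,
                                     int new_rank, int num_procs)
{
    return ompi_comm_enable (old_comm, new_comm, new_rank,
                             num_procs, reordered_procs);
}
#endif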
1934 
1935 static int ompi_comm_fill_rest(ompi_communicator_t *comm,
1936                                int num_procs,
1937                                ompi_proc_t **proc_pointers,
1938                                int my_rank,
1939                                ompi_errhandler_t *errh)
1940 {
1941     /* properly decrement the ref counts on the groups.
1942        We are doing this because this function is sort of a redo
1943        of what is done in comm.c. There is no need to decrement the
1944        ref count on the proc pointers.
1945        This is just a quick fix; a better solution is still being
1946        sought */
1947     if (comm->c_local_group) {
1948         OBJ_RELEASE( comm->c_local_group );
1949     }
1950 
1951     if (comm->c_remote_group) {
1952         OBJ_RELEASE( comm->c_remote_group );
1953     }
1954 
1955     /* allocate a group structure for the new communicator */
1956     comm->c_local_group = ompi_group_allocate_plist_w_procs (proc_pointers, num_procs);
1957 
1958     /* set the remote group to be the same as local group */
1959     comm->c_remote_group = comm->c_local_group;
1960     OBJ_RETAIN( comm->c_remote_group );
1961 
1962     /* set the rank information */
1963     comm->c_local_group->grp_my_rank = my_rank;
1964     comm->c_my_rank = my_rank;
1965 
1966     if( MPI_UNDEFINED != my_rank ) {
1967         /* verify whether to set the flag that this comm
1968            contains processes from more than one jobid. */
1969         ompi_dpm_mark_dyncomm (comm);
1970     }
1971 
1972     /* set the error handler */
1973     comm->error_handler = errh;
1974     OBJ_RETAIN (comm->error_handler);
1975 
1976     /* set name for debugging purposes */
1977     /* there is no cid at this stage ... make this right and make Edgar's
1978      * code call this function and remove the duplicate code
1979      */
1980     snprintf (comm->c_name, MPI_MAX_OBJECT_NAME, "MPI_COMMUNICATOR %d",
1981               comm->c_contextid);
1982 
1983     /* determine the cube dimensions */
1984     comm->c_cube_dim = opal_cube_dim(comm->c_local_group->grp_proc_count);
1985 
1986     return OMPI_SUCCESS;
1987 }
1988 
1989 static int ompi_comm_copy_topo(ompi_communicator_t *oldcomm,
1990                                ompi_communicator_t *newcomm)
1991 {
1992     if( NULL == oldcomm->c_topo )
1993         return OMPI_ERR_NOT_FOUND;
1994 
1995     newcomm->c_topo = oldcomm->c_topo;
1996     OBJ_RETAIN(newcomm->c_topo);
1997     newcomm->c_flags |= newcomm->c_topo->type;
1998     return OMPI_SUCCESS;
1999 }
