root/ompi/mca/bml/r2/bml_r2.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. btl_exclusivity_compare
  2. mca_bml_r2_add_btls
  3. btl_bandwidth_compare
  4. mca_bml_r2_calculate_bandwidth_latency
  5. mca_bml_r2_allocate_endpoint
  6. mca_bml_r2_register_progress
  7. mca_bml_r2_endpoint_add_btl
  8. mca_bml_r2_compute_endpoint_metrics
  9. mca_bml_r2_add_proc
  10. mca_bml_r2_add_procs
  11. mca_bml_r2_del_procs
  12. bml_r2_remove_btl_progress
  13. mca_bml_r2_del_proc_btl
  14. mca_bml_r2_finalize
  15. mca_bml_r2_del_btl
  16. mca_bml_r2_add_btl
  17. mca_bml_r2_register
  18. mca_bml_r2_register_error
  19. mca_bml_r2_component_fini

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2016 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2006 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2007-2015 Los Alamos National Security, LLC.  All rights
  14  *                         reserved.
  15  * Copyright (c) 2008-2016 Cisco Systems, Inc.  All rights reserved.
  16  * Copyright (c) 2013      Intel, Inc. All rights reserved
  17  * Copyright (c) 2014      NVIDIA Corporation.  All rights reserved.
  18  * Copyright (c) 2014      Research Organization for Information Science
  19  *                         and Technology (RIST). All rights reserved.
  20  * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
  21  *                         reserved.
  22  * Copyright (c) 2016      Intel, Inc. All rights reserved.
  23  * $COPYRIGHT$
  24  *
  25  * Additional copyrights may follow
  26  *
  27  * $HEADER$
  28  */
  29 
  30 #include "ompi_config.h"
  31 
  32 #include <stdlib.h>
  33 #include <string.h>
  34 
  35 #include "opal/class/opal_bitmap.h"
  36 #include "opal/util/argv.h"
  37 #include "opal/util/show_help.h"
  38 #include "opal/util/output.h"
  39 #include "ompi/mca/bml/bml.h"
  40 #include "ompi/mca/bml/base/base.h"
  41 #include "opal/mca/btl/btl.h"
  42 #include "opal/mca/btl/base/base.h"
  43 #include "ompi/mca/bml/base/bml_base_btl.h"
  44 #include "bml_r2.h"
  45 #include "ompi/proc/proc.h"
  46 
  47 extern mca_bml_base_component_t mca_bml_r2_component;
  48 
  49 /* Names of all the BTL components that this BML is aware of */
  50 static char *btl_names = NULL;
  51 
  52 static int btl_exclusivity_compare(const void* arg1, const void* arg2)
  53 {
  54     mca_btl_base_module_t* btl1 = *(struct mca_btl_base_module_t**)arg1;
  55     mca_btl_base_module_t* btl2 = *(struct mca_btl_base_module_t**)arg2;
  56     if( btl1->btl_exclusivity > btl2->btl_exclusivity ) {
  57         return -1;
  58     } else if (btl1->btl_exclusivity == btl2->btl_exclusivity ) {
  59         return 0;
  60     } else {
  61         return 1;
  62     }
  63 }
  64 
  65 static int mca_bml_r2_add_btls( void )
  66 {
  67     int i;
  68     opal_list_t *btls = NULL;
  69     mca_btl_base_selected_module_t* selected_btl;
  70     size_t num_btls = 0;
  71     char **btl_names_argv = NULL;
  72 
  73     if(true == mca_bml_r2.btls_added) {
  74         return OMPI_SUCCESS;
  75     }
  76 
  77     /* build an array of r2s and r2 modules */
  78     btls = &mca_btl_base_modules_initialized;
  79     num_btls = opal_list_get_size(btls);
  80 
  81     mca_bml_r2.num_btl_modules = 0;
  82     mca_bml_r2.num_btl_progress = 0;
  83 
  84     mca_bml_r2.btl_modules = (mca_btl_base_module_t **)malloc(sizeof(mca_btl_base_module_t*) * num_btls);
  85     mca_bml_r2.btl_progress = (mca_btl_base_component_progress_fn_t*)malloc(sizeof(mca_btl_base_component_progress_fn_t) * num_btls);
  86 
  87     if (NULL == mca_bml_r2.btl_modules ||
  88         NULL == mca_bml_r2.btl_progress) {
  89         return OMPI_ERR_OUT_OF_RESOURCE;
  90     }
  91 
  92     OPAL_LIST_FOREACH(selected_btl, btls, mca_btl_base_selected_module_t) {
  93         mca_btl_base_module_t *btl = selected_btl->btl_module;
  94         mca_bml_r2.btl_modules[mca_bml_r2.num_btl_modules++] = btl;
  95         for (i = 0; NULL != btl_names_argv && NULL != btl_names_argv[i]; ++i) {
  96             if (0 ==
  97                 strcmp(btl_names_argv[i],
  98                        btl->btl_component->btl_version.mca_component_name)) {
  99                 break;
 100             }
 101         }
 102         if (NULL == btl_names_argv || NULL == btl_names_argv[i]) {
 103             opal_argv_append_nosize(&btl_names_argv,
 104                                     btl->btl_component->btl_version.mca_component_name);
 105         }
 106     }
 107     if (NULL != btl_names_argv) {
 108         btl_names = opal_argv_join(btl_names_argv, ' ');
 109         opal_argv_free(btl_names_argv);
 110     } else {
 111         btl_names = strdup("no devices available");
 112     }
 113 
 114     /* sort r2 list by exclusivity */
 115     qsort(mca_bml_r2.btl_modules,
 116           mca_bml_r2.num_btl_modules,
 117           sizeof(struct mca_btl_base_module_t*),
 118           btl_exclusivity_compare);
 119     mca_bml_r2.btls_added = true;
 120     return OMPI_SUCCESS;
 121 }
 122 
 123 static int btl_bandwidth_compare(const void *v1, const void *v2)
 124 {
 125     mca_bml_base_btl_t *b1 = (mca_bml_base_btl_t*)v1,
 126                        *b2 = (mca_bml_base_btl_t*)v2;
 127 
 128     return b2->btl->btl_bandwidth - b1->btl->btl_bandwidth;
 129 }
 130 
 131 static void mca_bml_r2_calculate_bandwidth_latency (mca_bml_base_btl_array_t *btl_array, double *total_bandwidth, uint32_t *latency)
 132 {
 133     const size_t array_length = mca_bml_base_btl_array_get_size (btl_array);
 134 
 135     *latency = UINT_MAX;
 136     *total_bandwidth = 0.;
 137 
 138     for (size_t i = 0 ; i < array_length ; ++i) {
 139         mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_index (btl_array, i);
 140         mca_btl_base_module_t *btl = bml_btl->btl;
 141         *total_bandwidth += btl->btl_bandwidth;
 142         if (btl->btl_latency < *latency) {
 143             *latency = btl->btl_latency;
 144         }
 145     }
 146 }
 147 
 148 static mca_bml_base_endpoint_t *mca_bml_r2_allocate_endpoint (ompi_proc_t *proc) {
 149     mca_bml_base_endpoint_t *bml_endpoint;
 150 
 151     /* allocate bml specific proc data */
 152     bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t);
 153     if (NULL == bml_endpoint) {
 154         opal_output(0, "%s: unable to allocate resources", __func__);
 155         return NULL;
 156     }
 157 
 158     /* preallocate space in array for max number of r2s */
 159     mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules);
 160     mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send,  mca_bml_r2.num_btl_modules);
 161     mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma,  mca_bml_r2.num_btl_modules);
 162     bml_endpoint->btl_max_send_size = -1;
 163     bml_endpoint->btl_proc = proc;
 164 
 165     bml_endpoint->btl_flags_or = 0;
 166     return bml_endpoint;
 167 }
 168 
 169 static void mca_bml_r2_register_progress (mca_btl_base_module_t *btl, bool hp)
 170 {
 171     if (NULL != btl->btl_component->btl_progress) {
 172         bool found = false;
 173         size_t p;
 174 
 175         for (p = 0 ; p < mca_bml_r2.num_btl_progress ; ++p) {
 176             if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) {
 177                 found = true;
 178                 break;
 179             }
 180         }
 181 
 182         if (found == false || hp) {
 183             if (found == false) {
 184                 mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress++] =
 185                     btl->btl_component->btl_progress;
 186             }
 187 
 188             if (hp) {
 189                 opal_progress_register (btl->btl_component->btl_progress);
 190             } else {
 191                 opal_progress_register_lp (btl->btl_component->btl_progress);
 192             }
 193         }
 194     }
 195 }
 196 
 197 static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_endpoint_t *bml_endpoint,
 198                                         mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *btl_endpoint)
 199 {
 200     mca_bml_base_btl_t* bml_btl = NULL;
 201     int btl_flags = btl->btl_flags;
 202     bool btl_in_use = false;
 203     size_t size;
 204 
 205     /* NTH: these flags should have been sanitized by the btl. Once that is verified these
 206      * checks can be safely removed. */
 207     if ((btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put)) {
 208         opal_output(0, "%s: The PUT flag is specified for"
 209                     " the %s BTL without any PUT function attached. Discard the flag !",
 210                     __func__,
 211                     btl->btl_component->btl_version.mca_component_name);
 212         btl_flags ^= MCA_BTL_FLAGS_PUT;
 213     }
 214     if ((btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get)) {
 215         opal_output(0, "%s: The GET flag is specified for"
 216                     " the %s BTL without any GET function attached. Discard the flag !",
 217                     __func__, btl->btl_component->btl_version.mca_component_name);
 218         btl_flags ^= MCA_BTL_FLAGS_GET;
 219     }
 220 
 221     if ((btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0) {
 222         /* If no protocol specified, we have 2 choices: we ignore the BTL
 223          * as we don't know which protocl to use, or we suppose that all
 224          * BTLs support the send protocol. This is really a btl error as
 225          * these flags should have been sanitized by the btl. */
 226         btl_flags |= MCA_BTL_FLAGS_SEND;
 227     }
 228 
 229     if (btl_flags & MCA_BTL_FLAGS_SEND) {
 230         /* dont allow an additional BTL with a lower exclusivity ranking */
 231         size = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_send);
 232         bml_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_send, size - 1);
 233 
 234         if (!bml_btl || bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity) {
 235             /* this btl has higher exclusivity than an existing btl or none exists */
 236 
 237             opal_output_verbose(1, opal_btl_base_framework.framework_output,
 238                                 "mca: bml: Using %s btl for send to %s on node %s",
 239                                 btl->btl_component->btl_version.mca_component_name,
 240                                 OMPI_NAME_PRINT(&proc->super.proc_name),
 241                                 proc->super.proc_hostname);
 242 
 243             /* cache the endpoint on the proc */
 244             if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) {
 245                 bml_btl = mca_bml_base_btl_array_insert (&bml_endpoint->btl_send);
 246                 bml_btl->btl = btl;
 247                 bml_btl->btl_endpoint = btl_endpoint;
 248                 bml_btl->btl_weight = 0;
 249                 bml_btl->btl_flags = btl_flags;
 250 
 251                 /**
 252                  * calculate the bitwise OR of the btl flags
 253                  */
 254                 bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
 255             } else {
 256                 opal_output_verbose(20, opal_btl_base_framework.framework_output,
 257                                     "mca: bml: Not using %s btl for send to %s on node %s "
 258                                     "because %s btl has higher exclusivity (%d > %d)",
 259                                     btl->btl_component->btl_version.mca_component_name,
 260                                     OMPI_NAME_PRINT(&proc->super.proc_name), proc->super.proc_hostname,
 261                                     bml_btl->btl->btl_component->btl_version.mca_component_name,
 262                                     bml_btl->btl->btl_exclusivity,
 263                                     btl->btl_exclusivity);
 264             }
 265 
 266             btl_in_use = true;
 267         }
 268     }
 269 
 270     /* always add rdma endpoints if they support full rdma */
 271     if (((btl_in_use && (btl_flags & MCA_BTL_FLAGS_RDMA)) ||
 272          (btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) == (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) &&
 273         !((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) &&
 274           (0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
 275         mca_bml_base_btl_t *bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
 276 
 277         bml_btl_rdma->btl = btl;
 278         bml_btl_rdma->btl_endpoint = btl_endpoint;
 279         bml_btl_rdma->btl_weight = 0;
 280         bml_btl_rdma->btl_flags = btl_flags;
 281 
 282         if (bml_endpoint->btl_pipeline_send_length < btl->btl_rdma_pipeline_send_length) {
 283             bml_endpoint->btl_pipeline_send_length = btl->btl_rdma_pipeline_send_length;
 284         }
 285 
 286         if (bml_endpoint->btl_send_limit < btl->btl_min_rdma_pipeline_size) {
 287             bml_endpoint->btl_send_limit = btl->btl_min_rdma_pipeline_size;
 288         }
 289 
 290         btl_in_use = true;
 291     }
 292 
 293     return btl_in_use ? OMPI_SUCCESS : OMPI_ERR_NOT_AVAILABLE;
 294 }
 295 
 296 static void mca_bml_r2_compute_endpoint_metrics (mca_bml_base_endpoint_t *bml_endpoint)
 297 {
 298     double total_bandwidth = 0;
 299     uint32_t latency;
 300     size_t n_send, n_rdma;
 301 
 302     /* (1) determine the total bandwidth available across all btls
 303      *     note that we need to do this here, as we may already have btls configured
 304      * (2) determine the highest priority ranking for latency
 305      * (3) compute the maximum amount of bytes that can be send without any
 306      *     weighting. Once the left over is smaller than this number we will
 307      *     start using the weight to compute the correct amount.
 308      */
 309     n_send = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_send);
 310     n_rdma = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
 311 
 312     /* sort BTLs in descending order according to bandwidth value */
 313     qsort (bml_endpoint->btl_send.bml_btls, n_send,
 314            sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
 315 
 316     bml_endpoint->btl_rdma_index = 0;
 317 
 318     mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_send, &total_bandwidth, &latency);
 319 
 320     /* (1) set the weight of each btl as a percentage of overall bandwidth
 321      * (2) copy all btl instances at the highest priority ranking into the
 322      *     list of btls used for first fragments
 323      */
 324     for (size_t n_index = 0 ; n_index < n_send ; ++n_index) {
 325         mca_bml_base_btl_t *bml_btl =
 326             mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
 327         mca_btl_base_module_t *btl = bml_btl->btl;
 328 
 329         /* compute weighting factor for this r2 */
 330         if(btl->btl_bandwidth > 0) {
 331             bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth);
 332         } else {
 333             bml_btl->btl_weight = (float)(1.0 / n_send);
 334         }
 335 
 336         /* check to see if this r2 is already in the array of r2s
 337          * used for first fragments - if not add it.
 338          */
 339         if(btl->btl_latency == latency) {
 340             mca_bml_base_btl_t* bml_btl_new =
 341                 mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager);
 342             *bml_btl_new = *bml_btl;
 343         }
 344 
 345         /* set endpoint max send size as min of available btls */
 346         if (bml_endpoint->btl_max_send_size > btl->btl_max_send_size)
 347             bml_endpoint->btl_max_send_size = btl->btl_max_send_size;
 348     }
 349 
 350     /* sort BTLs in descending order according to bandwidth value */
 351     qsort(bml_endpoint->btl_rdma.bml_btls, n_rdma,
 352           sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
 353 
 354     mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_rdma, &total_bandwidth, &latency);
 355 
 356     /* set rdma btl weights */
 357     for (size_t n_index = 0 ; n_index < n_rdma ; ++n_index) {
 358         mca_bml_base_btl_t *bml_btl =
 359             mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n_index);
 360 
 361         /* compute weighting factor for this r2 */
 362         if (bml_btl->btl->btl_bandwidth > 0.0) {
 363             bml_btl->btl_weight = (float)(bml_btl->btl->btl_bandwidth / total_bandwidth);
 364         } else {
 365             bml_btl->btl_weight = (float)(1.0 / n_rdma);
 366         }
 367     }
 368 }
 369 
 370 static int mca_bml_r2_add_proc (struct ompi_proc_t *proc)
 371 {
 372     mca_bml_base_endpoint_t *bml_endpoint;
 373     /* at least one btl is in use */
 374     bool btl_in_use = false;
 375     int rc;
 376 
 377     if (OPAL_UNLIKELY(NULL == proc)) {
 378         return OMPI_ERR_BAD_PARAM;
 379     }
 380 
 381     /* check if this endpoint is already set up */
 382     if (NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
 383         OBJ_RETAIN(proc);
 384         return OMPI_SUCCESS;
 385     }
 386 
 387     /* add btls if not already done */
 388     if (OMPI_SUCCESS != (rc = mca_bml_r2_add_btls())) {
 389         return rc;
 390     }
 391 
 392     bml_endpoint = mca_bml_r2_allocate_endpoint (proc);
 393     if (OPAL_UNLIKELY(NULL == bml_endpoint)) {
 394         return OMPI_ERR_OUT_OF_RESOURCE;
 395     }
 396 
 397     for (size_t p_index = 0 ; p_index < mca_bml_r2.num_btl_modules ; ++p_index) {
 398         mca_btl_base_module_t *btl = mca_bml_r2.btl_modules[p_index];
 399         struct mca_btl_base_endpoint_t *btl_endpoint = NULL;
 400 
 401         /* if the r2 can reach the destination proc it sets the
 402          * corresponding bit (proc index) in the reachable bitmap
 403          * and can return addressing information for each proc
 404          * that is passed back to the r2 on data transfer calls
 405          */
 406         rc = btl->btl_add_procs (btl, 1, (opal_proc_t **) &proc, &btl_endpoint, NULL);
 407         if (OMPI_SUCCESS != rc || NULL == btl_endpoint) {
 408             /* This BTL has troubles adding the nodes. Let's continue maybe some other BTL
 409              * can take care of this task. */
 410             continue;
 411         }
 412 
 413         rc = mca_bml_r2_endpoint_add_btl (proc, bml_endpoint, btl, btl_endpoint);
 414         if (OMPI_SUCCESS != rc) {
 415             btl->btl_del_procs (btl, 1, (opal_proc_t **) &proc, &btl_endpoint);
 416         } else {
 417             mca_bml_r2_register_progress (btl, true);
 418             btl_in_use = true;
 419         }
 420     }
 421 
 422     if (!btl_in_use) {
 423         proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL;
 424         OBJ_RELEASE(bml_endpoint);
 425         /* no btl is available for this proc */
 426         if (mca_bml_r2.show_unreach_errors) {
 427             opal_show_help ("help-mca-bml-r2.txt", "unreachable proc", true,
 428                             OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
 429                             (NULL != ompi_proc_local_proc->super.proc_hostname ?
 430                              ompi_proc_local_proc->super.proc_hostname : "unknown!"),
 431                             OMPI_NAME_PRINT(&(proc->super.proc_name)),
 432                             (NULL != proc->super.proc_hostname ?
 433                              proc->super.proc_hostname : "unknown!"),
 434                             btl_names);
 435         }
 436 
 437         return OMPI_ERR_UNREACH;
 438     }
 439 
 440     /* compute metrics for registered btls */
 441     mca_bml_r2_compute_endpoint_metrics (bml_endpoint);
 442 
 443     /* do it last, for the lazy initialization check in bml_base_get* */
 444     opal_atomic_wmb();
 445     proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint;
 446 
 447     return OMPI_SUCCESS;
 448 }
 449 
 450 /*
 451  *   For each proc setup a datastructure that indicates the BTLs
 452  *   that can be used to reach the destination.
 453  *
 454  */
 455 
 456 static int mca_bml_r2_add_procs( size_t nprocs,
 457                                  struct ompi_proc_t** procs,
 458                                  struct opal_bitmap_t* reachable )
 459 {
 460     size_t n_new_procs = 0;
 461     struct mca_btl_base_endpoint_t ** btl_endpoints = NULL;
 462     struct ompi_proc_t** new_procs = NULL;
 463     int rc, ret = OMPI_SUCCESS;
 464 
 465     if(0 == nprocs) {
 466         return OMPI_SUCCESS;
 467     }
 468 
 469     if(OMPI_SUCCESS != (rc = mca_bml_r2_add_btls()) ) {
 470         return rc;
 471     }
 472 
 473     /* Select only the procs that don't yet have the BML proc struct. This prevent
 474      * us from calling btl->add_procs several times on the same destination proc.
 475      */
 476     for (size_t p_index = 0 ; p_index < nprocs ; ++p_index) {
 477         struct ompi_proc_t* proc = procs[p_index];
 478 
 479         if(NULL !=  proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
 480             continue;  /* go to the next proc */
 481         }
 482         /* Allocate the new_procs on demand */
 483         if( NULL == new_procs ) {
 484             new_procs = (struct ompi_proc_t **)malloc(nprocs * sizeof(struct ompi_proc_t *));
 485             if( NULL == new_procs ) {
 486                 return OMPI_ERR_OUT_OF_RESOURCE;
 487             }
 488         }
 489         OBJ_RETAIN(proc);
 490         new_procs[n_new_procs++] = proc;
 491     }
 492 
 493     if ( 0 == n_new_procs ) {
 494         return OMPI_SUCCESS;
 495     }
 496 
 497     /* Starting from here we only work on the unregistered procs */
 498     procs = new_procs;
 499     nprocs = n_new_procs;
 500 
 501     /* attempt to add all procs to each r2 */
 502     btl_endpoints = (struct mca_btl_base_endpoint_t **)
 503         malloc(nprocs * sizeof(struct mca_btl_base_endpoint_t*));
 504     if (NULL == btl_endpoints) {
 505         free(new_procs);
 506         return OMPI_ERR_OUT_OF_RESOURCE;
 507     }
 508 
 509     for (size_t p_index = 0 ; p_index < mca_bml_r2.num_btl_modules ; ++p_index) {
 510         mca_btl_base_module_t *btl = mca_bml_r2.btl_modules[p_index];
 511         int btl_inuse = 0;
 512 
 513         /* if the r2 can reach the destination proc it sets the
 514          * corresponding bit (proc index) in the reachable bitmap
 515          * and can return addressing information for each proc
 516          * that is passed back to the r2 on data transfer calls
 517          */
 518         opal_bitmap_clear_all_bits(reachable);
 519         memset(btl_endpoints, 0, nprocs *sizeof(struct mca_btl_base_endpoint_t*));
 520 
 521         rc = btl->btl_add_procs(btl, n_new_procs, (opal_proc_t**)new_procs, btl_endpoints, reachable);
 522         if (OMPI_SUCCESS != rc) {
 523             /* This BTL encountered an error while adding procs. Continue in case some other
 524              * BTL(s) can be used. */
 525             continue;
 526         }
 527 
 528         /* for each proc that is reachable */
 529         for (size_t p = 0 ; p < n_new_procs ; ++p) {
 530             if (!opal_bitmap_is_set_bit(reachable, p)) {
 531                 continue;
 532             }
 533 
 534             ompi_proc_t *proc = new_procs[p];
 535             mca_bml_base_endpoint_t *bml_endpoint =
 536                 (mca_bml_base_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
 537 
 538             if (NULL == bml_endpoint) {
 539                 bml_endpoint = mca_bml_r2_allocate_endpoint (proc);
 540                 proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint;
 541                 if (NULL == bml_endpoint) {
 542                     free(btl_endpoints);
 543                     free(new_procs);
 544                     return OPAL_ERR_OUT_OF_RESOURCE;
 545                 }
 546             }
 547 
 548             rc = mca_bml_r2_endpoint_add_btl (proc, bml_endpoint, btl, btl_endpoints[p]);
 549             if (OMPI_SUCCESS != rc) {
 550                 btl->btl_del_procs(btl, 1, (opal_proc_t**)&proc, &btl_endpoints[p]);
 551                 continue;
 552             }
 553 
 554             /* This BTL is in use, allow the progress registration */
 555             btl_inuse++;
 556         }
 557 
 558         mca_bml_r2_register_progress (btl, !!(btl_inuse));
 559     }
 560 
 561     free(btl_endpoints);
 562 
 563     /* iterate back through procs and compute metrics for registered r2s */
 564     for (size_t p = 0; p < n_new_procs ; ++p) {
 565         mca_bml_base_endpoint_t *bml_endpoint =
 566             (mca_bml_base_endpoint_t *) new_procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
 567 
 568         /* skip over procs w/ no btl's registered */
 569         if (NULL != bml_endpoint) {
 570             mca_bml_r2_compute_endpoint_metrics (bml_endpoint);
 571         }
 572     }
 573 
 574     /* see if we have a connection to everyone else */
 575     for(size_t p = 0; p < n_new_procs ; ++p) {
 576         ompi_proc_t *proc = new_procs[p];
 577 
 578         if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
 579             ret = OMPI_ERR_UNREACH;
 580             if (mca_bml_r2.show_unreach_errors) {
 581                 opal_show_help("help-mca-bml-r2.txt", "unreachable proc", true,
 582                                OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
 583                                (NULL != ompi_proc_local_proc->super.proc_hostname ?
 584                                 ompi_proc_local_proc->super.proc_hostname : "unknown!"),
 585                                OMPI_NAME_PRINT(&(proc->super.proc_name)),
 586                                (NULL != proc->super.proc_hostname ?
 587                                 proc->super.proc_hostname : "unknown!"),
 588                                btl_names);
 589             }
 590 
 591             break;
 592         }
 593     }
 594 
 595     free(new_procs);
 596 
 597     return ret;
 598 }
 599 
 600 /*
 601  * iterate through each proc and notify any BTLs associated
 602  * with the proc that it is/has gone away
 603  */
 604 
 605 static int mca_bml_r2_del_procs(size_t nprocs,
 606                                 struct ompi_proc_t** procs)
 607 {
 608     for (size_t p = 0 ; p < nprocs ; ++p) {
 609         ompi_proc_t *proc = procs[p];
 610         mca_bml_base_endpoint_t *bml_endpoint =
 611             (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
 612 
 613         if (!bml_endpoint) {
 614             /* NTH: I would think this is a developer bug and should not be ignored. */
 615             continue;
 616         }
 617 
 618         /* notify each btl that the proc is going away */
 619         size_t f_size = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_send);
 620         for (size_t f_index = 0 ; f_index < f_size ; ++f_index) {
 621             mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, f_index);
 622             mca_btl_base_module_t *btl = bml_btl->btl;
 623 
 624             int rc = btl->btl_del_procs (btl, 1, (opal_proc_t **) &proc, &bml_btl->btl_endpoint);
 625             if (OPAL_SUCCESS != rc) {
 626                 return rc;
 627             }
 628 
 629             /* The reference stored in btl_eager and btl_rdma will automatically
 630              * dissapear once the btl_array destructor is called. Thus, there is
 631              * no need for extra cleaning here.
 632              */
 633         }
 634 
 635         /* some btl endpoints may only be in the btl_rdma array. call del_procs on those as well */
 636         size_t r_size = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
 637         for (size_t r_index = 0 ; r_index < r_size ; ++r_index) {
 638             mca_bml_base_btl_t *rdma_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_rdma, r_index);
 639             mca_btl_base_module_t *btl = rdma_btl->btl;
 640             bool needs_del = true;
 641 
 642             for (size_t f_index = 0 ; f_index < f_size ; ++f_index) {
 643                 mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_send, f_index);
 644                 if (bml_btl->btl_endpoint == rdma_btl->btl_endpoint) {
 645                     needs_del = false;
 646                     break;
 647                 }
 648             }
 649 
 650             if (needs_del) {
 651                 int rc = btl->btl_del_procs (btl, 1, (opal_proc_t **) &proc, &rdma_btl->btl_endpoint);
 652                 if (OPAL_SUCCESS != rc) {
 653                     return rc;
 654                 }
 655             }
 656         }
 657 
 658         proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL;
 659 
 660         /* release the bml endpoint's reference to the proc */
 661         OBJ_RELEASE(proc);
 662 
 663         /* do any required cleanup */
 664         OBJ_RELEASE(bml_endpoint);
 665     }
 666 
 667     return OMPI_SUCCESS;
 668 }
 669 
 670 static inline int bml_r2_remove_btl_progress(mca_btl_base_module_t* btl)
 671 {
 672     unsigned int p;
 673 
 674     if(NULL == btl->btl_component->btl_progress) {
 675         return OMPI_SUCCESS;
 676     }
 677     for(p = 0; p < mca_bml_r2.num_btl_progress; p++) {
 678         if(btl->btl_component->btl_progress != mca_bml_r2.btl_progress[p])
 679             continue;
 680         opal_progress_unregister( btl->btl_component->btl_progress );
 681         if( p < (mca_bml_r2.num_btl_progress-1) ) {
 682             mca_bml_r2.btl_progress[p] = mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress-1];
 683         }
 684         mca_bml_r2.num_btl_progress--;
 685         return OMPI_SUCCESS;
 686     }
 687     return OMPI_ERR_NOT_FOUND;
 688 }
 689 
 690 static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl)
 691 {
 692     mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
 693     mca_bml_base_btl_t* bml_btl;
 694     mca_btl_base_module_t* ep_btl;
 695     double total_bandwidth = 0;
 696     size_t b;
 697 
 698     if(NULL == ep)
 699         return OMPI_SUCCESS;
 700 
 701     /* remove btl from eager list */
 702     mca_bml_base_btl_array_remove(&ep->btl_eager, btl);
 703 
 704     /* remove btl from send list */
 705     if(mca_bml_base_btl_array_remove(&ep->btl_send, btl)) {
 706 
 707         /* compute total_bandwidth and
 708            reset max_send_size to the min of all btl's */
 709         total_bandwidth = 0;
 710         ep->btl_max_send_size = -1;
 711         for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
 712             bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
 713             ep_btl = bml_btl->btl;
 714 
 715             total_bandwidth += ep_btl->btl_bandwidth;
 716             if (ep->btl_max_send_size > ep_btl->btl_max_send_size) {
 717                 ep->btl_max_send_size = ep_btl->btl_max_send_size;
 718             }
 719         }
 720 
 721         /* compute weighting factor for this btl */
 722         for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
 723             bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
 724             ep_btl = bml_btl->btl;
 725 
 726             if(ep_btl->btl_bandwidth > 0) {
 727                 bml_btl->btl_weight = (float)(ep_btl->btl_bandwidth / total_bandwidth);
 728             } else {
 729                 bml_btl->btl_weight = (float)(1.0 / mca_bml_base_btl_array_get_size(&ep->btl_send));
 730             }
 731         }
 732     }
 733 
 734     /* remove btl from RDMA list */
 735     if(mca_bml_base_btl_array_remove(&ep->btl_rdma, btl)) {
 736 
 737         /* compute total bandwidth */
 738         total_bandwidth = 0;
 739         ep->btl_pipeline_send_length = 0;
 740         ep->btl_send_limit = 0;
 741         for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
 742             bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);
 743             ep_btl = bml_btl->btl;
 744 
 745             /* update aggregate endpoint info */
 746             total_bandwidth += ep_btl->btl_bandwidth;
 747             if (ep->btl_pipeline_send_length < ep_btl->btl_rdma_pipeline_send_length) {
 748                 ep->btl_pipeline_send_length = ep_btl->btl_rdma_pipeline_send_length;
 749             }
 750             if (ep->btl_send_limit < ep_btl->btl_min_rdma_pipeline_size) {
 751                 ep->btl_send_limit = ep_btl->btl_min_rdma_pipeline_size;
 752             }
 753         }
 754 
 755         /* compute weighting factor for this btl */
 756         for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
 757             bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);
 758             ep_btl = bml_btl->btl;
 759 
 760             if(ep_btl->btl_bandwidth > 0) {
 761                 bml_btl->btl_weight = (float)(ep_btl->btl_bandwidth / total_bandwidth);
 762             } else {
 763                 bml_btl->btl_weight = (float)(1.0 / mca_bml_base_btl_array_get_size(&ep->btl_rdma));
 764             }
 765         }
 766     }
 767 
 768     return OMPI_SUCCESS;
 769 }
 770 
 771 int mca_bml_r2_finalize( void )
 772 {
 773     ompi_proc_t** procs;
 774     size_t p, num_procs;
 775     opal_list_item_t* w_item;
 776 
 777     if (NULL != btl_names) {
 778         free(btl_names);
 779         btl_names = NULL;
 780     }
 781 
 782     /* Similar to mca_bml_r2_del_btl ... */
 783     procs = ompi_proc_all(&num_procs);
 784     if(NULL == procs)
 785         goto CLEANUP;
 786 
 787     for (w_item =  opal_list_get_first(&mca_btl_base_modules_initialized);
 788          w_item != opal_list_get_end(&mca_btl_base_modules_initialized);
 789          w_item =  opal_list_get_next(w_item)) {
 790         mca_btl_base_selected_module_t *sm = (mca_btl_base_selected_module_t *) w_item;
 791         mca_btl_base_module_t* btl = sm->btl_module;
 792 
 793         /* unregister the BTL progress function if any */
 794         bml_r2_remove_btl_progress(btl);
 795 
 796         /* dont use this btl for any peers */
 797         for( p = 0; p < num_procs; p++ ) {
 798             ompi_proc_t* proc = procs[p];
 799             mca_bml_r2_del_proc_btl(proc, sm->btl_module);
 800         }
 801     }
 802     /* Release the procs as the ompi_proc_all increase their ref_count */
 803     for( p = 0; p < num_procs; p++ ) {
 804         OBJ_RELEASE(procs[p]);
 805     }
 806     free(procs);
 807 
 808  CLEANUP:
 809     mca_bml_r2.num_btl_modules = 0;
 810     mca_bml_r2.num_btl_progress = 0;
 811 
 812     if( NULL != mca_bml_r2.btl_modules) {
 813         free(mca_bml_r2.btl_modules);
 814         mca_bml_r2.btl_modules = NULL;
 815     }
 816     if( NULL != mca_bml_r2.btl_progress ) {
 817         free(mca_bml_r2.btl_progress);
 818         mca_bml_r2.btl_progress = NULL;
 819     }
 820 
 821     /* Do not close the BTL base here; the BML upper layer will take
 822        care of that. */
 823 
 824     return OMPI_SUCCESS;
 825 }
 826 
 827 
 828 /*
 829  *  (1) Remove btl from each bml endpoint
 830  *  (2) Remove btl from the global list
 831  */
 832 
 833 static int mca_bml_r2_del_btl(mca_btl_base_module_t* btl)
 834 {
 835     ompi_proc_t** procs;
 836     size_t i, m, p, num_procs;
 837     opal_list_item_t* item;
 838     mca_btl_base_module_t** modules;
 839     bool found = false;
 840 
 841     if(opal_list_get_size(&mca_btl_base_modules_initialized) == 2) {
 842         opal_output(0, "only one BTL left, can't failover");
 843         return OMPI_SUCCESS;
 844     }
 845 
 846     procs = ompi_proc_all(&num_procs);
 847     if(NULL == procs)
 848         return OMPI_SUCCESS;
 849 
 850     /* Get rid of the associated progress function */
 851     bml_r2_remove_btl_progress(btl);
 852 
 853     /* dont use this btl for any peers */
 854     for( p = 0; p < num_procs; p++ ) {
 855         ompi_proc_t* proc = procs[p];
 856         mca_bml_r2_del_proc_btl(proc, btl);
 857     }
 858 
 859     /* remove from the btl list */
 860     for (item =  opal_list_get_first(&mca_btl_base_modules_initialized);
 861          item != opal_list_get_end(&mca_btl_base_modules_initialized);
 862          item =  opal_list_get_next(item)) {
 863         mca_btl_base_selected_module_t *sm = (mca_btl_base_selected_module_t *) item;
 864         if(sm->btl_module == btl) {
 865             opal_list_remove_item(&mca_btl_base_modules_initialized, item);
 866             free(sm);
 867             found = true;
 868             break;
 869         }
 870     }
 871     if(!found) {
 872         /* doesn't even exist */
 873         goto CLEANUP;
 874     }
 875     /* remove from bml list */
 876     modules = (mca_btl_base_module_t**)malloc(sizeof(mca_btl_base_module_t*) * (mca_bml_r2.num_btl_modules-1));
 877     for(i=0,m=0; i<mca_bml_r2.num_btl_modules; i++) {
 878         if(mca_bml_r2.btl_modules[i] != btl) {
 879             modules[m++] = mca_bml_r2.btl_modules[i];
 880         }
 881     }
 882     free(mca_bml_r2.btl_modules);
 883     mca_bml_r2.btl_modules = modules;
 884     mca_bml_r2.num_btl_modules = m;
 885 
 886     /* cleanup */
 887     btl->btl_finalize(btl);
 888 CLEANUP:
 889     /* Decrease the ref_count increased by the call to ompi_proc_all */
 890     for( p = 0; p < num_procs; p++ ) {
 891         OBJ_RELEASE(procs[p]);
 892     }
 893     free(procs);
 894     return OMPI_SUCCESS;
 895 }
 896 
 897 static int mca_bml_r2_add_btl(mca_btl_base_module_t* btl)
 898 {
 899     return OMPI_ERR_NOT_IMPLEMENTED;
 900 }
 901 
 902 
 903 /*
 904  *  Register callback w/ all active btls
 905  */
 906 static int mca_bml_r2_register( mca_btl_base_tag_t tag,
 907                                 mca_btl_base_module_recv_cb_fn_t cbfunc,
 908                                 void* data )
 909 {
 910     mca_btl_base_active_message_trigger[tag].cbfunc = cbfunc;
 911     mca_btl_base_active_message_trigger[tag].cbdata = data;
 912     /* Give an oportunity to the BTLs to do something special
 913      * for each registration.
 914      */
 915     {
 916         int i, rc;
 917         mca_btl_base_module_t *btl;
 918 
 919         for(i = 0; i < (int)mca_bml_r2.num_btl_modules; i++) {
 920             btl = mca_bml_r2.btl_modules[i];
 921             if( NULL == btl->btl_register )
 922                 continue;
 923             rc = btl->btl_register(btl, tag, cbfunc, data);
 924             if(OMPI_SUCCESS != rc) {
 925                 return rc;
 926             }
 927         }
 928     }
 929 
 930     return OMPI_SUCCESS;
 931 }
 932 
 933 
 934 /*
 935  *  Register an error handler with/ all active btls
 936  *   if they support error handlers..
 937  */
 938 
 939 static int mca_bml_r2_register_error( mca_btl_base_module_error_cb_fn_t  cbfunc)
 940 {
 941     uint32_t  i;
 942     int rc;
 943     mca_btl_base_module_t *btl;
 944     uint32_t ver;
 945 
 946     for(i = 0; i < mca_bml_r2.num_btl_modules; i++) {
 947         btl = mca_bml_r2.btl_modules[i];
 948         /* this wont work for version numbers greater than 256... seems
 949            reasonable.. */
 950         ver = btl->btl_component->btl_version.mca_type_major_version << 16 |
 951             btl->btl_component->btl_version.mca_type_minor_version << 8 |
 952             btl->btl_component->btl_version.mca_type_release_version;
 953         /* is version number greater than or equal to 1.0.1? */
 954         if(ver >= ((1 << 16) |  (0 << 8) | 1) &&
 955            NULL != btl->btl_register_error) {
 956             rc = btl->btl_register_error(btl, cbfunc);
 957             if(OMPI_SUCCESS != rc) {
 958                 return rc;
 959             }
 960         }
 961     }
 962     return OMPI_SUCCESS;
 963 }
 964 
 965 
 966 int mca_bml_r2_component_fini(void)
 967 {
 968     return OMPI_SUCCESS;
 969 }
 970 
 971 mca_bml_r2_module_t mca_bml_r2 = {
 972     .super = {
 973         .bml_component = &mca_bml_r2_component,
 974         .bml_add_proc = mca_bml_r2_add_proc,
 975         .bml_add_procs = mca_bml_r2_add_procs,
 976         .bml_del_procs = mca_bml_r2_del_procs,
 977         .bml_add_btl = mca_bml_r2_add_btl,
 978         .bml_del_btl = mca_bml_r2_del_btl,
 979         .bml_del_proc_btl = mca_bml_r2_del_proc_btl,
 980         .bml_register = mca_bml_r2_register,
 981         .bml_register_error = mca_bml_r2_register_error,
 982         .bml_finalize = mca_bml_r2_finalize,
 983         .bml_ft_event = mca_bml_r2_ft_event,
 984     },
 985 };
 986 

/* [<][>][^][v][top][bottom][index][help] */