root/opal/mca/btl/tcp/btl_tcp_proc.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mca_btl_tcp_proc_construct
  2. mca_btl_tcp_proc_destruct
  3. mca_btl_tcp_proc_create
  4. evaluate_assignment
  5. visit
  6. mca_btl_tcp_initialise_interface
  7. mca_btl_tcp_retrieve_local_interfaces
  8. mca_btl_tcp_proc_insert
  9. mca_btl_tcp_proc_remove
  10. mca_btl_tcp_proc_lookup
  11. mca_btl_tcp_proc_accept
  12. mca_btl_tcp_proc_tosocks

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2017 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2008-2010 Oracle and/or its affiliates.  All rights reserved
  14  * Copyright (c) 2013-2017 Intel, Inc.  All rights reserved.
  15  * Copyright (c) 2014-2016 Research Organization for Information Science
  16  *                         and Technology (RIST). All rights reserved.
  17  * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
  18  *                         reserved.
  19  * Copyright (c) 2015-2018 Cisco Systems, Inc.  All rights reserved
  20  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
  21  * $COPYRIGHT$
  22  *
  23  * Additional copyrights may follow
  24  *
  25  * $HEADER$
  26  */
  27 
  28 #include "opal_config.h"
  29 
  30 #ifdef HAVE_NETINET_IN_H
  31 #include <netinet/in.h>
  32 #endif
  33 #ifdef HAVE_ARPA_INET_H
  34 #include <arpa/inet.h>
  35 #endif
  36 
  37 #include "opal/class/opal_hash_table.h"
  38 #include "opal/mca/btl/base/btl_base_error.h"
  39 #include "opal/mca/pmix/pmix.h"
  40 #include "opal/util/arch.h"
  41 #include "opal/util/argv.h"
  42 #include "opal/util/if.h"
  43 #include "opal/util/net.h"
  44 #include "opal/util/proc.h"
  45 #include "opal/util/show_help.h"
  46 #include "opal/util/printf.h"
  47 
  48 #include "btl_tcp.h"
  49 #include "btl_tcp_proc.h"
  50 
  51 static void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* proc);
  52 static void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* proc);
  53 
  54 struct mca_btl_tcp_proc_data_t {
  55     mca_btl_tcp_interface_t** local_interfaces;
  56     opal_hash_table_t local_kindex_to_index;
  57     size_t num_local_interfaces, max_local_interfaces;
  58     size_t num_peer_interfaces;
  59     opal_hash_table_t peer_kindex_to_index;
  60     unsigned int *best_assignment;
  61     int max_assignment_weight;
  62     int max_assignment_cardinality;
  63     enum mca_btl_tcp_connection_quality **weights;
  64     struct mca_btl_tcp_addr_t ***best_addr;
  65 };
  66 
  67 typedef struct mca_btl_tcp_proc_data_t mca_btl_tcp_proc_data_t;
  68 
  69 OBJ_CLASS_INSTANCE( mca_btl_tcp_proc_t,
  70                     opal_list_item_t,
  71                     mca_btl_tcp_proc_construct,
  72                     mca_btl_tcp_proc_destruct );
  73 
  74 void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* tcp_proc)
  75 {
  76     tcp_proc->proc_opal           = NULL;
  77     tcp_proc->proc_addrs          = NULL;
  78     tcp_proc->proc_addr_count     = 0;
  79     tcp_proc->proc_endpoints      = NULL;
  80     tcp_proc->proc_endpoint_count = 0;
  81     OBJ_CONSTRUCT(&tcp_proc->proc_lock, opal_mutex_t);
  82 }
  83 
  84 /*
  85  * Cleanup ib proc instance
  86  */
  87 
  88 void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* tcp_proc)
  89 {
  90     if( NULL != tcp_proc->proc_opal ) {
  91         /* remove from list of all proc instances */
  92         OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);
  93         opal_proc_table_remove_value(&mca_btl_tcp_component.tcp_procs,
  94                                      tcp_proc->proc_opal->proc_name);
  95         OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
  96         OBJ_RELEASE(tcp_proc->proc_opal);
  97         tcp_proc->proc_opal = NULL;
  98     }
  99     /* release resources */
 100     if(NULL != tcp_proc->proc_endpoints) {
 101         free(tcp_proc->proc_endpoints);
 102     }
 103     if(NULL != tcp_proc->proc_addrs) {
 104         free(tcp_proc->proc_addrs);
 105     }
 106     OBJ_DESTRUCT(&tcp_proc->proc_lock);
 107 }
 108 
 109 /*
 110  * Create a TCP process structure. There is a one-to-one correspondence
 111  * between a opal_proc_t and a mca_btl_tcp_proc_t instance. We cache
 112  * additional data (specifically the list of mca_btl_tcp_endpoint_t instances,
 113  * and published addresses) associated w/ a given destination on this
 114  * datastructure.
 115  */
 116 
 117 mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
 118 {
 119     mca_btl_tcp_proc_t* btl_proc;
 120     int rc;
 121     mca_btl_tcp_modex_addr_t *remote_addrs = NULL;
 122     size_t i, size;
 123 
 124     OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);
 125     rc = opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs,
 126                                    proc->proc_name, (void**)&btl_proc);
 127     if (OPAL_SUCCESS == rc) {
 128         OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
 129         return btl_proc;
 130     }
 131 
 132     /* proc was not found, so create one */
 133     btl_proc = OBJ_NEW(mca_btl_tcp_proc_t);
 134     if (NULL == btl_proc) {
 135         rc = OPAL_ERR_OUT_OF_RESOURCE;
 136         goto cleanup;
 137     }
 138 
 139     /* Retain the proc, but don't store the ref into the btl_proc just yet. This
 140      * provides a way to release the btl_proc in case of failure without having to
 141      * unlock the mutex.
 142      */
 143     OBJ_RETAIN(proc);
 144 
 145     /* lookup tcp parameters exported by this proc */
 146     OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version,
 147                     &proc->proc_name, (uint8_t**)&remote_addrs, &size);
 148     if (OPAL_SUCCESS != rc) {
 149         if (OPAL_ERR_NOT_FOUND != rc) {
 150             BTL_ERROR(("opal_modex_recv: failed with return value=%d", rc));
 151         }
 152         goto cleanup;
 153     }
 154 
 155     if (0 != (size % sizeof(mca_btl_tcp_modex_addr_t))) {
 156         BTL_ERROR(("opal_modex_recv: invalid size %lu: btl-size: %lu\n",
 157                    (unsigned long)size,
 158                    (unsigned long)sizeof(mca_btl_tcp_modex_addr_t)));
 159         rc = OPAL_ERROR;
 160         goto cleanup;
 161     }
 162 
 163     btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp_modex_addr_t);
 164     btl_proc->proc_addrs = malloc(btl_proc->proc_addr_count *
 165                                   sizeof(mca_btl_tcp_addr_t));
 166     if (NULL == btl_proc->proc_addrs) {
 167         rc = OPAL_ERR_OUT_OF_RESOURCE;
 168         goto cleanup;
 169     }
 170 
 171     /* the modex and proc structures differ slightly, so copy the
 172        fields needed in the proc version */
 173     for (i = 0 ; i < btl_proc->proc_addr_count ; i++) {
 174         if (MCA_BTL_TCP_AF_INET == remote_addrs[i].addr_family) {
 175             memcpy(&btl_proc->proc_addrs[i].addr_inet,
 176                    remote_addrs[i].addr, sizeof(struct in_addr));
 177             btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port;
 178             btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex;
 179             btl_proc->proc_addrs[i].addr_family = AF_INET;
 180             btl_proc->proc_addrs[i].addr_inuse = false;
 181         } else if (MCA_BTL_TCP_AF_INET6 == remote_addrs[i].addr_family) {
 182 #if OPAL_ENABLE_IPV6
 183             memcpy(&btl_proc->proc_addrs[i].addr_inet6,
 184                    remote_addrs[i].addr, sizeof(struct in6_addr));
 185             btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port;
 186             btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex;
 187             btl_proc->proc_addrs[i].addr_family = AF_INET6;
 188             btl_proc->proc_addrs[i].addr_inuse = false;
 189 #else
 190             rc = OPAL_ERR_NOT_SUPPORTED;
 191             goto cleanup;
 192 #endif
 193         } else {
 194             BTL_ERROR(("Unexpected address family %d",
 195                        (int)remote_addrs[i].addr_family));
 196             rc = OPAL_ERR_BAD_PARAM;
 197             goto cleanup;
 198         }
 199     }
 200 
 201     free(remote_addrs);
 202 
 203     /* allocate space for endpoint array - one for each exported address */
 204     btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
 205         malloc((1 + btl_proc->proc_addr_count) *
 206                sizeof(mca_btl_base_endpoint_t*));
 207     if (NULL == btl_proc->proc_endpoints) {
 208         rc = OPAL_ERR_OUT_OF_RESOURCE;
 209         goto cleanup;
 210     }
 211 
 212 cleanup:
 213     if (OPAL_SUCCESS == rc) {
 214         btl_proc->proc_opal = proc;  /* link with the proc */
 215         /* add to hash table of all proc instance. */
 216         opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs,
 217                                   proc->proc_name, btl_proc);
 218     } else {
 219         if (btl_proc) {
 220             OBJ_RELEASE(btl_proc);  /* release the local proc */
 221             OBJ_RELEASE(proc);      /* and the ref on the OMPI proc */
 222             btl_proc = NULL;
 223         }
 224     }
 225 
 226     OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
 227 
 228     return btl_proc;
 229 }
 230 
 231 
 232 
 233 static void evaluate_assignment(mca_btl_tcp_proc_data_t *proc_data, int *a) {
 234     size_t i;
 235     unsigned int max_interfaces = proc_data->num_local_interfaces;
 236     int assignment_weight = 0;
 237     int assignment_cardinality = 0;
 238 
 239     if(max_interfaces < proc_data->num_peer_interfaces) {
 240         max_interfaces = proc_data->num_peer_interfaces;
 241     }
 242 
 243     for(i = 0; i < max_interfaces; ++i) {
 244         if(0 < proc_data->weights[i][a[i]-1]) {
 245             ++assignment_cardinality;
 246             assignment_weight += proc_data->weights[i][a[i]-1];
 247         }
 248     }
 249 
 250     /*
 251      * check wether current solution beats all previous solutions
 252      */
 253     if(assignment_cardinality > proc_data->max_assignment_cardinality
 254             || (assignment_cardinality == proc_data->max_assignment_cardinality
 255                 && assignment_weight > proc_data->max_assignment_weight)) {
 256 
 257         for(i = 0; i < max_interfaces; ++i) {
 258              proc_data->best_assignment[i] = a[i]-1;
 259         }
 260         proc_data->max_assignment_weight = assignment_weight;
 261         proc_data->max_assignment_cardinality = assignment_cardinality;
 262     }
 263 }
 264 
 265 static void visit(mca_btl_tcp_proc_data_t *proc_data, int k, int level, int siz, int *a)
 266 {
 267     level = level+1; a[k] = level;
 268 
 269     if (level == siz) {
 270         evaluate_assignment(proc_data, a);
 271     } else {
 272         int i;
 273         for ( i = 0; i < siz; i++)
 274             if (a[i] == 0)
 275                 visit(proc_data, i, level, siz, a);
 276     }
 277 
 278     level = level-1; a[k] = 0;
 279 }
 280 
 281 
 282 static void mca_btl_tcp_initialise_interface(mca_btl_tcp_interface_t* tcp_interface,
 283         int ifk_index, int index)
 284 {
 285     tcp_interface->kernel_index = ifk_index;
 286     tcp_interface->peer_interface = -1;
 287     tcp_interface->ipv4_address = NULL;
 288     tcp_interface->ipv6_address =  NULL;
 289     tcp_interface->index = index;
 290     tcp_interface->inuse = 0;
 291 }
 292 
 293 static mca_btl_tcp_interface_t** mca_btl_tcp_retrieve_local_interfaces(mca_btl_tcp_proc_data_t *proc_data)
 294 {
 295     struct sockaddr_storage local_addr;
 296     char local_if_name[IF_NAMESIZE];
 297     char **include, **exclude, **argv;
 298     int idx;
 299     mca_btl_tcp_interface_t * local_interface;
 300 
 301     assert (NULL == proc_data->local_interfaces);
 302     if( NULL != proc_data->local_interfaces )
 303         return proc_data->local_interfaces;
 304 
 305     proc_data->max_local_interfaces = MAX_KERNEL_INTERFACES;
 306     proc_data->num_local_interfaces = 0;
 307     proc_data->local_interfaces = (mca_btl_tcp_interface_t**)calloc( proc_data->max_local_interfaces, sizeof(mca_btl_tcp_interface_t*) );
 308     if( NULL == proc_data->local_interfaces )
 309         return NULL;
 310 
 311     /* Collect up the list of included and excluded interfaces, if any */
 312     include = opal_argv_split(mca_btl_tcp_component.tcp_if_include,',');
 313     exclude = opal_argv_split(mca_btl_tcp_component.tcp_if_exclude,',');
 314 
 315     /*
 316      * identify all kernel interfaces and the associated addresses of
 317      * the local node
 318      */
 319     for( idx = opal_ifbegin(); idx >= 0; idx = opal_ifnext (idx) ) {
 320         int kindex;
 321         uint64_t index;
 322         bool skip = false;
 323 
 324         opal_ifindextoaddr (idx, (struct sockaddr*) &local_addr, sizeof (local_addr));
 325         opal_ifindextoname (idx, local_if_name, sizeof (local_if_name));
 326 
 327         /* If we were given a list of included interfaces, then check
 328          * to see if the current one is a member of this set.  If so,
 329          * drop down and complete processing.  If not, skip it and
 330          * continue on to the next one.  Note that providing an include
 331          * list will override providing an exclude list as the two are
 332          * mutually exclusive.  This matches how it works in
 333          * mca_btl_tcp_component_create_instances() which is the function
 334          * that exports the interfaces.  */
 335         if(NULL != include) {
 336             argv = include;
 337             skip = true;
 338             while(argv && *argv) {
 339                 /* When comparing included interfaces, we look for exact matches.
 340                    That is why we are using strcmp() here. */
 341                 if (0 == strcmp(*argv, local_if_name)) {
 342                     skip = false;
 343                     break;
 344                 }
 345                 argv++;
 346             }
 347         } else if (NULL != exclude) {
 348             /* If we were given a list of excluded interfaces, then check to see if the
 349              * current one is a member of this set.  If not, drop down and complete
 350              * processing.  If so, skip it and continue on to the next one. */
 351             argv = exclude;
 352             while(argv && *argv) {
 353                 /* When looking for interfaces to exclude, we only look at
 354                  * the number of characters equal to what the user provided.
 355                  * For example, excluding "lo" excludes "lo", "lo0" and
 356                  * anything that starts with "lo" */
 357                 if(0 == strncmp(*argv, local_if_name, strlen(*argv))) {
 358                     skip = true;
 359                     break;
 360                 }
 361                 argv++;
 362             }
 363         }
 364         if (true == skip) {
 365             /* This interface is not part of the requested set, so skip it */
 366             continue;
 367         }
 368 
 369         kindex = opal_ifindextokindex(idx);
 370         int rc = opal_hash_table_get_value_uint32(&proc_data->local_kindex_to_index, kindex, (void**) &index);
 371 
 372         /* create entry for this kernel index previously not seen */
 373         if (OPAL_SUCCESS != rc) {
 374             index = proc_data->num_local_interfaces++;
 375             opal_hash_table_set_value_uint32(&proc_data->local_kindex_to_index, kindex, (void*)(uintptr_t) index);
 376 
 377             if( proc_data->num_local_interfaces == proc_data->max_local_interfaces ) {
 378                 proc_data->max_local_interfaces <<= 1;
 379                 proc_data->local_interfaces = (mca_btl_tcp_interface_t**)realloc( proc_data->local_interfaces,
 380                                                                                   proc_data->max_local_interfaces * sizeof(mca_btl_tcp_interface_t*) );
 381                 if( NULL == proc_data->local_interfaces )
 382                     goto cleanup;
 383             }
 384             proc_data->local_interfaces[index] = (mca_btl_tcp_interface_t *) malloc(sizeof(mca_btl_tcp_interface_t));
 385             assert(NULL != proc_data->local_interfaces[index]);
 386             mca_btl_tcp_initialise_interface(proc_data->local_interfaces[index], kindex, index);
 387         }
 388 
 389         local_interface = proc_data->local_interfaces[index];
 390         switch(local_addr.ss_family) {
 391         case AF_INET:
 392             /* if AF is disabled, skip it completely */
 393             if (4 == mca_btl_tcp_component.tcp_disable_family) {
 394                 continue;
 395             }
 396 
 397             local_interface->ipv4_address =
 398                 (struct sockaddr_storage*) malloc(sizeof(local_addr));
 399             memcpy(local_interface->ipv4_address,
 400                    &local_addr, sizeof(local_addr));
 401             opal_ifindextomask(idx,
 402                                &local_interface->ipv4_netmask,
 403                                sizeof(int));
 404             break;
 405         case AF_INET6:
 406             /* if AF is disabled, skip it completely */
 407             if (6 == mca_btl_tcp_component.tcp_disable_family) {
 408                 continue;
 409             }
 410 
 411             local_interface->ipv6_address
 412                 = (struct sockaddr_storage*) malloc(sizeof(local_addr));
 413             memcpy(local_interface->ipv6_address,
 414                    &local_addr, sizeof(local_addr));
 415             opal_ifindextomask(idx,
 416                                &local_interface->ipv6_netmask,
 417                                sizeof(int));
 418             break;
 419         default:
 420             opal_output(0, "unknown address family for tcp: %d\n",
 421                         local_addr.ss_family);
 422         }
 423     }
 424 cleanup:
 425     if (NULL != include) {
 426         opal_argv_free(include);
 427     }
 428     if (NULL != exclude) {
 429         opal_argv_free(exclude);
 430     }
 431 
 432     return proc_data->local_interfaces;
 433 }
 434 /*
 435  * Note that this routine must be called with the lock on the process
 436  * already held.  Insert a btl instance into the proc array and assign
 437  * it an address.
 438  */
 439 int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
 440                              mca_btl_base_endpoint_t* btl_endpoint )
 441 {
 442     struct sockaddr_storage endpoint_addr_ss;
 443     const char *proc_hostname;
 444     unsigned int perm_size = 0;
 445     int rc, *a = NULL;
 446     size_t i, j;
 447     mca_btl_tcp_interface_t** peer_interfaces = NULL;
 448     mca_btl_tcp_proc_data_t _proc_data, *proc_data=&_proc_data;
 449     size_t max_peer_interfaces;
 450     char str_local[128], str_remote[128];
 451 
 452     if (NULL == (proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal))) {
 453         return OPAL_ERR_UNREACH;
 454     }
 455 
 456     memset(proc_data, 0, sizeof(mca_btl_tcp_proc_data_t));
 457     OBJ_CONSTRUCT(&_proc_data.local_kindex_to_index, opal_hash_table_t);
 458     opal_hash_table_init(&_proc_data.local_kindex_to_index, 8);
 459     OBJ_CONSTRUCT(&_proc_data.peer_kindex_to_index, opal_hash_table_t);
 460     opal_hash_table_init(&_proc_data.peer_kindex_to_index, 8);
 461 
 462 #ifndef WORDS_BIGENDIAN
 463     /* if we are little endian and our peer is not so lucky, then we
 464        need to put all information sent to him in big endian (aka
 465        Network Byte Order) and expect all information received to
 466        be in NBO.  Since big endian machines always send and receive
 467        in NBO, we don't care so much about that case. */
 468     if (btl_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) {
 469         btl_endpoint->endpoint_nbo = true;
 470     }
 471 #endif
 472 
 473     /* insert into endpoint array */
 474     btl_endpoint->endpoint_proc = btl_proc;
 475     btl_proc->proc_endpoints[btl_proc->proc_endpoint_count++] = btl_endpoint;
 476 
 477     /* sanity checks */
 478     if( NULL == mca_btl_tcp_retrieve_local_interfaces(proc_data) )
 479         return OPAL_ERR_OUT_OF_RESOURCE;
 480     if( 0 == proc_data->num_local_interfaces ) {
 481         return OPAL_ERR_UNREACH;
 482     }
 483 
 484     max_peer_interfaces = proc_data->max_local_interfaces;
 485     peer_interfaces = (mca_btl_tcp_interface_t**)calloc( max_peer_interfaces, sizeof(mca_btl_tcp_interface_t*) );
 486     if (NULL == peer_interfaces) {
 487         max_peer_interfaces = 0;
 488         rc = OPAL_ERR_OUT_OF_RESOURCE;
 489         goto exit;
 490     }
 491     proc_data->num_peer_interfaces = 0;
 492 
 493     /*
 494      * identify all kernel interfaces and the associated addresses of
 495      * the peer
 496      */
 497 
 498     for( i = 0; i < btl_proc->proc_addr_count; i++ ) {
 499 
 500         uint64_t index;
 501 
 502         mca_btl_tcp_addr_t* endpoint_addr = btl_proc->proc_addrs + i;
 503 
 504         mca_btl_tcp_proc_tosocks (endpoint_addr, &endpoint_addr_ss);
 505 
 506         rc = opal_hash_table_get_value_uint32(&proc_data->peer_kindex_to_index, endpoint_addr->addr_ifkindex, (void**) &index);
 507 
 508         if (OPAL_SUCCESS != rc) {
 509             index = proc_data->num_peer_interfaces++;
 510             opal_hash_table_set_value_uint32(&proc_data->peer_kindex_to_index, endpoint_addr->addr_ifkindex, (void*)(uintptr_t) index);
 511             if( proc_data->num_peer_interfaces == max_peer_interfaces ) {
 512                 max_peer_interfaces <<= 1;
 513                 peer_interfaces = (mca_btl_tcp_interface_t**)realloc( peer_interfaces,
 514                                                                       max_peer_interfaces * sizeof(mca_btl_tcp_interface_t*) );
 515                 if( NULL == peer_interfaces ) {
 516                     return OPAL_ERR_OUT_OF_RESOURCE;
 517                 }
 518             }
 519             peer_interfaces[index] = (mca_btl_tcp_interface_t *) malloc(sizeof(mca_btl_tcp_interface_t));
 520             mca_btl_tcp_initialise_interface(peer_interfaces[index],
 521                                              endpoint_addr->addr_ifkindex, index);
 522         }
 523 
 524         /*
 525          * in case the peer address has created all intended connections,
 526          * mark the complete peer interface as 'not available'
 527          */
 528         if(endpoint_addr->addr_inuse >=  mca_btl_tcp_component.tcp_num_links) {
 529             peer_interfaces[index]->inuse = 1;
 530         }
 531 
 532         switch(endpoint_addr_ss.ss_family) {
 533         case AF_INET:
 534             peer_interfaces[index]->ipv4_address = (struct sockaddr_storage*) malloc(sizeof(endpoint_addr_ss));
 535             peer_interfaces[index]->ipv4_endpoint_addr = endpoint_addr;
 536             memcpy(peer_interfaces[index]->ipv4_address,
 537                    &endpoint_addr_ss, sizeof(endpoint_addr_ss));
 538             break;
 539         case AF_INET6:
 540             peer_interfaces[index]->ipv6_address = (struct sockaddr_storage*) malloc(sizeof(endpoint_addr_ss));
 541             peer_interfaces[index]->ipv6_endpoint_addr = endpoint_addr;
 542             memcpy(peer_interfaces[index]->ipv6_address,
 543                    &endpoint_addr_ss, sizeof(endpoint_addr_ss));
 544             break;
 545         default:
 546             opal_output(0, "unknown address family for tcp: %d\n",
 547                         endpoint_addr_ss.ss_family);
 548             return OPAL_ERR_UNREACH;
 549         }
 550     }
 551 
 552     /*
 553      * assign weights to each possible pair of interfaces
 554      */
 555 
 556     perm_size = proc_data->num_local_interfaces;
 557     if(proc_data->num_peer_interfaces > perm_size) {
 558         perm_size = proc_data->num_peer_interfaces;
 559     }
 560 
 561     proc_data->weights = (enum mca_btl_tcp_connection_quality**) malloc(perm_size
 562                                                              * sizeof(enum mca_btl_tcp_connection_quality*));
 563     assert(NULL != proc_data->weights);
 564 
 565     proc_data->best_addr = (mca_btl_tcp_addr_t ***) malloc(perm_size
 566                                                 * sizeof(mca_btl_tcp_addr_t **));
 567     assert(NULL != proc_data->best_addr);
 568     for(i = 0; i < perm_size; ++i) {
 569         proc_data->weights[i] = (enum mca_btl_tcp_connection_quality*) calloc(perm_size,
 570                                                                    sizeof(enum mca_btl_tcp_connection_quality));
 571         assert(NULL != proc_data->weights[i]);
 572 
 573         proc_data->best_addr[i] = (mca_btl_tcp_addr_t **) calloc(perm_size,
 574                                                       sizeof(mca_btl_tcp_addr_t *));
 575         assert(NULL != proc_data->best_addr[i]);
 576     }
 577 
 578 
 579     for( i = 0; i < proc_data->num_local_interfaces; ++i ) {
 580         mca_btl_tcp_interface_t* local_interface = proc_data->local_interfaces[i];
 581         for( j = 0; j < proc_data->num_peer_interfaces; ++j ) {
 582 
 583             /*  initially, assume no connection is possible */
 584             proc_data->weights[i][j] = CQ_NO_CONNECTION;
 585 
 586             /* check state of ipv4 address pair */
 587             if(NULL != proc_data->local_interfaces[i]->ipv4_address &&
 588                NULL != peer_interfaces[j]->ipv4_address) {
 589 
 590                 /* Convert the IPv4 addresses into nicely-printable strings for verbose debugging output */
 591                 inet_ntop(AF_INET, &(((struct sockaddr_in*) proc_data->local_interfaces[i]->ipv4_address))->sin_addr,
 592                           str_local, sizeof(str_local));
 593                 inet_ntop(AF_INET, &(((struct sockaddr_in*) peer_interfaces[j]->ipv4_address))->sin_addr,
 594                           str_remote, sizeof(str_remote));
 595 
 596                 if(opal_net_addr_isipv4public((struct sockaddr*) local_interface->ipv4_address) &&
 597                    opal_net_addr_isipv4public((struct sockaddr*) peer_interfaces[j]->ipv4_address)) {
 598                     if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv4_address,
 599                                             (struct sockaddr*) peer_interfaces[j]->ipv4_address,
 600                                             local_interface->ipv4_netmask)) {
 601                         proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK;
 602                         opal_output_verbose(20, opal_btl_base_framework.framework_output,
 603                                             "btl:tcp: path from %s to %s: IPV4 PUBLIC SAME NETWORK",
 604                                             str_local, str_remote);
 605                     } else {
 606                         proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK;
 607                         opal_output_verbose(20, opal_btl_base_framework.framework_output,
 608                                             "btl:tcp: path from %s to %s: IPV4 PUBLIC DIFFERENT NETWORK",
 609                                             str_local, str_remote);
 610                     }
 611                     proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
 612                     continue;
 613                 }
 614                 if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv4_address,
 615                                         (struct sockaddr*) peer_interfaces[j]->ipv4_address,
 616                                         local_interface->ipv4_netmask)) {
 617                     proc_data->weights[i][j] = CQ_PRIVATE_SAME_NETWORK;
 618                     opal_output_verbose(20, opal_btl_base_framework.framework_output,
 619                                        "btl:tcp: path from %s to %s: IPV4 PRIVATE SAME NETWORK",
 620                                        str_local, str_remote);
 621                 } else {
 622                     proc_data->weights[i][j] = CQ_PRIVATE_DIFFERENT_NETWORK;
 623                     opal_output_verbose(20, opal_btl_base_framework.framework_output,
 624                                        "btl:tcp: path from %s to %s: IPV4 PRIVATE DIFFERENT NETWORK",
 625                                        str_local, str_remote);
 626                 }
 627                 proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
 628                 continue;
 629             }
 630 
 631             /* check state of ipv6 address pair - ipv6 is always public,
 632              * since link-local addresses are skipped in opal_ifinit()
 633              */
 634             if(NULL != local_interface->ipv6_address &&
 635                NULL != peer_interfaces[j]->ipv6_address) {
 636 
 637                 /* Convert the IPv6 addresses into nicely-printable strings for verbose debugging output */
 638                 inet_ntop(AF_INET6, &(((struct sockaddr_in6*) local_interface->ipv6_address))->sin6_addr,
 639                           str_local, sizeof(str_local));
 640                 inet_ntop(AF_INET6, &(((struct sockaddr_in6*) peer_interfaces[j]->ipv6_address))->sin6_addr,
 641                           str_remote, sizeof(str_remote));
 642 
 643                 if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv6_address,
 644                                          (struct sockaddr*) peer_interfaces[j]->ipv6_address,
 645                                          local_interface->ipv6_netmask)) {
 646                     proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK;
 647                     opal_output_verbose(20, opal_btl_base_framework.framework_output,
 648                                        "btl:tcp: path from %s to %s: IPV6 PUBLIC SAME NETWORK",
 649                                        str_local, str_remote);
 650                 } else {
 651                     proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK;
 652                     opal_output_verbose(20, opal_btl_base_framework.framework_output,
 653                                        "btl:tcp: path from %s to %s: IPV6 PUBLIC DIFFERENT NETWORK",
 654                                        str_local, str_remote);
 655                 }
 656                 proc_data->best_addr[i][j] = peer_interfaces[j]->ipv6_endpoint_addr;
 657                 continue;
 658             }
 659 
 660         } /* for each peer interface */
 661     } /* for each local interface */
 662 
 663     /*
 664      * determine the size of the set to permute (max number of
 665      * interfaces
 666      */
 667 
 668     proc_data->best_assignment = (unsigned int *) malloc (perm_size * sizeof(int));
 669 
 670     a = (int *) malloc(perm_size * sizeof(int));
 671     if (NULL == a) {
 672         rc = OPAL_ERR_OUT_OF_RESOURCE;
 673         goto exit;
 674     }
 675 
 676     /* Can only find the best set of connections when the number of
 677      * interfaces is not too big.  When it gets larger, we fall back
 678      * to a simpler and faster (and not as optimal) algorithm.
 679      * See ticket https://svn.open-mpi.org/trac/ompi/ticket/2031
 680      * for more details about this issue.  */
 681     if (perm_size <= MAX_PERMUTATION_INTERFACES) {
 682         memset(a, 0, perm_size * sizeof(int));
 683         proc_data->max_assignment_cardinality = -1;
 684         proc_data->max_assignment_weight = -1;
 685         visit(proc_data, 0, -1, perm_size, a);
 686 
 687         rc = OPAL_ERR_UNREACH;
 688         for(i = 0; i < perm_size; ++i) {
 689             unsigned int best = proc_data->best_assignment[i];
 690             if(best > proc_data->num_peer_interfaces
 691                || proc_data->weights[i][best] == CQ_NO_CONNECTION
 692                || peer_interfaces[best]->inuse
 693                || NULL == peer_interfaces[best]) {
 694                 continue;
 695             }
 696             peer_interfaces[best]->inuse++;
 697             btl_endpoint->endpoint_addr = proc_data->best_addr[i][best];
 698             btl_endpoint->endpoint_addr->addr_inuse = true;
 699             rc = OPAL_SUCCESS;
 700             break;
 701         }
 702     } else {
 703         enum mca_btl_tcp_connection_quality max;
 704         int i_max = 0, j_max = 0;
 705         /* Find the best connection that is not in use.  Save away
 706          * the indices of the best location. */
 707         max = CQ_NO_CONNECTION;
 708         for(i=0; i<proc_data->num_local_interfaces; ++i) {
 709             for(j=0; j<proc_data->num_peer_interfaces; ++j) {
 710                 if (!peer_interfaces[j]->inuse) {
 711                     if (proc_data->weights[i][j] > max) {
 712                         max = proc_data->weights[i][j];
 713                         i_max = i;
 714                         j_max = j;
 715                     }
 716                 }
 717             }
 718         }
 719         /* Now see if there is a some type of connection available. */
 720         rc = OPAL_ERR_UNREACH;
 721         if (CQ_NO_CONNECTION != max) {
 722             peer_interfaces[j_max]->inuse++;
 723             btl_endpoint->endpoint_addr = proc_data->best_addr[i_max][j_max];
 724             btl_endpoint->endpoint_addr->addr_inuse = true;
 725             rc = OPAL_SUCCESS;
 726         }
 727     }
 728     if (OPAL_ERR_UNREACH == rc) {
 729         opal_output_verbose(10, opal_btl_base_framework.framework_output,
 730                             "btl:tcp: host %s, process %s UNREACHABLE",
 731                             proc_hostname,
 732                             OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
 733     }
 734 
 735  exit:
 736     // Ok to always free because proc_data() was memset() to 0 before
 737     // any possible return (and free(NULL) is fine).
 738     for(i = 0; i < perm_size; ++i) {
 739         free(proc_data->weights[i]);
 740         free(proc_data->best_addr[i]);
 741     }
 742 
 743     for(i = 0; i < proc_data->num_peer_interfaces; ++i) {
 744         if(NULL != peer_interfaces[i]->ipv4_address) {
 745             free(peer_interfaces[i]->ipv4_address);
 746         }
 747         if(NULL != peer_interfaces[i]->ipv6_address) {
 748             free(peer_interfaces[i]->ipv6_address);
 749         }
 750         free(peer_interfaces[i]);
 751     }
 752     free(peer_interfaces);
 753 
 754     for(i = 0; i < proc_data->num_local_interfaces; ++i) {
 755         if(NULL != proc_data->local_interfaces[i]->ipv4_address) {
 756             free(proc_data->local_interfaces[i]->ipv4_address);
 757         }
 758         if(NULL != proc_data->local_interfaces[i]->ipv6_address) {
 759             free(proc_data->local_interfaces[i]->ipv6_address);
 760         }
 761         free(proc_data->local_interfaces[i]);
 762     }
 763     free(proc_data->local_interfaces); proc_data->local_interfaces = NULL;
 764     proc_data->max_local_interfaces = 0;
 765 
 766     free(proc_data->weights); proc_data->weights = NULL;
 767     free(proc_data->best_addr); proc_data->best_addr = NULL;
 768     free(proc_data->best_assignment); proc_data->best_assignment = NULL;
 769 
 770     OBJ_DESTRUCT(&_proc_data.local_kindex_to_index);
 771     OBJ_DESTRUCT(&_proc_data.peer_kindex_to_index);
 772 
 773     free(a);
 774 
 775     return rc;
 776 }
 777 
 778 /*
 779  * Remove an endpoint from the proc array and indicate the address is
 780  * no longer in use.
 781  */
 782 
 783 int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t* btl_proc, mca_btl_base_endpoint_t* btl_endpoint)
 784 {
 785     size_t i;
 786     if (NULL != btl_proc) {
 787         OPAL_THREAD_LOCK(&btl_proc->proc_lock);
 788         for(i = 0; i < btl_proc->proc_endpoint_count; i++) {
 789             if(btl_proc->proc_endpoints[i] == btl_endpoint) {
 790                 memmove(btl_proc->proc_endpoints+i, btl_proc->proc_endpoints+i+1,
 791                         (btl_proc->proc_endpoint_count-i-1)*sizeof(mca_btl_base_endpoint_t*));
 792                 if(--btl_proc->proc_endpoint_count == 0) {
 793                     OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
 794                     OBJ_RELEASE(btl_proc);
 795                     return OPAL_SUCCESS;
 796                 }
 797                 /* The endpoint_addr may still be NULL if this endpoint is
 798                    being removed early in the wireup sequence (e.g., if it
 799                    is unreachable by all other procs) */
 800                 if (NULL != btl_endpoint->endpoint_addr) {
 801                     btl_endpoint->endpoint_addr->addr_inuse = false;
 802                 }
 803                 break;
 804             }
 805         }
 806         OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
 807     }
 808     return OPAL_SUCCESS;
 809 }
 810 
 811 /*
 812  * Look for an existing TCP process instance based on the globally unique
 813  * process identifier.
 814  */
 815 mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const opal_process_name_t *name)
 816 {
 817     mca_btl_tcp_proc_t* proc = NULL;
 818 
 819     OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);
 820     opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs,
 821                               *name, (void**)&proc);
 822     OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
 823     if (OPAL_UNLIKELY(NULL == proc)) {
 824         mca_btl_base_endpoint_t *endpoint;
 825         opal_proc_t *opal_proc;
 826 
 827         BTL_VERBOSE(("adding tcp proc for unknown peer {%s}",
 828                      OPAL_NAME_PRINT(*name)));
 829 
 830         opal_proc = opal_proc_for_name (*name);
 831         if (NULL == opal_proc) {
 832             return NULL;
 833         }
 834 
 835         /* try adding this proc to each btl until */
 836         for( uint32_t i = 0; i < mca_btl_tcp_component.tcp_num_btls; ++i ) {
 837             endpoint = NULL;
 838             (void) mca_btl_tcp_add_procs (&mca_btl_tcp_component.tcp_btls[i]->super, 1, &opal_proc,
 839                                           &endpoint, NULL);
 840             if (NULL != endpoint && NULL == proc) {
 841                 /* construct all the endpoints and get the proc */
 842                 proc = endpoint->endpoint_proc;
 843             }
 844         }
 845     }
 846 
 847     return proc;
 848 }
 849 
 850 /*
 851  * loop through all available BTLs for one matching the source address
 852  * of the request.
 853  */
 854 void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr, int sd)
 855 {
 856     OPAL_THREAD_LOCK(&btl_proc->proc_lock);
 857     int found_match = 0;
 858     mca_btl_base_endpoint_t* match_btl_endpoint;
 859 
 860     for( size_t i = 0; i < btl_proc->proc_endpoint_count; i++ ) {
 861         mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i];
 862         /* We are not here to make a decision about what is good socket
 863          * and what is not. We simply check that this socket fit the endpoint
 864          * end we prepare for the real decision function mca_btl_tcp_endpoint_accept. */
 865         if( btl_endpoint->endpoint_addr->addr_family != addr->sa_family) {
 866             continue;
 867         }
 868         switch (addr->sa_family) {
 869         case AF_INET:
 870             if( memcmp( &btl_endpoint->endpoint_addr->addr_inet,
 871                         &(((struct sockaddr_in*)addr)->sin_addr),
 872                         sizeof(struct in_addr) ) ) {
 873                 char tmp[2][16];
 874                 opal_output_verbose(20, opal_btl_base_framework.framework_output,
 875                                     "btl: tcp: Match incoming connection from %s %s with locally known IP %s failed (iface %d/%d)!\n",
 876                                     OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name),
 877                                     inet_ntop(AF_INET, (void*)&((struct sockaddr_in*)addr)->sin_addr,
 878                                               tmp[0], 16),
 879                                     inet_ntop(AF_INET, (void*)(struct in_addr*)&btl_endpoint->endpoint_addr->addr_inet,
 880                                               tmp[1], 16),
 881                                     (int)i, (int)btl_proc->proc_endpoint_count);
 882                 continue;
 883             } else if (btl_endpoint->endpoint_state != MCA_BTL_TCP_CLOSED) {
 884                  found_match = 1;
 885                  match_btl_endpoint = btl_endpoint;
 886                  continue;
 887             }
 888             break;
 889 #if OPAL_ENABLE_IPV6
 890         case AF_INET6:
 891             if( memcmp( &btl_endpoint->endpoint_addr->addr_inet,
 892                         &(((struct sockaddr_in6*)addr)->sin6_addr),
 893                         sizeof(struct in6_addr) ) ) {
 894                 char tmp[2][INET6_ADDRSTRLEN];
 895                 opal_output_verbose(20, opal_btl_base_framework.framework_output,
 896                                     "btl: tcp: Match incoming connection from %s %s with locally known IP %s failed (iface %d/%d)!\n",
 897                                     OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name),
 898                                     inet_ntop(AF_INET6, (void*)&((struct sockaddr_in6*)addr)->sin6_addr,
 899                                               tmp[0], INET6_ADDRSTRLEN),
 900                                     inet_ntop(AF_INET6, (void*)(struct in6_addr*)&btl_endpoint->endpoint_addr->addr_inet,
 901                                               tmp[1], INET6_ADDRSTRLEN),
 902                                     (int)i, (int)btl_proc->proc_endpoint_count);
 903                 continue;
 904             } else if (btl_endpoint->endpoint_state != MCA_BTL_TCP_CLOSED) {
 905                  found_match = 1;
 906                  match_btl_endpoint = btl_endpoint;
 907                  continue;
 908             }
 909             break;
 910 #endif
 911         default:
 912             ;
 913         }
 914 
 915         /* Set state to CONNECTING to ensure that subsequent conenctions do not attempt to re-use endpoint in the num_links > 1 case*/
 916         btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING;
 917         (void)mca_btl_tcp_endpoint_accept(btl_endpoint, addr, sd);
 918         OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
 919         return;
 920     }
 921     /* In this case the connection was inbound to an address exported, but was not in a CLOSED state.
 922      * mca_btl_tcp_endpoint_accept() has logic to deal with the race condition that has likely caused this
 923      * scenario, so call it here.*/
 924     if (found_match) {
 925         (void)mca_btl_tcp_endpoint_accept(match_btl_endpoint, addr, sd);
 926         OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
 927         return;
 928     }
 929     /* No further use of this socket. Close it */
 930     CLOSE_THE_SOCKET(sd);
 931     {
 932         char *addr_str = NULL, *tmp;
 933         char ip[128];
 934         ip[sizeof(ip) - 1] = '\0';
 935 
 936         for (size_t i = 0; i < btl_proc->proc_endpoint_count; i++) {
 937             mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i];
 938             if (btl_endpoint->endpoint_addr->addr_family != addr->sa_family) {
 939                 continue;
 940             }
 941             inet_ntop(btl_endpoint->endpoint_addr->addr_family,
 942                       (void*) &(btl_endpoint->endpoint_addr->addr_inet),
 943                       ip, sizeof(ip) - 1);
 944             if (NULL == addr_str) {
 945                 opal_asprintf(&tmp, "\n\t%s", ip);
 946             } else {
 947                 opal_asprintf(&tmp, "%s\n\t%s", addr_str, ip);
 948                 free(addr_str);
 949             }
 950             addr_str = tmp;
 951         }
 952         opal_show_help("help-mpi-btl-tcp.txt", "dropped inbound connection",
 953                        true, opal_process_info.nodename,
 954                        getpid(),
 955                        btl_proc->proc_opal->proc_hostname,
 956                        OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name),
 957                        opal_net_get_hostname((struct sockaddr*)addr),
 958                        btl_proc->proc_endpoint_count,
 959                        (NULL == addr_str) ? "NONE" : addr_str);
 960         if (NULL != addr_str) {
 961             free(addr_str);
 962         }
 963     }
 964     OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
 965 }
 966 
 967 /*
 968  * convert internal data structure (mca_btl_tcp_addr_t) to sockaddr_storage
 969  *
 970  */
 971 bool mca_btl_tcp_proc_tosocks(mca_btl_tcp_addr_t* proc_addr,
 972                               struct sockaddr_storage* output)
 973 {
 974     memset(output, 0, sizeof (*output));
 975     switch (proc_addr->addr_family) {
 976     case AF_INET:
 977         output->ss_family = AF_INET;
 978         memcpy(&((struct sockaddr_in*)output)->sin_addr,
 979                &proc_addr->addr_inet, sizeof(struct in_addr));
 980         ((struct sockaddr_in*)output)->sin_port = proc_addr->addr_port;
 981         break;
 982 #if OPAL_ENABLE_IPV6
 983     case AF_INET6:
 984         {
 985             struct sockaddr_in6* inaddr = (struct sockaddr_in6*)output;
 986             output->ss_family = AF_INET6;
 987             memcpy(&inaddr->sin6_addr, &proc_addr->addr_inet,
 988                    sizeof (proc_addr->addr_inet));
 989             inaddr->sin6_port = proc_addr->addr_port;
 990             inaddr->sin6_scope_id = 0;
 991             inaddr->sin6_flowinfo = 0;
 992         }
 993         break;
 994 #endif
 995     default:
 996         opal_output( 0, "mca_btl_tcp_proc: unknown af_family received: %d\n",
 997                      proc_addr->addr_family );
 998         return false;
 999     }
1000     return true;
1001 }
1002 

/* [<][>][^][v][top][bottom][index][help] */