root/opal/mca/btl/usnic/btl_usnic_cclient.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. opal_btl_usnic_connectivity_client_init
  2. opal_btl_usnic_connectivity_listen
  3. opal_btl_usnic_connectivity_ping
  4. opal_btl_usnic_connectivity_unlisten
  5. opal_btl_usnic_connectivity_client_finalize

   1 /*
   2  * Copyright (c) 2014-2016 Cisco Systems, Inc.  All rights reserved.
   3  * Copyright (c) 2015      Research Organization for Information Science
   4  *                         and Technology (RIST). All rights reserved.
   5  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
   6  * $COPYRIGHT$
   7  *
   8  * Additional copyrights may follow
   9  *
  10  * $HEADER$
  11  */
  12 
#include "opal_config.h"

#include <assert.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/types.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#ifdef HAVE_ALLOCA_H
#include <alloca.h>
#endif
#include <time.h>

#include "opal_stdint.h"
#include "opal/threads/mutex.h"
#include "opal/mca/event/event.h"
#include "opal/util/output.h"
#include "opal/util/fd.h"
#include "opal/util/string_copy.h"
#include "opal/util/printf.h"

#include "btl_usnic.h"
#include "btl_usnic_module.h"
#include "btl_usnic_connectivity.h"
  38 
  39 /**************************************************************************
  40  * Client-side data and methods
  41  **************************************************************************/
  42 
  43 static bool initialized = false;  /* set at the end of a successful client_init(); cleared by client_finalize() */
  44 static int agent_fd = -1;  /* IPC socket to the agent; -1 whenever no socket is open */
  45 
  46 
  47 /*
  48  * Startup the agent and share our MCA param values with the it.
  49  */
  50 int opal_btl_usnic_connectivity_client_init(void)
  51 {
  52     /* If connectivity checking is not enabled, do nothing */
  53     if (!mca_btl_usnic_component.connectivity_enabled) {
  54         return OPAL_SUCCESS;
  55     }
  56     assert(!initialized);
  57 
  58     /* Open local IPC socket to the agent */
  59     agent_fd = socket(PF_UNIX, SOCK_STREAM, 0);
  60     if (agent_fd < 0) {
  61         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
  62         ABORT("socket() failed");
  63         /* Will not return */
  64     }
  65 
  66     char *ipc_filename = NULL;
  67     opal_asprintf(&ipc_filename, "%s/%s",
  68              opal_process_info.job_session_dir, CONNECTIVITY_SOCK_NAME);
  69     if (NULL == ipc_filename) {
  70         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
  71         ABORT("Out of memory");
  72         /* Will not return */
  73     }
  74 #if !defined(NDEBUG)
  75     struct sockaddr_un sun;
  76     assert(strlen(ipc_filename) <= sizeof(sun.sun_path));
  77 #endif
  78 
  79     /* Wait for the agent to create its socket.  Timeout after 10
  80        seconds if we don't find the socket. */
  81     struct stat sbuf;
  82     time_t start = time(NULL);
  83     while (1) {
  84         int ret = stat(ipc_filename, &sbuf);
  85         if (0 == ret) {
  86             break;
  87         } else if (ENOENT != errno) {
  88             /* If the error wasn't "file not found", then something
  89                else Bad happened */
  90             OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
  91             ABORT("stat() failed");
  92             /* Will not return */
  93         }
  94 
  95         /* If the named socket wasn't there yet, then give the agent a
  96            little time to establish it */
  97         usleep(1);
  98 
  99         if (time(NULL) - start > 10) {
 100             ABORT("connectivity client timeout waiting for server socket to appear");
 101             /* Will not return */
 102         }
 103     }
 104 
 105     /* Connect */
 106     struct sockaddr_un address;
 107     memset(&address, 0, sizeof(struct sockaddr_un));
 108     address.sun_family = AF_UNIX;
 109     opal_string_copy(address.sun_path, ipc_filename, sizeof(address.sun_path));
 110 
 111     int count = 0;
 112     while (1) {
 113         int ret = connect(agent_fd, (struct sockaddr*) &address,
 114                           sizeof(address));
 115         if (0 == ret) {
 116             break;
 117         }
 118 
 119         // If we get ECONNREFUSED, delay a little and try again
 120         if (ECONNREFUSED == errno) {
 121             if (count < mca_btl_usnic_component.connectivity_num_retries) {
 122                 usleep(100);
 123                 ++count;
 124                 continue;
 125             }
 126         }
 127 
 128         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
 129         ABORT("connect() failed");
 130         /* Will not return */
 131     }
 132 
 133     /* Send the magic token */
 134     int tlen = strlen(CONNECTIVITY_MAGIC_TOKEN);
 135     if (OPAL_SUCCESS != opal_fd_write(agent_fd, tlen,
 136                                       CONNECTIVITY_MAGIC_TOKEN)) {
 137         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
 138         ABORT("usnic connectivity client IPC connect write failed");
 139         /* Will not return */
 140     }
 141 
 142     /* Receive a magic token back */
 143     char *ack = alloca(tlen + 1);
 144     if (NULL == ack) {
 145         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
 146         ABORT("Out of memory");
 147         /* Will not return */
 148     }
 149     if (OPAL_SUCCESS != opal_fd_read(agent_fd, tlen, ack)) {
 150         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
 151         ABORT("usnic connectivity client IPC connect read failed");
 152         /* Will not return */
 153     }
 154     if (memcmp(ack, CONNECTIVITY_MAGIC_TOKEN, tlen) != 0) {
 155         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
 156         ABORT("usnic connectivity client got wrong token back from agent");
 157         /* Will not return */
 158     }
 159 
 160     /* All done */
 161     initialized = true;
 162     opal_output_verbose(20, USNIC_OUT,
 163                         "usNIC connectivity client initialized");
 164     return OPAL_SUCCESS;
 165 }
 166 
 167 
 168 /*
 169  * Send a listen command to the agent
 170  */
 171 int opal_btl_usnic_connectivity_listen(opal_btl_usnic_module_t *module)
 172 {
 173     /* If connectivity checking is not enabled, do nothing */
 174     if (!mca_btl_usnic_component.connectivity_enabled) {
 175         module->local_modex.connectivity_udp_port = 0;
 176         return OPAL_SUCCESS;
 177     }
 178 
 179     /* Send the LISTEN command */
 180     int id = CONNECTIVITY_AGENT_CMD_LISTEN;
 181     if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(id), &id)) {
 182         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
 183         ABORT("usnic connectivity client IPC write failed");
 184         /* Will not return */
 185     }
 186 
 187     /* Send the LISTEN command parameters */
 188     opal_btl_usnic_connectivity_cmd_listen_t cmd = {
 189         .module = NULL,
 190         .ipv4_addr = module->local_modex.ipv4_addr,
 191         .netmask = module->local_modex.netmask,
 192         .max_msg_size = module->local_modex.max_msg_size
 193     };
 194     /* Only the MPI process who is also the agent will send the
 195        pointer value (it doesn't make sense otherwise) */
 196     if (0 == opal_process_info.my_local_rank) {
 197         cmd.module = module;
 198     }
 199 
 200     /* Ensure to NULL-terminate the passed strings */
 201     opal_string_copy(cmd.nodename, opal_process_info.nodename,
 202             CONNECTIVITY_NODENAME_LEN);
 203     opal_string_copy(cmd.usnic_name, module->linux_device_name,
 204             CONNECTIVITY_IFNAME_LEN);
 205 
 206     if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(cmd), &cmd)) {
 207         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
 208         ABORT("usnic connectivity client IPC write failed");
 209         /* Will not return */
 210     }
 211 
 212     /* Wait for the reply with the UDP port */
 213     opal_btl_usnic_connectivity_cmd_listen_reply_t reply;
 214     memset(&reply, 0, sizeof(reply));
 215     if (OPAL_SUCCESS != opal_fd_read(agent_fd, sizeof(reply), &reply)) {
 216         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
 217         ABORT("usnic connectivity client IPC read failed");
 218         /* Will not return */
 219     }
 220 
 221     /* Get the UDP port number that was received */
 222     assert(CONNECTIVITY_AGENT_CMD_LISTEN == reply.cmd);
 223     module->local_modex.connectivity_udp_port = reply.udp_port;
 224 
 225     return OPAL_SUCCESS;
 226 }
 227 
 228 
 229 int opal_btl_usnic_connectivity_ping(uint32_t src_ipv4_addr, int src_port,
 230                                      uint32_t dest_ipv4_addr,
 231                                      uint32_t dest_netmask, int dest_port,
 232                                      char *dest_nodename,
 233                                      size_t max_msg_size)
 234 {
 235     /* If connectivity checking is not enabled, do nothing */
 236     if (!mca_btl_usnic_component.connectivity_enabled) {
 237         return OPAL_SUCCESS;
 238     }
 239 
 240     /* Protect opal_fd_write for multithreaded case */
 241     OPAL_THREAD_LOCK(&btl_usnic_lock);
 242 
 243     /* Send the PING command */
 244     int id = CONNECTIVITY_AGENT_CMD_PING;
 245     if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(id), &id)) {
 246         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
 247         ABORT("usnic connectivity client IPC write failed");
 248         /* Will not return */
 249     }
 250 
 251     /* Send the PING command parameters */
 252     opal_btl_usnic_connectivity_cmd_ping_t cmd = {
 253         .src_ipv4_addr = src_ipv4_addr,
 254         .src_udp_port = src_port,
 255         .dest_ipv4_addr = dest_ipv4_addr,
 256         .dest_netmask = dest_netmask,
 257         .dest_udp_port = dest_port,
 258         .max_msg_size = max_msg_size
 259     };
 260     /* Ensure to NULL-terminate the passed string */
 261     opal_string_copy(cmd.dest_nodename, dest_nodename, CONNECTIVITY_NODENAME_LEN);
 262 
 263     if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(cmd), &cmd)) {
 264         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
 265         ABORT("usnic connectivity client IPC write failed");
 266         /* Will not return */
 267     }
 268 
 269     /* Unlock and return */
 270     OPAL_THREAD_UNLOCK(&btl_usnic_lock);
 271 
 272     return OPAL_SUCCESS;
 273 }
 274 
 275 
 276 /*
 277  * Send an unlisten command to the agent
 278  */
 279 int opal_btl_usnic_connectivity_unlisten(opal_btl_usnic_module_t *module)
 280 {
 281     /* If connectivity checking is not enabled, do nothing */
 282     if (!mca_btl_usnic_component.connectivity_enabled) {
 283         return OPAL_SUCCESS;
 284     }
 285     /* Only the MPI process who is also the agent will send the
 286        UNLISTEN command */
 287     if (0 != opal_process_info.my_local_rank) {
 288         return OPAL_SUCCESS;
 289     }
 290 
 291     /* Send the UNLISTEN command */
 292     int id = CONNECTIVITY_AGENT_CMD_UNLISTEN;
 293     if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(id), &id)) {
 294         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
 295         ABORT("usnic connectivity client IPC write failed");
 296         /* Will not return */
 297     }
 298 
 299     /* Send the UNLISTEN command parameters */
 300     opal_btl_usnic_connectivity_cmd_unlisten_t cmd = {
 301         .ipv4_addr = module->local_modex.ipv4_addr,
 302     };
 303 
 304     if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(cmd), &cmd)) {
 305         OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO);
 306         ABORT("usnic connectivity client IPC write failed");
 307         /* Will not return */
 308     }
 309 
 310     return OPAL_SUCCESS;
 311 }
 312 
 313 
 314 /*
 315  * Shut down the connectivity client
 316  */
 317 int opal_btl_usnic_connectivity_client_finalize(void)
 318 {
 319     /* Make it safe to finalize, even if we weren't initialized */
 320     if (!initialized) {
 321         return OPAL_SUCCESS;
 322     }
 323 
 324     close(agent_fd);
 325     agent_fd = -1;
 326 
 327     initialized = false;
 328     return OPAL_SUCCESS;
 329 }

/* [<][>][^][v][top][bottom][index][help] */