root/opal/mca/btl/usnic/btl_usnic_connectivity.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes following definitions.
  1. opal_btl_usnic_check_connectivity

   1 /*
   2  * Copyright (c) 2014-2019 Cisco Systems, Inc.  All rights reserved
   3  * Copyright (c) 2014      Intel, Inc. All rights reserved
   4  * $COPYRIGHT$
   5  *
   6  * Additional copyrights may follow
   7  *
   8  * $HEADER$
   9  */
  10 
  11 #ifndef OPAL_BTL_USNIC_CONNECTIVITY_H
  12 #define OPAL_BTL_USNIC_CONNECTIVITY_H
  13 
  14 #include "opal_config.h"
  15 
  16 #include "opal/util/show_help.h"
  17 
  18 #include "opal/util/proc.h"
  19 
  20 #include "btl_usnic_compat.h"
  21 #include "btl_usnic_module.h"
  22 #include "btl_usnic_proc.h"
  23 #include "btl_usnic_util.h"
  24 
  25 
  26 /**
  27  * Agent-based service to verify UDP connectivity between two peers.
  28  *
  29  * Generally, it is a client-server pattern with three entities
  30  * involved:
  31  *
  32  * 1. Agent thread: running in MPI process local rank 0
  33  * 2. Client: running in the main application thread in every MPI process
  34  * 3. RTE thread: running in every MPI process
  35  *
  36  * If enabled (via MCA param), the usnic module_init() will setup the
   37  * client (and server on local rank 0).  For each usnic module, each
  38  * client will send a request to the server (via local Unix domain
  39  * socket) asking it to listen on its usnic interface.  The agent will
  40  * discard duplicates and setup a single UDP socket listener on the
  41  * eth interface corresponding to each requested usnic interface.  The
  42  * agent returns the listening UDP port number to the client, and each
  43  * client puts this UDP port number in their modex information.
  44  *
  45  * At the first send to a given MPI process peer, the client will send
  46  * another request to the server asking it to verify connectivity to
  47  * the peer (supplying the peer's UDP listener port number from the
  48  * peer's modex info).  Again, the agent will discard duplicates -- it
  49  * will only verify connectivity to each peer's *server* once.  The
  50  * agent will send a short UDP message and a long UDP message
  51  * (basically, the MTU-68 bytes -- see comment in btl_usnic_cagent.c
  52  * for the reasons why) to the listening peer UDP port.
  53  *
  54  * When the peer agent gets PING messages, it sends short ACK control
  55  * messages back to the sending agent.  When the sending agent gets
  56  * all ACKs back from the peer, it rules that connectivity is GOOD and
  57  * no further action is taken.  If the sending agent doesn't get one
  58  * or both ACKs back in a timely fashion, it re-sends the PING(s) that
  59  * wasn't(weren't) ACKed.  Eventually if the sending agent re-sends
  60  * too many times and does not get an ACK back, it gives up, displays
   61  * an error, and aborts the MPI job.
  62  *
  63  * Note that the client/server interaction is intentionally quite
  64  * primitive:
  65  *
  66  * 1. Client requests agent to listen on interface X.  Server responds
  67  * with UDP port number of listener.
  68  *
  69  * 2. Client requests ping check to peer Y.  Client does not wait for
  70  * the answer; the agent either verifies the connectivity successfully
  71  * or aborts the job.
  72  *
  73  * All client/agent communication is via blocking calls to a local
  74  * Unix domain socket.
  75  *
  76  * As mentioned above, the agent is smart about discarding duplicate
  77  * ping requests from clients.  Since a single agent serves all MPI
  78  * processes on a given server, this cuts down on a lot of PING
  79  * traffic.
  80  */
  81 
  82 /*
  83  * Forward declaration
  84  */
  85 struct opal_btl_usnic_module_t;
  86 
   87 /** @internal
   88  * Readability shorthand: calls opal_btl_usnic_util_abort() with msg plus
   89  * the __FILE__ and __LINE__ of the place where the macro is expanded. */
   90 #define ABORT(msg) opal_btl_usnic_util_abort((msg), __FILE__, __LINE__)
  91 
  92 /**
  93  * Local IPC socket message types.  This value is either sent or
  94  * packed as the first field in each message to identify its type.
  95  * Use a non-zero value as the first enum just as defensive
  96  * programming (i.e., it's a slightly lower chance that an
  97  * uninitialized message type would randomly match these values).
  98  */
   99 enum {
  100     CONNECTIVITY_AGENT_CMD_LISTEN = 17,  /* start listening; fields: ..._cmd_listen_t */
  101     CONNECTIVITY_AGENT_CMD_PING,         /* = 18; ping a peer; fields: ..._cmd_ping_t */
  102     CONNECTIVITY_AGENT_CMD_UNLISTEN,     /* = 19; stop listening; fields: ..._cmd_unlisten_t */
  103     CONNECTIVITY_AGENT_CMD_MAX           /* = 20; one past the last command -- presumably a bounds sentinel, confirm */
  104 };
 105 
  106 #define CONNECTIVITY_NODENAME_LEN 128 /* size of the nodename / dest_nodename buffers in the IPC structs */
  107 #define CONNECTIVITY_IFNAME_LEN 32 /* size of the usnic_name buffer in the LISTEN struct */
 108 
  109 /*
  110  * Unix domain socket name (the local client/agent IPC channel)
  111  */
  112 #define CONNECTIVITY_SOCK_NAME "btl-usnic-cagent-socket"
 113 
  114 /*
  115  * Magic token to ensure that client/server recognize each other
  116  * (presumably exchanged over the IPC socket -- TODO confirm where) */
  117 #define CONNECTIVITY_MAGIC_TOKEN "-*-I am usNIC; hear me roar-*-"
 118 
 119 /*
 120  * Fields for the LISTEN command.  This struct is sent down the IPC
 121  * socket from the cclient to the cagent.
 122  */
  123 typedef struct {
  124     void *module; /* NOTE(review): raw pointer sent over IPC; presumably the requesting module, meaningful only to the sender -- confirm */
  125     uint32_t ipv4_addr; /* IPv4 address on which the agent is asked to listen */
  126     uint32_t netmask; /* netmask accompanying ipv4_addr */
  127     uint32_t max_msg_size; /* presumably the max message size of this interface -- confirm */
  128     char nodename[CONNECTIVITY_NODENAME_LEN]; /* node name string; presumably for diagnostics -- confirm */
  129     char usnic_name[CONNECTIVITY_IFNAME_LEN]; /* usnic interface name string; presumably for diagnostics -- confirm */
  130 } opal_btl_usnic_connectivity_cmd_listen_t;
 131 
 132 /*
 133  * Fields for the UNLISTEN command.  This struct is sent down the IPC
 134  * socket from the cclient to the cagent.
 135  */
  136 typedef struct {
  137     uint32_t ipv4_addr; /* IPv4 address on which the agent should stop listening */
  138 } opal_btl_usnic_connectivity_cmd_unlisten_t;
 139 
 140 /*
 141  * Command+fields for the reply to the LISTEN command.  This struct is
 142  * sent down the IPC socket from the cagent to the cclient.
 143  */
  144 typedef struct {
  145     int32_t cmd; /* message type, a CONNECTIVITY_AGENT_CMD_* value (presumably CMD_LISTEN -- confirm) */
  146     uint32_t ipv4_addr; /* presumably echoes the address from the LISTEN request -- confirm */
  147     uint32_t udp_port; /* UDP port number of the agent's listener */
  148 } opal_btl_usnic_connectivity_cmd_listen_reply_t;
 149 
 150 /*
 151  * Fields for the PING command.  This struct is sent down the IPC
 152  * socket from the cclient to the cagent.
 153  */
  154 typedef struct {
  155     uint32_t src_ipv4_addr; /* source module IPv4 address */
  156     uint32_t src_udp_port; /* source module listening UDP port (the ping prototype passes this as int) */
  157     uint32_t dest_ipv4_addr; /* destination IPv4 address */
  158     uint32_t dest_netmask; /* destination netmask */
  159     uint32_t dest_udp_port; /* destination UDP port (the ping prototype passes this as int) */
  160     uint32_t max_msg_size; /* max ping message size; NOTE(review): the ping prototype takes size_t, so this narrows to 32 bits */
  161     char dest_nodename[CONNECTIVITY_NODENAME_LEN]; /* destination server name */
  162 } opal_btl_usnic_connectivity_cmd_ping_t;
 163 
 164 /**
 165  * Startup the connectivity client.
 166  *
 167  * @returns OPAL_SUCCESS or an OPAL error code.
 168  *
 169  * It is safe to call this function even if the connectivity check is
 170  * disabled; it will be a no-op in this case.
 171  */
 172 int opal_btl_usnic_connectivity_client_init(void);
 173 
 174 /**
  175  * Tell the agent to establish a listening port on the given IP
 176  * address.
 177  *
  178  * @param[in] module The module that is requesting the listen.
 179  *
 180  * @returns OPAL_SUCCESS or an OPAL error code.
 181  *
 182  * The module contains the local interface addressing information,
 183  * which tells the agent on which interface to listen.
 184  *
 185  * This routine will request the new listen from the agent, and wait
 186  * for the agent to reply with the UDP port that is being used/was
 187  * created.  The UDP listening port will then be stuffed in
 188  * module->local_modex.connectivity_udp_port (i.e., data that will be
 189  * sent in the modex).
 190  *
 191  * It is safe to call this function even if the connectivity check is
 192  * disabled; it will be a no-op in this case.
 193  */
 194 int opal_btl_usnic_connectivity_listen(struct opal_btl_usnic_module_t *module);
 195 
 196 /**
 197  * Tell the agent to ping a specific IP address and UDP port number
 198  * with a specific message size.
 199  *
 200  * @param[in] src_ipv4_addr The source module IPv4 address
 201  * @param[in] src_port The source module listening UDP port
 202  * @param[in] dest_ipv4_addr The destination IPv4 address
 203  * @param[in] dest_netmask The destination netmask
 204  * @param[in] dest_port The destination UDP port
 205  * @param[in] dest_nodename The destination server name
 206  * @param[in] max_msg_size The max ping message size to send
 207  *
 208  * @returns OPAL_SUCCESS or an OPAL error code.
 209  *
 210  * Note that several of the above parameters are only passed so that
 211  * they can be used in a complete/helpful error message, if necessary.
 212  *
 213  * This function does not wait for a reply from the agent; it assumes
 214  * the agent will successfully ping the remote peer or will abort the
 215  * MPI job if the pinging fails.
 216  *
 217  * It is safe to call this function even if the connectivity check is
 218  * disabled; it will be a no-op in this case.
 219  */
 220 int opal_btl_usnic_connectivity_ping(uint32_t src_ipv4_addr, int src_port,
 221                                      uint32_t dest_ipv4_addr,
 222                                      uint32_t dest_netmask, int dest_port,
 223                                      char *dest_nodename,
 224                                      size_t max_msg_size);
 225 
 226 /**
 227  * Tell the agent to stop listening on the given IP address.
 228  *
  229  * @param[in] module The module that is requesting the unlisten.
 230  *
 231  * @returns OPAL_SUCCESS or an OPAL error code.
 232  *
 233  * The module contains the local interface addressing information,
 234  * which tells the agent on which interface to stop listening.
 235  *
 236  * It is safe to call this function even if the connectivity check is
 237  * disabled; it will be a no-op in this case.
 238  */
 239 int opal_btl_usnic_connectivity_unlisten(struct opal_btl_usnic_module_t *module);
 240 
 241 /**
 242  * Shut down the connectivity service client.
 243  *
 244  * @returns OPAL_SUCCESS or an OPAL error code.
 245  *
 246  * It is safe to call this function even if the connectivity check is
 247  * disabled; it will be a no-op in this case.
 248  */
 249 int opal_btl_usnic_connectivity_client_finalize(void);
 250 
 251 /**
 252  * Startup the connectivity agent.
 253  *
 254  * @returns OPAL_SUCCESS or an OPAL error code.
 255  *
 256  * This function will be a no-op if this process is not the local rank
 257  * 0.
 258  */
 259 int opal_btl_usnic_connectivity_agent_init(void);
 260 
 261 /**
 262  * Shut down the connectivity agent
 263  *
 264  * @returns OPAL_SUCCESS or an OPAL error code.
 265  *
 266  * This function will be a no-op if this process is not the local rank
 267  * 0.
 268  */
 269 int opal_btl_usnic_connectivity_agent_finalize(void);
 270 
 271 
 272 /**
 273  * Helper function invoked in the BTL that will invoke a ping, if the
 274  * ping hasn't already been invoked.
 275  */
 276 static inline void
 277 opal_btl_usnic_check_connectivity(opal_btl_usnic_module_t *module,
 278                                   opal_btl_usnic_endpoint_t *endpoint)
 279 {
 280     if (OPAL_LIKELY(mca_btl_usnic_component.connectivity_enabled) &&
 281         OPAL_UNLIKELY(!endpoint->endpoint_connectivity_checked)) {
 282         opal_btl_usnic_connectivity_ping(module->local_modex.ipv4_addr,
 283                                          module->local_modex.connectivity_udp_port,
 284                                          endpoint->endpoint_remote_modex.ipv4_addr,
 285                                          endpoint->endpoint_remote_modex.netmask,
 286                                          endpoint->endpoint_remote_modex.connectivity_udp_port,
 287                                          opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
 288                                          endpoint->endpoint_remote_modex.max_msg_size);
 289         endpoint->endpoint_connectivity_checked = true;
 290     }
 291 }
 292 
  293 #endif /* OPAL_BTL_USNIC_CONNECTIVITY_H */

/* [<][>][^][v][top][bottom][index][help] */