1 /* 2 * Copyright (c) 2014-2019 Cisco Systems, Inc. All rights reserved 3 * Copyright (c) 2014 Intel, Inc. All rights reserved 4 * $COPYRIGHT$ 5 * 6 * Additional copyrights may follow 7 * 8 * $HEADER$ 9 */ 10 11 #ifndef OPAL_BTL_USNIC_CONNECTIVITY_H 12 #define OPAL_BTL_USNIC_CONNECTIVITY_H 13 14 #include "opal_config.h" 15 16 #include "opal/util/show_help.h" 17 18 #include "opal/util/proc.h" 19 20 #include "btl_usnic_compat.h" 21 #include "btl_usnic_module.h" 22 #include "btl_usnic_proc.h" 23 #include "btl_usnic_util.h" 24 25 26 /** 27 * Agent-based service to verify UDP connectivity between two peers. 28 * 29 * Generally, it is a client-server pattern with three entities 30 * involved: 31 * 32 * 1. Agent thread: running in MPI process local rank 0 33 * 2. Client: running in the main application thread in every MPI process 34 * 3. RTE thread: running in every MPI process 35 * 36 * If enabled (via MCA param), the usnic module_init() will setup the 37 * client (and server on local rank 0). For each usnic module, Each 38 * client will send a request to the server (via local Unix domain 39 * socket) asking it to listen on its usnic interface. The agent will 40 * discard duplicates and setup a single UDP socket listener on the 41 * eth interface corresponding to each requested usnic interface. The 42 * agent returns the listening UDP port number to the client, and each 43 * client puts this UDP port number in their modex information. 44 * 45 * At the first send to a given MPI process peer, the client will send 46 * another request to the server asking it to verify connectivity to 47 * the peer (supplying the peer's UDP listener port number from the 48 * peer's modex info). Again, the agent will discard duplicates -- it 49 * will only verify connectivity to each peer's *server* once. The 50 * agent will send a short UDP message and a long UDP message 51 * (basically, the MTU-68 bytes -- see comment in btl_usnic_cagent.c 52 * for the reasons why) to the listening peer UDP port. 53 * 54 * When the peer agent gets PING messages, it sends short ACK control 55 * messages back to the sending agent. When the sending agent gets 56 * all ACKs back from the peer, it rules that connectivity is GOOD and 57 * no further action is taken. If the sending agent doesn't get one 58 * or both ACKs back in a timely fashion, it re-sends the PING(s) that 59 * wasn't(weren't) ACKed. Eventually if the sending agent re-sends 60 * too many times and does not get an ACK back, it gives up, displays 61 * and error, and aborts the MPI job. 62 * 63 * Note that the client/server interaction is intentionally quite 64 * primitive: 65 * 66 * 1. Client requests agent to listen on interface X. Server responds 67 * with UDP port number of listener. 68 * 69 * 2. Client requests ping check to peer Y. Client does not wait for 70 * the answer; the agent either verifies the connectivity successfully 71 * or aborts the job. 72 * 73 * All client/agent communication is via blocking calls to a local 74 * Unix domain socket. 75 * 76 * As mentioned above, the agent is smart about discarding duplicate 77 * ping requests from clients. Since a single agent serves all MPI 78 * processes on a given server, this cuts down on a lot of PING 79 * traffic. 80 */ 81 82 /* 83 * Forward declaration 84 */ 85 struct opal_btl_usnic_module_t; 86 87 /** @internal 88 * This macro just makes the macros below a little easier to read. 89 */ 90 #define ABORT(msg) opal_btl_usnic_util_abort((msg), __FILE__, __LINE__) 91 92 /** 93 * Local IPC socket message types. This value is either sent or 94 * packed as the first field in each message to identify its type. 95 * Use a non-zero value as the first enum just as defensive 96 * programming (i.e., it's a slightly lower chance that an 97 * uninitialized message type would randomly match these values). 98 */ 99 enum { 100 CONNECTIVITY_AGENT_CMD_LISTEN = 17, 101 CONNECTIVITY_AGENT_CMD_PING, 102 CONNECTIVITY_AGENT_CMD_UNLISTEN, 103 CONNECTIVITY_AGENT_CMD_MAX 104 }; 105 106 #define CONNECTIVITY_NODENAME_LEN 128 107 #define CONNECTIVITY_IFNAME_LEN 32 108 109 /* 110 * Unix domain socket name 111 */ 112 #define CONNECTIVITY_SOCK_NAME "btl-usnic-cagent-socket" 113 114 /* 115 * Magic token to ensure that client/server recognize each other 116 */ 117 #define CONNECTIVITY_MAGIC_TOKEN "-*-I am usNIC; hear me roar-*-" 118 119 /* 120 * Fields for the LISTEN command. This struct is sent down the IPC 121 * socket from the cclient to the cagent. 122 */ 123 typedef struct { 124 void *module; 125 uint32_t ipv4_addr; 126 uint32_t netmask; 127 uint32_t max_msg_size; 128 char nodename[CONNECTIVITY_NODENAME_LEN]; 129 char usnic_name[CONNECTIVITY_IFNAME_LEN]; 130 } opal_btl_usnic_connectivity_cmd_listen_t; 131 132 /* 133 * Fields for the UNLISTEN command. This struct is sent down the IPC 134 * socket from the cclient to the cagent. 135 */ 136 typedef struct { 137 uint32_t ipv4_addr; 138 } opal_btl_usnic_connectivity_cmd_unlisten_t; 139 140 /* 141 * Command+fields for the reply to the LISTEN command. This struct is 142 * sent down the IPC socket from the cagent to the cclient. 143 */ 144 typedef struct { 145 int32_t cmd; 146 uint32_t ipv4_addr; 147 uint32_t udp_port; 148 } opal_btl_usnic_connectivity_cmd_listen_reply_t; 149 150 /* 151 * Fields for the PING command. This struct is sent down the IPC 152 * socket from the cclient to the cagent. 153 */ 154 typedef struct { 155 uint32_t src_ipv4_addr; 156 uint32_t src_udp_port; 157 uint32_t dest_ipv4_addr; 158 uint32_t dest_netmask; 159 uint32_t dest_udp_port; 160 uint32_t max_msg_size; 161 char dest_nodename[CONNECTIVITY_NODENAME_LEN]; 162 } opal_btl_usnic_connectivity_cmd_ping_t; 163 164 /** 165 * Startup the connectivity client. 166 * 167 * @returns OPAL_SUCCESS or an OPAL error code. 168 * 169 * It is safe to call this function even if the connectivity check is 170 * disabled; it will be a no-op in this case. 171 */ 172 int opal_btl_usnic_connectivity_client_init(void); 173 174 /** 175 * Tell the agent to establsh a listening port on the given IP 176 * address. 177 * 178 * @params[in] module The module that is requesting the listen. 179 * 180 * @returns OPAL_SUCCESS or an OPAL error code. 181 * 182 * The module contains the local interface addressing information, 183 * which tells the agent on which interface to listen. 184 * 185 * This routine will request the new listen from the agent, and wait 186 * for the agent to reply with the UDP port that is being used/was 187 * created. The UDP listening port will then be stuffed in 188 * module->local_modex.connectivity_udp_port (i.e., data that will be 189 * sent in the modex). 190 * 191 * It is safe to call this function even if the connectivity check is 192 * disabled; it will be a no-op in this case. 193 */ 194 int opal_btl_usnic_connectivity_listen(struct opal_btl_usnic_module_t *module); 195 196 /** 197 * Tell the agent to ping a specific IP address and UDP port number 198 * with a specific message size. 199 * 200 * @param[in] src_ipv4_addr The source module IPv4 address 201 * @param[in] src_port The source module listening UDP port 202 * @param[in] dest_ipv4_addr The destination IPv4 address 203 * @param[in] dest_netmask The destination netmask 204 * @param[in] dest_port The destination UDP port 205 * @param[in] dest_nodename The destination server name 206 * @param[in] max_msg_size The max ping message size to send 207 * 208 * @returns OPAL_SUCCESS or an OPAL error code. 209 * 210 * Note that several of the above parameters are only passed so that 211 * they can be used in a complete/helpful error message, if necessary. 212 * 213 * This function does not wait for a reply from the agent; it assumes 214 * the agent will successfully ping the remote peer or will abort the 215 * MPI job if the pinging fails. 216 * 217 * It is safe to call this function even if the connectivity check is 218 * disabled; it will be a no-op in this case. 219 */ 220 int opal_btl_usnic_connectivity_ping(uint32_t src_ipv4_addr, int src_port, 221 uint32_t dest_ipv4_addr, 222 uint32_t dest_netmask, int dest_port, 223 char *dest_nodename, 224 size_t max_msg_size); 225 226 /** 227 * Tell the agent to stop listening on the given IP address. 228 * 229 * @params[in] module The module that is requesting the unlisten. 230 * 231 * @returns OPAL_SUCCESS or an OPAL error code. 232 * 233 * The module contains the local interface addressing information, 234 * which tells the agent on which interface to stop listening. 235 * 236 * It is safe to call this function even if the connectivity check is 237 * disabled; it will be a no-op in this case. 238 */ 239 int opal_btl_usnic_connectivity_unlisten(struct opal_btl_usnic_module_t *module); 240 241 /** 242 * Shut down the connectivity service client. 243 * 244 * @returns OPAL_SUCCESS or an OPAL error code. 245 * 246 * It is safe to call this function even if the connectivity check is 247 * disabled; it will be a no-op in this case. 248 */ 249 int opal_btl_usnic_connectivity_client_finalize(void); 250 251 /** 252 * Startup the connectivity agent. 253 * 254 * @returns OPAL_SUCCESS or an OPAL error code. 255 * 256 * This function will be a no-op if this process is not the local rank 257 * 0. 258 */ 259 int opal_btl_usnic_connectivity_agent_init(void); 260 261 /** 262 * Shut down the connectivity agent 263 * 264 * @returns OPAL_SUCCESS or an OPAL error code. 265 * 266 * This function will be a no-op if this process is not the local rank 267 * 0. 268 */ 269 int opal_btl_usnic_connectivity_agent_finalize(void); 270 271 272 /** 273 * Helper function invoked in the BTL that will invoke a ping, if the 274 * ping hasn't already been invoked. 275 */ 276 static inline void 277 opal_btl_usnic_check_connectivity(opal_btl_usnic_module_t *module, 278 opal_btl_usnic_endpoint_t *endpoint) 279 { 280 if (OPAL_LIKELY(mca_btl_usnic_component.connectivity_enabled) && 281 OPAL_UNLIKELY(!endpoint->endpoint_connectivity_checked)) { 282 opal_btl_usnic_connectivity_ping(module->local_modex.ipv4_addr, 283 module->local_modex.connectivity_udp_port, 284 endpoint->endpoint_remote_modex.ipv4_addr, 285 endpoint->endpoint_remote_modex.netmask, 286 endpoint->endpoint_remote_modex.connectivity_udp_port, 287 opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), 288 endpoint->endpoint_remote_modex.max_msg_size); 289 endpoint->endpoint_connectivity_checked = true; 290 } 291 } 292 293 #endif /* OPAL_BTL_USNIC_CONNECITIVITY_H */