1 /*
2 * Copyright (c) 2014-2019 Cisco Systems, Inc. All rights reserved
3 * Copyright (c) 2014 Intel, Inc. All rights reserved
4 * $COPYRIGHT$
5 *
6 * Additional copyrights may follow
7 *
8 * $HEADER$
9 */
10
11 #ifndef OPAL_BTL_USNIC_CONNECTIVITY_H
12 #define OPAL_BTL_USNIC_CONNECTIVITY_H
13
14 #include "opal_config.h"
15
16 #include "opal/util/show_help.h"
17
18 #include "opal/util/proc.h"
19
20 #include "btl_usnic_compat.h"
21 #include "btl_usnic_module.h"
22 #include "btl_usnic_proc.h"
23 #include "btl_usnic_util.h"
24
25
26 /**
27 * Agent-based service to verify UDP connectivity between two peers.
28 *
29 * Generally, it is a client-server pattern with three entities
30 * involved:
31 *
32 * 1. Agent thread: running in MPI process local rank 0
33 * 2. Client: running in the main application thread in every MPI process
34 * 3. RTE thread: running in every MPI process
35 *
36 * If enabled (via MCA param), the usnic module_init() will set up the
37 * client (and server on local rank 0). For each usnic module, each
38 * client will send a request to the server (via local Unix domain
39 * socket) asking it to listen on its usnic interface. The agent will
40 * discard duplicates and set up a single UDP socket listener on the
41 * eth interface corresponding to each requested usnic interface. The
42 * agent returns the listening UDP port number to the client, and each
43 * client puts this UDP port number in their modex information.
44 *
45 * At the first send to a given MPI process peer, the client will send
46 * another request to the server asking it to verify connectivity to
47 * the peer (supplying the peer's UDP listener port number from the
48 * peer's modex info). Again, the agent will discard duplicates -- it
49 * will only verify connectivity to each peer's *server* once. The
50 * agent will send a short UDP message and a long UDP message
51 * (basically, the MTU-68 bytes -- see comment in btl_usnic_cagent.c
52 * for the reasons why) to the listening peer UDP port.
53 *
54 * When the peer agent gets PING messages, it sends short ACK control
55 * messages back to the sending agent. When the sending agent gets
56 * all ACKs back from the peer, it rules that connectivity is GOOD and
57 * no further action is taken. If the sending agent doesn't get one
58 * or both ACKs back in a timely fashion, it re-sends whichever
59 * PING(s) were not ACKed. Eventually, if the sending agent re-sends
60 * too many times and does not get an ACK back, it gives up, displays
61 * an error, and aborts the MPI job.
62 *
63 * Note that the client/server interaction is intentionally quite
64 * primitive:
65 *
66 * 1. Client requests agent to listen on interface X. Server responds
67 * with UDP port number of listener.
68 *
69 * 2. Client requests ping check to peer Y. Client does not wait for
70 * the answer; the agent either verifies the connectivity successfully
71 * or aborts the job.
72 *
73 * All client/agent communication is via blocking calls to a local
74 * Unix domain socket.
75 *
76 * As mentioned above, the agent is smart about discarding duplicate
77 * ping requests from clients. Since a single agent serves all MPI
78 * processes on a given server, this cuts down on a lot of PING
79 * traffic.
80 */
81
/*
 * Forward declaration of the usnic module structure.  The function
 * prototypes in this header only take a pointer to it, so an
 * incomplete type is sufficient for them.
 */
struct opal_btl_usnic_module_t;
86
/** @internal
 * Shorthand for calling opal_btl_usnic_util_abort() with the source
 * file name and line number of the place where the macro is expanded.
 *
 * @param[in] msg Passed through (parenthesized) as the first argument
 * of opal_btl_usnic_util_abort().
 *
 * NOTE(review): nothing else in this header expands ABORT; presumably
 * it is used by the connectivity client/agent source files -- confirm.
 */
#define ABORT(msg) opal_btl_usnic_util_abort((msg), __FILE__, __LINE__)
91
/**
 * Message types used on the local IPC socket.  The type is either
 * sent on its own or packed as the first field of a message so that
 * the receiver can tell which command it is looking at.
 *
 * Numbering deliberately starts at a non-zero value as a bit of
 * defensive programming: an uninitialized message type is slightly
 * less likely to collide with one of these values.
 */
enum {
    CONNECTIVITY_AGENT_CMD_LISTEN   = 17,
    CONNECTIVITY_AGENT_CMD_PING     = 18,
    CONNECTIVITY_AGENT_CMD_UNLISTEN = 19,
    /* One greater than the last command value above */
    CONNECTIVITY_AGENT_CMD_MAX
};
105
/*
 * Sizes of the fixed-length char arrays embedded in the IPC command
 * structs: CONNECTIVITY_NODENAME_LEN for the node name fields and
 * CONNECTIVITY_IFNAME_LEN for the usnic interface name field.
 */
#define CONNECTIVITY_NODENAME_LEN 128
#define CONNECTIVITY_IFNAME_LEN 32

/*
 * Unix domain socket name
 *
 * NOTE(review): this is a bare file name, not a full path; presumably
 * the client/agent code prepends a directory -- confirm.
 */
#define CONNECTIVITY_SOCK_NAME "btl-usnic-cagent-socket"

/*
 * Magic token to ensure that client/server recognize each other
 */
#define CONNECTIVITY_MAGIC_TOKEN "-*-I am usNIC; hear me roar-*-"
118
/*
 * Fields for the LISTEN command.  This struct is sent down the IPC
 * socket from the cclient to the cagent.
 *
 * NOTE(review): since the struct itself travels over the socket,
 * presumably both ends must agree on its exact layout -- do not
 * reorder or resize fields without checking the sender and receiver.
 */
typedef struct {
    /* Opaque pointer supplied by the client.
       NOTE(review): presumably identifies the requesting module and is
       only meaningful in the client's address space -- confirm. */
    void *module;
    /* IPv4 address of the interface to listen on (byte order is not
       evident from this header -- confirm) */
    uint32_t ipv4_addr;
    /* Netmask that goes with ipv4_addr */
    uint32_t netmask;
    /* NOTE(review): presumably the maximum message size of the
       interface; confirm how the agent uses it */
    uint32_t max_msg_size;
    /* Node name, stored in a fixed-size buffer */
    char nodename[CONNECTIVITY_NODENAME_LEN];
    /* usnic interface name, stored in a fixed-size buffer */
    char usnic_name[CONNECTIVITY_IFNAME_LEN];
} opal_btl_usnic_connectivity_cmd_listen_t;
131
/*
 * Fields for the UNLISTEN command.  This struct is sent down the IPC
 * socket from the cclient to the cagent.
 */
typedef struct {
    /* IPv4 address on which the agent should stop listening */
    uint32_t ipv4_addr;
} opal_btl_usnic_connectivity_cmd_unlisten_t;
139
/*
 * Command+fields for the reply to the LISTEN command.  This struct is
 * sent down the IPC socket from the cagent to the cclient.
 */
typedef struct {
    /* Message type.
       NOTE(review): presumably CONNECTIVITY_AGENT_CMD_LISTEN, echoing
       the request being answered -- confirm in the agent. */
    int32_t cmd;
    /* IPv4 address the reply refers to */
    uint32_t ipv4_addr;
    /* UDP port number of the agent's listener */
    uint32_t udp_port;
} opal_btl_usnic_connectivity_cmd_listen_reply_t;
149
/*
 * Fields for the PING command.  This struct is sent down the IPC
 * socket from the cclient to the cagent.
 *
 * The fields mirror the arguments of
 * opal_btl_usnic_connectivity_ping().  Note that the ports and the
 * maximum message size are uint32_t here, whereas that function takes
 * them as int and size_t.
 */
typedef struct {
    /* Source module IPv4 address */
    uint32_t src_ipv4_addr;
    /* Source module listening UDP port */
    uint32_t src_udp_port;
    /* Destination IPv4 address */
    uint32_t dest_ipv4_addr;
    /* Destination netmask */
    uint32_t dest_netmask;
    /* Destination UDP port */
    uint32_t dest_udp_port;
    /* Max ping message size to send */
    uint32_t max_msg_size;
    /* Destination server name, stored in a fixed-size buffer */
    char dest_nodename[CONNECTIVITY_NODENAME_LEN];
} opal_btl_usnic_connectivity_cmd_ping_t;
163
/**
 * Start up the connectivity client.
 *
 * @returns OPAL_SUCCESS or an OPAL error code.
 *
 * It is safe to call this function even if the connectivity check is
 * disabled; it will be a no-op in this case.
 *
 * The matching shutdown call is
 * opal_btl_usnic_connectivity_client_finalize().
 */
int opal_btl_usnic_connectivity_client_init(void);
173
/**
 * Tell the agent to establish a listening port on the given IP
 * address.
 *
 * @param[in] module The module that is requesting the listen.
 *
 * @returns OPAL_SUCCESS or an OPAL error code.
 *
 * The module contains the local interface addressing information,
 * which tells the agent on which interface to listen.
 *
 * This routine will request the new listen from the agent, and wait
 * for the agent to reply with the UDP port that is being used/was
 * created.  The UDP listening port will then be stuffed in
 * module->local_modex.connectivity_udp_port (i.e., data that will be
 * sent in the modex).
 *
 * It is safe to call this function even if the connectivity check is
 * disabled; it will be a no-op in this case.
 */
int opal_btl_usnic_connectivity_listen(struct opal_btl_usnic_module_t *module);
195
/**
 * Tell the agent to ping a specific IP address and UDP port number
 * with a specific message size.
 *
 * @param[in] src_ipv4_addr The source module IPv4 address
 * @param[in] src_port The source module listening UDP port
 * @param[in] dest_ipv4_addr The destination IPv4 address
 * @param[in] dest_netmask The destination netmask
 * @param[in] dest_port The destination UDP port
 * @param[in] dest_nodename The destination server name
 * @param[in] max_msg_size The max ping message size to send
 *
 * @returns OPAL_SUCCESS or an OPAL error code.
 *
 * Note that several of the above parameters are only passed so that
 * they can be used in a complete/helpful error message, if necessary.
 *
 * This function does not wait for a reply from the agent; it assumes
 * the agent will successfully ping the remote peer or will abort the
 * MPI job if the pinging fails.
 *
 * It is safe to call this function even if the connectivity check is
 * disabled; it will be a no-op in this case.
 *
 * NOTE(review): dest_nodename is documented as an input but is not
 * const-qualified; presumably the implementation only reads it --
 * confirm before tightening the signature.
 */
int opal_btl_usnic_connectivity_ping(uint32_t src_ipv4_addr, int src_port,
                                     uint32_t dest_ipv4_addr,
                                     uint32_t dest_netmask, int dest_port,
                                     char *dest_nodename,
                                     size_t max_msg_size);
225
/**
 * Tell the agent to stop listening on the given IP address.
 *
 * @param[in] module The module that is requesting the unlisten.
 *
 * @returns OPAL_SUCCESS or an OPAL error code.
 *
 * The module contains the local interface addressing information,
 * which tells the agent on which interface to stop listening.
 *
 * It is safe to call this function even if the connectivity check is
 * disabled; it will be a no-op in this case.
 */
int opal_btl_usnic_connectivity_unlisten(struct opal_btl_usnic_module_t *module);
240
/**
 * Shut down the connectivity service client.
 *
 * @returns OPAL_SUCCESS or an OPAL error code.
 *
 * It is safe to call this function even if the connectivity check is
 * disabled; it will be a no-op in this case.
 *
 * The matching startup call is
 * opal_btl_usnic_connectivity_client_init().
 */
int opal_btl_usnic_connectivity_client_finalize(void);
250
/**
 * Start up the connectivity agent.
 *
 * @returns OPAL_SUCCESS or an OPAL error code.
 *
 * This function will be a no-op if this process is not the local rank
 * 0.
 *
 * The matching shutdown call is
 * opal_btl_usnic_connectivity_agent_finalize().
 */
int opal_btl_usnic_connectivity_agent_init(void);
260
/**
 * Shut down the connectivity agent.
 *
 * @returns OPAL_SUCCESS or an OPAL error code.
 *
 * This function will be a no-op if this process is not the local rank
 * 0.
 *
 * The matching startup call is
 * opal_btl_usnic_connectivity_agent_init().
 */
int opal_btl_usnic_connectivity_agent_finalize(void);
270
271
272 /**
273 * Helper function invoked in the BTL that will invoke a ping, if the
274 * ping hasn't already been invoked.
275 */
276 static inline void
277 opal_btl_usnic_check_connectivity(opal_btl_usnic_module_t *module,
278 opal_btl_usnic_endpoint_t *endpoint)
279 {
280 if (OPAL_LIKELY(mca_btl_usnic_component.connectivity_enabled) &&
281 OPAL_UNLIKELY(!endpoint->endpoint_connectivity_checked)) {
282 opal_btl_usnic_connectivity_ping(module->local_modex.ipv4_addr,
283 module->local_modex.connectivity_udp_port,
284 endpoint->endpoint_remote_modex.ipv4_addr,
285 endpoint->endpoint_remote_modex.netmask,
286 endpoint->endpoint_remote_modex.connectivity_udp_port,
287 opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
288 endpoint->endpoint_remote_modex.max_msg_size);
289 endpoint->endpoint_connectivity_checked = true;
290 }
291 }
292
293 #endif /* OPAL_BTL_USNIC_CONNECTIVITY_H */