This source file includes the following definitions.
- mca_btl_uct_endpoint_construct
- mca_btl_uct_endpoint_destruct
- mca_btl_uct_endpoint_create
- mca_btl_uct_process_modex_tl
- mca_btl_uct_process_modex
- mca_btl_uct_ep_create_connected_compat
- mca_btl_uct_ep_create_compat
- mca_btl_uct_endpoint_connect_iface
- mca_btl_uct_connection_ep_construct
- mca_btl_uct_connection_ep_destruct
- mca_btl_uct_endpoint_flush_complete
- mca_btl_uct_endpoint_send_conn_req
- mca_btl_uct_endpoint_connect_endpoint
- mca_btl_uct_endpoint_connect
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 #include "btl_uct.h"
16 #include "btl_uct_endpoint.h"
17 #include "btl_uct_device_context.h"
18 #include "btl_uct_am.h"
19 #include "opal/util/proc.h"
20
21 static void mca_btl_uct_endpoint_construct (mca_btl_uct_endpoint_t *endpoint)
22 {
23 memset (endpoint->uct_eps, 0, sizeof (endpoint->uct_eps[0]) * mca_btl_uct_component.num_contexts_per_module);
24 endpoint->conn_ep = NULL;
25 OBJ_CONSTRUCT(&endpoint->ep_lock, opal_recursive_mutex_t);
26 }
27
28 static void mca_btl_uct_endpoint_destruct (mca_btl_uct_endpoint_t *endpoint)
29 {
30 for (int tl_index = 0 ; tl_index < 2 ; ++tl_index) {
31 for (int i = 0 ; i < mca_btl_uct_component.num_contexts_per_module ; ++i) {
32 if (NULL != endpoint->uct_eps[i][tl_index].uct_ep) {
33 uct_ep_destroy (endpoint->uct_eps[i][tl_index].uct_ep);
34 }
35 }
36 }
37
38 OBJ_DESTRUCT(&endpoint->ep_lock);
39 }
40
/* Class instance for mca_btl_uct_endpoint_t: derives from opal_object_t and
 * uses the constructor/destructor defined above. */
OBJ_CLASS_INSTANCE(mca_btl_uct_endpoint_t, opal_object_t,
                   mca_btl_uct_endpoint_construct,
                   mca_btl_uct_endpoint_destruct);
44
45 mca_btl_base_endpoint_t *mca_btl_uct_endpoint_create (opal_proc_t *proc)
46 {
47 mca_btl_uct_endpoint_t *endpoint = calloc (1, sizeof (*endpoint) + sizeof (endpoint->uct_eps[0]) *
48 mca_btl_uct_component.num_contexts_per_module);
49
50 if (OPAL_UNLIKELY(NULL == endpoint)) {
51 return NULL;
52 }
53
54 OBJ_CONSTRUCT(endpoint, mca_btl_uct_endpoint_t);
55 endpoint->ep_proc = proc;
56
57 return (mca_btl_base_endpoint_t *) endpoint;
58 }
59
/**
 * Skip the header of one transport layer (TL) entry in the modex data.
 *
 * The entry starts with a 32-bit size followed by the NUL-terminated TL name.
 *
 * @param modex_data  start of the TL entry
 *
 * @return pointer to the first byte after the TL name's terminating NUL
 */
static unsigned char *mca_btl_uct_process_modex_tl (unsigned char *modex_data)
{
    char *tl_name;

    BTL_VERBOSE(("processing modex for tl %s. size: %u", modex_data + 4, *((uint32_t *) modex_data)));

    /* the name follows the 4-byte size field */
    tl_name = (char *) modex_data + 4;

    return (unsigned char *) tl_name + strlen (tl_name) + 1;
}
67
68 static void mca_btl_uct_process_modex (mca_btl_uct_module_t *uct_btl, unsigned char *modex_data,
69 unsigned char **rdma_tl_data, unsigned char **am_tl_data,
70 unsigned char **conn_tl_data)
71 {
72 BTL_VERBOSE(("processing remote modex data"));
73
74 if (uct_btl->rdma_tl) {
75 BTL_VERBOSE(("modex contains RDMA data"));
76 if (rdma_tl_data) {
77 *rdma_tl_data = mca_btl_uct_process_modex_tl (modex_data);
78 }
79 modex_data += *((uint32_t *) modex_data);
80 } else if (rdma_tl_data) {
81 *rdma_tl_data = NULL;
82 }
83
84 if (uct_btl->am_tl && uct_btl->am_tl != uct_btl->rdma_tl) {
85 BTL_VERBOSE(("modex contains active message data"));
86 if (am_tl_data) {
87 *am_tl_data = mca_btl_uct_process_modex_tl (modex_data);
88 }
89 modex_data += *((uint32_t *) modex_data);
90 } else if (am_tl_data) {
91 *am_tl_data = NULL;
92 }
93
94 if (uct_btl->conn_tl && uct_btl->conn_tl != uct_btl->rdma_tl && uct_btl->conn_tl != uct_btl->am_tl) {
95 BTL_VERBOSE(("modex contains connection data"));
96 if (conn_tl_data) {
97 *conn_tl_data = mca_btl_uct_process_modex_tl (modex_data);
98 }
99 modex_data += *((uint32_t *) modex_data);
100 } else if (conn_tl_data) {
101 *conn_tl_data = NULL;
102 }
103 }
104
105 static inline ucs_status_t mca_btl_uct_ep_create_connected_compat (uct_iface_h iface, uct_device_addr_t *device_addr,
106 uct_iface_addr_t *iface_addr, uct_ep_h *uct_ep)
107 {
108 #if UCT_API >= UCT_VERSION(1, 6)
109 uct_ep_params_t ep_params = {.field_mask = UCT_EP_PARAM_FIELD_IFACE | UCT_EP_PARAM_FIELD_DEV_ADDR | UCT_EP_PARAM_FIELD_IFACE_ADDR,
110 .iface = iface, .dev_addr = device_addr, .iface_addr = iface_addr};
111 return uct_ep_create (&ep_params, uct_ep);
112 #else
113 return uct_ep_create_connected (iface, device_addr, iface_addr, uct_ep);
114 #endif
115 }
116
117 static inline ucs_status_t mca_btl_uct_ep_create_compat (uct_iface_h iface, uct_ep_h *uct_ep)
118 {
119 #if UCT_API >= UCT_VERSION(1, 6)
120 uct_ep_params_t ep_params = {.field_mask = UCT_EP_PARAM_FIELD_IFACE, .iface = iface};
121 return uct_ep_create (&ep_params, uct_ep);
122 #else
123 return uct_ep_create (iface, uct_ep);
124 #endif
125 }
126
127 static int mca_btl_uct_endpoint_connect_iface (mca_btl_uct_module_t *uct_btl, mca_btl_uct_tl_t *tl,
128 mca_btl_uct_device_context_t *tl_context,
129 mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data)
130 {
131 uct_device_addr_t *device_addr = NULL;
132 uct_iface_addr_t *iface_addr;
133 ucs_status_t ucs_status;
134
135
136 iface_addr = (uct_iface_addr_t *) tl_data;
137 device_addr = (uct_device_addr_t *) ((uintptr_t) iface_addr + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).iface_addr_len);
138
139 BTL_VERBOSE(("connecting endpoint to interface"));
140
141 mca_btl_uct_context_lock (tl_context);
142 ucs_status = mca_btl_uct_ep_create_connected_compat (tl_context->uct_iface, device_addr, iface_addr, &tl_endpoint->uct_ep);
143 tl_endpoint->flags = MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY;
144 mca_btl_uct_context_unlock (tl_context);
145
146 return (UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERROR;
147 }
148
/**
 * Object constructor for mca_btl_uct_connection_ep_t.
 *
 * @param ep  connection endpoint being constructed
 */
static void mca_btl_uct_connection_ep_construct (mca_btl_uct_connection_ep_t *ep)
{
    /* no UCT endpoint yet; the destructor only destroys a non-NULL uct_ep */
    ep->uct_ep = NULL;
}
153
154 static void mca_btl_uct_connection_ep_destruct (mca_btl_uct_connection_ep_t *ep)
155 {
156 if (ep->uct_ep) {
157 uct_ep_destroy (ep->uct_ep);
158 ep->uct_ep = NULL;
159 }
160 }
161
/* Class instance for mca_btl_uct_connection_ep_t: derives from opal_object_t
 * and uses the constructor/destructor defined above. */
OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_connection_ep_construct,
                   mca_btl_uct_connection_ep_destruct);
164
/**
 * Completion object passed to uct_ep_flush() while sending a connection
 * request. The sender polls the complete flag while progressing the context.
 */
struct mca_btl_uct_conn_completion_t {
    /** UCT completion. Kept as the first member because the completion
     * callback casts the uct_completion_t pointer back to this structure. */
    uct_completion_t super;
    /** set to true by mca_btl_uct_endpoint_flush_complete() */
    volatile bool complete;
};
typedef struct mca_btl_uct_conn_completion_t mca_btl_uct_conn_completion_t;
170
171 static void mca_btl_uct_endpoint_flush_complete (uct_completion_t *self, ucs_status_t status)
172 {
173 mca_btl_uct_conn_completion_t *completion = (mca_btl_uct_conn_completion_t *) self;
174 BTL_VERBOSE(("connection flush complete"));
175 completion->complete = true;
176 }
177
178 static int mca_btl_uct_endpoint_send_conn_req (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint,
179 mca_btl_uct_device_context_t *conn_tl_context,
180 mca_btl_uct_conn_req_t *request, size_t request_length)
181 {
182 mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep;
183 mca_btl_uct_conn_completion_t completion = {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete},
184 .complete = false};
185 ucs_status_t ucs_status;
186
187 BTL_VERBOSE(("sending connection request to peer. context id: %d, type: %d, length: %" PRIsize_t,
188 request->context_id, request->type, request_length));
189
190 OBJ_RETAIN(endpoint->conn_ep);
191
192
193 opal_mutex_unlock (&endpoint->ep_lock);
194
195 do {
196 MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, {
197 ucs_status = uct_ep_am_short (conn_ep->uct_ep, MCA_BTL_UCT_CONNECT_RDMA, request->type, request,
198 request_length);
199 });
200 if (OPAL_LIKELY(UCS_OK == ucs_status)) {
201 break;
202 }
203
204 if (OPAL_UNLIKELY(UCS_ERR_NO_RESOURCE != ucs_status)) {
205 return OPAL_ERROR;
206 }
207
208
209 mca_btl_uct_context_progress (conn_tl_context);
210 } while (1);
211
212
213 ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, &completion.super);
214 if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status) {
215
216 do {
217 ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, NULL);
218 mca_btl_uct_context_progress (conn_tl_context);
219 } while (UCS_INPROGRESS == ucs_status);
220 } else {
221 do {
222 mca_btl_uct_context_progress (conn_tl_context);
223 } while (!completion.complete);
224 }
225
226 opal_mutex_lock (&endpoint->ep_lock);
227
228 OBJ_RELEASE(endpoint->conn_ep);
229
230 return OPAL_SUCCESS;
231 }
232
233 static int mca_btl_uct_endpoint_connect_endpoint (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint,
234 mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context,
235 mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data,
236 uint8_t *conn_tl_data, void *ep_addr)
237 {
238 size_t request_length = sizeof (mca_btl_uct_conn_req_t) + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len;
239 mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep;
240 mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl;
241 mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0];
242 mca_btl_uct_conn_req_t *request = alloca (request_length);
243 uct_device_addr_t *device_addr = NULL;
244 uct_iface_addr_t *iface_addr;
245 ucs_status_t ucs_status;
246 int rc;
247
248 assert (NULL != conn_tl);
249
250 BTL_VERBOSE(("connecting endpoint to remote endpoint"));
251
252 if (NULL == conn_ep) {
253 BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p",
254 opal_process_name_print (endpoint->ep_proc->proc_name)));
255
256 iface_addr = (uct_iface_addr_t *) conn_tl_data;
257 device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data + MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len);
258
259 endpoint->conn_ep = conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t);
260 if (OPAL_UNLIKELY(NULL == conn_ep)) {
261 return OPAL_ERR_OUT_OF_RESOURCE;
262 }
263
264
265 MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, {
266 ucs_status = mca_btl_uct_ep_create_connected_compat (conn_tl_context->uct_iface, device_addr, iface_addr,
267 &conn_ep->uct_ep);
268 });
269 if (UCS_OK != ucs_status) {
270 BTL_VERBOSE(("could not create an endpoint for forming connection to remote peer. code = %d",
271 ucs_status));
272 return OPAL_ERROR;
273 }
274 } else {
275 OBJ_RETAIN(conn_ep);
276 }
277
278
279 request->proc_name = OPAL_PROC_MY_NAME;
280 request->context_id = tl_context->context_id;
281 request->tl_index = tl->tl_index;
282 request->type = !!(ep_addr);
283
284 if (NULL == tl_endpoint->uct_ep) {
285 BTL_VERBOSE(("allocating endpoint for peer %s and sending connection data",
286 opal_process_name_print (endpoint->ep_proc->proc_name)));
287
288 MCA_BTL_UCT_CONTEXT_SERIALIZE(tl_context, {
289 ucs_status = mca_btl_uct_ep_create_compat (tl_context->uct_iface, &tl_endpoint->uct_ep);
290 });
291 if (UCS_OK != ucs_status) {
292 OBJ_RELEASE(endpoint->conn_ep);
293 return OPAL_ERROR;
294 }
295 }
296
297 if (ep_addr) {
298 BTL_VERBOSE(("using remote endpoint address to connect endpoint for tl %s, index %d. ep_addr = %p",
299 tl->uct_tl_name, tl_context->context_id, ep_addr));
300
301
302 ucs_status = uct_ep_connect_to_ep (tl_endpoint->uct_ep, (uct_device_addr_t *) tl_data, ep_addr);
303 if (UCS_OK != ucs_status) {
304 return OPAL_ERROR;
305 }
306 }
307
308
309 ucs_status = uct_ep_get_address (tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr);
310 if (UCS_OK != ucs_status) {
311
312 OBJ_RELEASE(endpoint->conn_ep);
313 uct_ep_destroy (tl_endpoint->uct_ep);
314 tl_endpoint->uct_ep = NULL;
315 return OPAL_ERROR;
316 }
317
318
319
320 rc = mca_btl_uct_endpoint_send_conn_req (uct_btl, endpoint, conn_tl_context, request, request_length);
321 if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
322 OBJ_RELEASE(endpoint->conn_ep);
323 uct_ep_destroy (tl_endpoint->uct_ep);
324 tl_endpoint->uct_ep = NULL;
325 return OPAL_ERROR;
326 }
327
328 return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS : OPAL_ERR_OUT_OF_RESOURCE;
329 }
330
331 int mca_btl_uct_endpoint_connect (mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint, int context_id,
332 void *ep_addr, int tl_index)
333 {
334 mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[context_id] + tl_index;
335 mca_btl_uct_tl_t *tl = (uct_btl->rdma_tl && tl_index == uct_btl->rdma_tl->tl_index) ?
336 uct_btl->rdma_tl : uct_btl->am_tl;
337 mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_tl_context_specific (uct_btl, tl, context_id);
338 uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data;
339 mca_btl_uct_connection_ep_t *conn_ep = NULL;
340 mca_btl_uct_modex_t *modex;
341 uint8_t *modex_data;
342 size_t msg_size;
343 int rc;
344
345
346 assert (tl_index < 2);
347
348 if (OPAL_UNLIKELY(NULL == tl)) {
349 return OPAL_ERR_UNREACH;
350 }
351
352 BTL_VERBOSE(("checking endpoint %p with context id %d. cached uct ep: %p, ready: %d", (void *) endpoint, context_id,
353 (void *) tl_endpoint->uct_ep, !!(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags)));
354
355 opal_mutex_lock (&endpoint->ep_lock);
356 if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) {
357 opal_mutex_unlock (&endpoint->ep_lock);
358
359 return OPAL_SUCCESS;
360 }
361
362
363 if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) {
364 opal_mutex_unlock (&endpoint->ep_lock);
365 return OPAL_ERR_OUT_OF_RESOURCE;
366 }
367
368 do {
369
370 OPAL_MODEX_RECV(rc, &mca_btl_uct_component.super.btl_version,
371 &endpoint->ep_proc->proc_name, (void **)&modex, &msg_size);
372 if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
373 BTL_ERROR(("error receiving modex"));
374 break;
375 }
376
377 BTL_VERBOSE(("received modex of size %lu for proc %s. module count %d", (unsigned long) msg_size,
378 OPAL_NAME_PRINT(endpoint->ep_proc->proc_name), modex->module_count));
379 modex_data = modex->data;
380
381
382 for (int i = 0 ; i < modex->module_count ; ++i) {
383 uint32_t modex_size = *((uint32_t *) modex_data);
384
385 BTL_VERBOSE(("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name));
386
387 modex_data += 4;
388
389 if (0 != strcmp ((char *) modex_data, uct_btl->md_name)) {
390
391 modex_data += modex_size - 4;
392 continue;
393 }
394
395 modex_data += strlen ((char *) modex_data) + 1;
396
397 mca_btl_uct_process_modex (uct_btl, modex_data, &rdma_tl_data, &am_tl_data, &conn_tl_data);
398 break;
399 }
400
401 tl_data = (tl == uct_btl->rdma_tl) ? rdma_tl_data : am_tl_data;
402
403 if (NULL == tl_data) {
404 opal_mutex_unlock (&endpoint->ep_lock);
405 return OPAL_ERR_UNREACH;
406 }
407
408
409 if (!mca_btl_uct_tl_requires_connection_tl (tl)) {
410 rc = mca_btl_uct_endpoint_connect_iface (uct_btl, tl, tl_context, tl_endpoint, tl_data);
411 } else {
412 rc = mca_btl_uct_endpoint_connect_endpoint (uct_btl, endpoint, tl, tl_context, tl_endpoint,
413 tl_data, conn_tl_data, ep_addr);
414 }
415
416 } while (0);
417
418
419 if (endpoint->conn_ep && 1 == endpoint->conn_ep->super.obj_reference_count) {
420 conn_ep = endpoint->conn_ep;
421 endpoint->conn_ep = NULL;
422 }
423
424 opal_mutex_unlock (&endpoint->ep_lock);
425
426 if (conn_ep) {
427 OBJ_RELEASE(conn_ep);
428 }
429
430 BTL_VERBOSE(("endpoint%s ready for use", (OPAL_ERR_OUT_OF_RESOURCE != rc) ? "" : " not yet"));
431
432 return rc;
433 }