This source file includes the following definitions.
- mca_btl_uct_component_register
- mca_btl_uct_mem_release_cb
- mca_btl_uct_component_open
- mca_btl_uct_component_close
- mca_btl_uct_tl_modex_size
- mca_btl_uct_module_modex_size
- mca_btl_uct_tl_modex_pack
- mca_btl_uct_modex_send
- mca_btl_uct_alloc_module
- mca_btl_uct_am_handler
- mca_btl_uct_component_process_uct_md
- mca_btl_uct_component_init
- mca_btl_uct_tl_progress
- mca_btl_uct_component_progress_pending
- mca_btl_uct_component_progress
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 #include "opal_config.h"
30
31 #include "opal/mca/btl/btl.h"
32 #include "opal/mca/btl/base/base.h"
33 #include "opal/mca/hwloc/base/base.h"
34 #include "opal/util/argv.h"
35 #include "opal/memoryhooks/memory.h"
36 #include "opal/mca/memory/base/base.h"
37 #include <ucm/api/ucm.h>
38
39 #include "opal/util/printf.h"
40
41 #include <string.h>
42
43 #include "btl_uct_device_context.h"
44 #include "btl_uct_am.h"
45
46 static int mca_btl_uct_component_register(void)
47 {
48 mca_btl_uct_module_t *module = &mca_btl_uct_module_template;
49
50 mca_btl_uct_component.memory_domains = "none";
51 (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version,
52 "memory_domains", "Comma-delimited list of memory domains of the form "
53 "to use for communication. Memory domains MUST provide transports that "
54 "support put, get, and amos. Special values: all (all available), none."
55 " (default: none)", MCA_BASE_VAR_TYPE_STRING, NULL, 0,
56 MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
57 &mca_btl_uct_component.memory_domains);
58
59 mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,ud,ugni_rdma,ugni_smsg,any";
60 (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version,
61 "transports", "Comma-delimited list of transports to use sorted by increasing "
62 "priority. The list of transports available can be queried using ucx_info. Special"
63 "values: any (any available) (default: dc_mlx5,rc_mlx5,ud,any)",
64 MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
65 MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.allowed_transports);
66
67 mca_btl_uct_component.num_contexts_per_module = 0;
68 (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version,
69 "num_contexts_per_module", "Number of UCT worker contexts "
70 "to create for each BTL module. Larger numbers will improve "
71 "multi-threaded performance but may increase memory usage. "
72 "A good rule of thumb is one context per application thread "
73 "that will be calling into MPI. (default: 0 -- autoselect "
74 "based on the number of cores)", MCA_BASE_VAR_TYPE_INT,
75 NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
76 MCA_BASE_VAR_SCOPE_ALL, &mca_btl_uct_component.num_contexts_per_module);
77
78 mca_btl_uct_component.disable_ucx_memory_hooks = true;
79 (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version,
80 "disable_ucx_memory_hooks", "Disable the munmap memory hook "
81 "inside UCX. These hooks are not necessary when using the "
82 "uct btl and tend to cause performance problems when using "
83 "multiple threads (default: true)", MCA_BASE_VAR_TYPE_BOOL,
84 NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
85 MCA_BASE_VAR_SCOPE_ALL, &mca_btl_uct_component.disable_ucx_memory_hooks);
86
87
88 #if OPAL_C_HAVE__THREAD_LOCAL
89 mca_btl_uct_component.bind_threads_to_contexts = true;
90 (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version,
91 "bind_threads_to_contexts", "Bind threads to device contexts. "
92 "In general this should improve the multi-threaded performance "
93 "when threads are used. (default: true)", MCA_BASE_VAR_TYPE_BOOL,
94 NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
95 MCA_BASE_VAR_SCOPE_ALL, &mca_btl_uct_component.bind_threads_to_contexts);
96 #endif
97
98
99 module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
100
101 return mca_btl_base_param_register (&mca_btl_uct_component.super.btl_version,
102 &module->super);
103 }
104
/**
 * Memory-release callback installed through opal_mem_hooks_register_release().
 *
 * Hands the released address range to ucm_vm_munmap().
 *
 * @param buf         start of the released range
 * @param length      length of the released range in bytes
 * @param cbdata      callback context (not used)
 * @param from_alloc  origin indicator supplied by the hook framework (not used)
 */
static void mca_btl_uct_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc)
{
    /* only the address range matters here */
    (void) cbdata;
    (void) from_alloc;

    ucm_vm_munmap(buf, length);
}
109
110 static int mca_btl_uct_component_open(void)
111 {
112 if (0 == mca_btl_uct_component.num_contexts_per_module) {
113
114
115 int core_count = 36;
116
117 (void) opal_hwloc_base_get_topology ();
118 core_count = hwloc_get_nbobjs_by_type (opal_hwloc_topology, HWLOC_OBJ_CORE);
119
120 if (core_count <= opal_process_info.num_local_peers || !opal_using_threads()) {
121
122
123 mca_btl_uct_component.num_contexts_per_module = 1;
124 } else {
125 mca_btl_uct_component.num_contexts_per_module = core_count / (opal_process_info.num_local_peers + 1);
126 }
127 }
128
129 if (mca_btl_uct_component.num_contexts_per_module > MCA_BTL_UCT_MAX_WORKERS) {
130 mca_btl_uct_component.num_contexts_per_module = MCA_BTL_UCT_MAX_WORKERS;
131 }
132
133 if (mca_btl_uct_component.disable_ucx_memory_hooks &&
134 ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) ==
135 ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) &
136 opal_mem_hooks_support_level()))) {
137 ucm_set_external_event(UCM_EVENT_VM_UNMAPPED);
138 opal_mem_hooks_register_release(mca_btl_uct_mem_release_cb, NULL);
139 }
140
141 return OPAL_SUCCESS;
142 }
143
144
145
146
147
148 static int mca_btl_uct_component_close(void)
149 {
150 if (mca_btl_uct_component.disable_ucx_memory_hooks) {
151 opal_mem_hooks_unregister_release (mca_btl_uct_mem_release_cb);
152 }
153
154 return OPAL_SUCCESS;
155 }
156
157 static size_t mca_btl_uct_tl_modex_size (mca_btl_uct_tl_t *tl)
158 {
159 const size_t size = strlen (tl->uct_tl_name) + 1;
160
161 if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
162
163 return (4 + 3 + size + MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len + MCA_BTL_UCT_TL_ATTR(tl, 0).iface_addr_len) & ~3;
164 }
165
166 return (4 + 3 + size + MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len) & ~3;
167 }
168
169 static size_t mca_btl_uct_module_modex_size (mca_btl_uct_module_t *module)
170 {
171 size_t modex_size = 4 + strlen (module->md_name) + 1;
172
173 if (module->rdma_tl) {
174 modex_size += mca_btl_uct_tl_modex_size (module->rdma_tl);
175 }
176
177 if (module->am_tl && module->am_tl != module->rdma_tl) {
178 modex_size += mca_btl_uct_tl_modex_size (module->am_tl);
179 }
180
181 if (module->conn_tl && module->conn_tl != module->rdma_tl && module->conn_tl != module->am_tl) {
182 modex_size += mca_btl_uct_tl_modex_size (module->conn_tl);
183 }
184
185 return modex_size;
186 }
187
188 static size_t mca_btl_uct_tl_modex_pack (mca_btl_uct_tl_t *tl, uint8_t *modex_data)
189 {
190 mca_btl_uct_device_context_t *dev_context = tl->uct_dev_contexts[0];
191 size_t modex_size = mca_btl_uct_tl_modex_size (tl);
192
193 *((uint32_t *) modex_data) = (uint32_t) modex_size;
194 modex_data += 4;
195
196 strcpy ((char *) modex_data, tl->uct_tl_name);
197 modex_data += strlen (tl->uct_tl_name) + 1;
198
199
200
201
202
203
204
205 if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
206 uct_iface_get_address (dev_context->uct_iface, (uct_iface_addr_t *) modex_data);
207 modex_data += MCA_BTL_UCT_TL_ATTR(tl, 0).iface_addr_len;
208 }
209
210 uct_iface_get_device_address (dev_context->uct_iface, (uct_device_addr_t *) modex_data);
211 modex_data += MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len;
212
213 return modex_size;
214 }
215
216 static int mca_btl_uct_modex_send (void)
217 {
218 size_t modex_size = sizeof (mca_btl_uct_modex_t);
219 mca_btl_uct_modex_t *modex;
220 uint8_t *modex_data;
221 int rc;
222
223 for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) {
224 modex_size += mca_btl_uct_module_modex_size (mca_btl_uct_component.modules[i]);
225 }
226
227 modex = alloca (modex_size);
228 modex_data = modex->data;
229
230 modex->module_count = mca_btl_uct_component.module_count;
231
232 for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) {
233 mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i];
234 size_t name_len = strlen (module->md_name);
235
236
237 *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size (module);
238
239 modex_data += 4;
240
241 strcpy ((char *) modex_data, module->md_name);
242 modex_data += name_len + 1;
243
244 if (module->rdma_tl) {
245 modex_data += mca_btl_uct_tl_modex_pack (module->rdma_tl, modex_data);
246 }
247
248 if (module->am_tl && module->am_tl != module->rdma_tl) {
249 modex_data += mca_btl_uct_tl_modex_pack (module->am_tl, modex_data);
250 }
251
252 if (module->conn_tl && module->conn_tl != module->rdma_tl && module->conn_tl != module->am_tl) {
253 modex_data += mca_btl_uct_tl_modex_pack (module->conn_tl, modex_data);
254 }
255 }
256
257 OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, &mca_btl_uct_component.super.btl_version, modex, modex_size);
258 return rc;
259 }
260
261 static mca_btl_uct_module_t *mca_btl_uct_alloc_module (const char *md_name, mca_btl_uct_md_t *md,
262 size_t registration_size)
263 {
264 mca_btl_uct_module_t *module;
265 ucs_status_t ucs_status;
266
267 module = malloc (sizeof (*module));
268 if (NULL == module) {
269 return NULL;
270 }
271
272
273 *module = mca_btl_uct_module_template;
274
275 OBJ_CONSTRUCT(&module->id_to_endpoint, opal_hash_table_t);
276 OBJ_CONSTRUCT(&module->endpoint_lock, opal_mutex_t);
277 OBJ_CONSTRUCT(&module->short_frags, opal_free_list_t);
278 OBJ_CONSTRUCT(&module->eager_frags, opal_free_list_t);
279 OBJ_CONSTRUCT(&module->max_frags, opal_free_list_t);
280 OBJ_CONSTRUCT(&module->pending_frags, opal_list_t);
281 OBJ_CONSTRUCT(&module->lock, opal_recursive_mutex_t);
282 OBJ_CONSTRUCT(&module->pending_connection_reqs, opal_fifo_t);
283
284 module->md = md;
285 module->md_name = strdup (md_name);
286 module->super.btl_registration_handle_size = registration_size;
287
288 ucs_status = ucs_async_context_create (UCS_ASYNC_MODE_THREAD, &module->ucs_async);
289 if (UCS_OK != ucs_status) {
290 BTL_VERBOSE(("Could not create a UCT async context"));
291 mca_btl_uct_finalize (&module->super);
292 return NULL;
293 }
294
295 return module;
296 }
297
298 ucs_status_t mca_btl_uct_am_handler (void *arg, void *data, size_t length, unsigned flags)
299 {
300 mca_btl_uct_device_context_t *tl_context = (mca_btl_uct_device_context_t *) arg;
301 mca_btl_uct_module_t *uct_btl = tl_context->uct_btl;
302 mca_btl_uct_am_header_t *header = (mca_btl_uct_am_header_t *) data;
303 mca_btl_active_message_callback_t *reg;
304 mca_btl_base_segment_t seg = {.seg_addr = {.pval = (void *) ((intptr_t) data + sizeof (*header))},
305 .seg_len = length - sizeof (*header)};
306 mca_btl_uct_base_frag_t frag = {.base = {.des_segments = &seg, .des_segment_count = 1}};
307
308
309 tl_context->in_am_callback = true;
310
311 reg = mca_btl_base_active_message_trigger + header->data.tag;
312 reg->cbfunc (&uct_btl->super, header->data.tag, &frag.base, reg->cbdata);
313
314 tl_context->in_am_callback = false;
315
316 return UCS_OK;
317 }
318
319 static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc, char **allowed_ifaces)
320 {
321 mca_rcache_base_resources_t rcache_resources;
322 uct_tl_resource_desc_t *tl_desc;
323 mca_btl_uct_module_t *module;
324 uct_md_config_t *uct_config;
325 uct_md_attr_t md_attr;
326 mca_btl_uct_md_t *md;
327 bool found = false;
328 unsigned num_tls;
329 char *tmp;
330
331 if (MCA_BTL_UCT_MAX_MODULES == mca_btl_uct_component.module_count) {
332 BTL_VERBOSE(("created the maximum number of allowable modules"));
333 return OPAL_ERR_NOT_AVAILABLE;
334 }
335
336 BTL_VERBOSE(("processing memory domain %s", md_desc->md_name));
337
338 for (int j = 0 ; allowed_ifaces[j] ; ++j) {
339 if (0 == strncmp (allowed_ifaces[j], md_desc->md_name, strlen (md_desc->md_name)) ||
340 0 == strcmp (allowed_ifaces[j], "all")) {
341 found = true;
342 break;
343 }
344 }
345
346 if (!found) {
347
348 return OPAL_SUCCESS;
349 }
350
351 md = OBJ_NEW(mca_btl_uct_md_t);
352
353 uct_md_config_read (md_desc->md_name, NULL, NULL, &uct_config);
354 uct_md_open (md_desc->md_name, uct_config, &md->uct_md);
355 uct_config_release (uct_config);
356
357 uct_md_query (md->uct_md, &md_attr);
358 uct_md_query_tl_resources (md->uct_md, &tl_desc, &num_tls);
359
360 module = mca_btl_uct_alloc_module (md_desc->md_name, md, md_attr.rkey_packed_size);
361 if (NULL == module) {
362 uct_release_tl_resource_list (tl_desc);
363 return OPAL_ERR_OUT_OF_RESOURCE;
364 }
365
366 (void) mca_btl_uct_query_tls (module, md, tl_desc, num_tls);
367
368 uct_release_tl_resource_list (tl_desc);
369
370
371
372 OBJ_RELEASE(md);
373
374 if (NULL == module->am_tl && NULL == module->rdma_tl) {
375 BTL_VERBOSE(("uct memory domain %s does not have any appropriate tls", md_desc->md_name));
376 mca_btl_uct_finalize (&module->super);
377 return OPAL_ERR_NOT_AVAILABLE;
378 }
379
380 mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module;
381
382
383
384
385 (void) opal_asprintf (&tmp, "uct.%s", module->md_name);
386
387 rcache_resources.cache_name = tmp;
388 rcache_resources.reg_data = (void *) module;
389 rcache_resources.sizeof_reg = sizeof (mca_btl_uct_reg_t) + module->super.btl_registration_handle_size;
390 rcache_resources.register_mem = mca_btl_uct_reg_mem;
391 rcache_resources.deregister_mem = mca_btl_uct_dereg_mem;
392
393 module->rcache = mca_rcache_base_module_create ("grdma", module, &rcache_resources);
394 free (tmp);
395 if (NULL == module->rcache) {
396
397 BTL_VERBOSE(("could not allocate a registration cache for this btl module"));
398 mca_btl_uct_finalize (&module->super);
399 return OPAL_ERROR;
400 }
401
402 return OPAL_SUCCESS;
403 }
404
405
406
407
408
409
410
411
412
413 static mca_btl_base_module_t **mca_btl_uct_component_init (int *num_btl_modules, bool enable_progress_threads,
414 bool enable_mpi_threads)
415 {
416
417 struct mca_btl_base_module_t **base_modules;
418 uct_md_resource_desc_t *resources;
419 unsigned resource_count;
420 char **allowed_ifaces;
421 int rc;
422
423 BTL_VERBOSE(("initializing uct btl"));
424
425 if (NULL == mca_btl_uct_component.memory_domains || 0 == strlen (mca_btl_uct_component.memory_domains) ||
426 0 == strcmp (mca_btl_uct_component.memory_domains, "none")) {
427 BTL_VERBOSE(("no uct memory domains specified"));
428 return NULL;
429 }
430
431 allowed_ifaces = opal_argv_split (mca_btl_uct_component.memory_domains, ',');
432 if (NULL == allowed_ifaces) {
433 return NULL;
434 }
435
436 uct_query_md_resources (&resources, &resource_count);
437
438 mca_btl_uct_component.module_count = 0;
439
440
441 for (unsigned i = 0 ; i < resource_count ; ++i) {
442 rc = mca_btl_uct_component_process_uct_md (resources + i, allowed_ifaces);
443 if (OPAL_SUCCESS != rc) {
444 break;
445 }
446 }
447
448 opal_argv_free (allowed_ifaces);
449 uct_release_md_resource_list (resources);
450
451 mca_btl_uct_modex_send ();
452
453
454 base_modules = calloc (mca_btl_uct_component.module_count, sizeof (*base_modules));
455 if (NULL == base_modules) {
456 return NULL;
457 }
458
459 memcpy (base_modules, mca_btl_uct_component.modules, mca_btl_uct_component.module_count *
460 sizeof (mca_btl_uct_component.modules[0]));
461
462 *num_btl_modules = mca_btl_uct_component.module_count;
463
464 BTL_VERBOSE(("uct btl initialization complete. found %d suitable memory domains",
465 mca_btl_uct_component.module_count));
466
467 return base_modules;
468 }
469
470 static int mca_btl_uct_tl_progress (mca_btl_uct_tl_t *tl, int starting_index)
471 {
472 unsigned int ret = 0;
473
474 if (NULL == tl) {
475 return 0;
476 }
477
478 for (int j = 0 ; j < tl->max_device_contexts ; ++j) {
479 if (tl->uct_dev_contexts[j]) {
480 ret += mca_btl_uct_context_progress (tl->uct_dev_contexts[j]);
481 }
482 }
483
484 return ret;
485 }
486
487 static int mca_btl_uct_component_progress_pending (mca_btl_uct_module_t *uct_btl)
488 {
489 mca_btl_uct_base_frag_t *frag, *next;
490 size_t count;
491
492 if (0 == (count = opal_list_get_size (&uct_btl->pending_frags))) {
493 return 0;
494 }
495
496 OPAL_THREAD_LOCK(&uct_btl->lock);
497 OPAL_LIST_FOREACH_SAFE(frag, next, &uct_btl->pending_frags, mca_btl_uct_base_frag_t) {
498 if (!frag->ready) {
499 continue;
500 }
501
502 opal_list_remove_item (&uct_btl->pending_frags, (opal_list_item_t *) frag);
503
504 if (OPAL_SUCCESS > mca_btl_uct_send_frag (uct_btl, frag, false)) {
505 opal_list_prepend (&uct_btl->pending_frags, (opal_list_item_t *) frag);
506 }
507 }
508 OPAL_THREAD_UNLOCK(&uct_btl->lock);
509
510 return OPAL_SUCCESS;
511 }
512
513
514
515
516
517
518 static int mca_btl_uct_component_progress (void)
519 {
520 int starting_index = mca_btl_uct_get_context_index ();
521 unsigned ret = 0;
522
523 for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) {
524 mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i];
525
526
527
528 ret += mca_btl_uct_tl_progress (module->rdma_tl, starting_index);
529
530 if (module->am_tl != module->rdma_tl) {
531 ret += mca_btl_uct_tl_progress (module->am_tl, starting_index);
532 }
533
534 if (module->conn_tl) {
535 mca_btl_uct_pending_connection_request_t *request;
536
537 if (module->conn_tl != module->am_tl && module->conn_tl != module->rdma_tl) {
538 ret += mca_btl_uct_tl_progress (module->conn_tl, 0);
539 }
540
541 while (NULL != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic (&module->pending_connection_reqs))) {
542 mca_btl_uct_process_connection_request (module, (mca_btl_uct_conn_req_t *) request->request_data);
543 OBJ_RELEASE(request);
544 }
545 }
546
547 if (0 != opal_list_get_size (&module->pending_frags)) {
548 mca_btl_uct_component_progress_pending (module);
549 }
550 }
551
552 return (int) ret;
553 }
554
555
556 mca_btl_uct_component_t mca_btl_uct_component = {
557 .super = {
558 .btl_version = {
559 MCA_BTL_DEFAULT_VERSION("uct"),
560 .mca_open_component = mca_btl_uct_component_open,
561 .mca_close_component = mca_btl_uct_component_close,
562 .mca_register_component_params = mca_btl_uct_component_register,
563 },
564 .btl_data = {
565
566 .param_field = MCA_BASE_METADATA_PARAM_NONE
567 },
568
569 .btl_init = mca_btl_uct_component_init,
570 .btl_progress = mca_btl_uct_component_progress,
571 }
572 };