This source file includes the following definitions.
- validate_info
- mca_btl_ofi_component_register
- mca_btl_ofi_component_open
- mca_btl_ofi_component_close
- mca_btl_ofi_exit
- mca_btl_ofi_component_init
- mca_btl_ofi_init_device
- mca_btl_ofi_component_progress
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 #include "opal_config.h"
27
28 #include "opal/util/printf.h"
29
30 #include "opal/mca/btl/btl.h"
31 #include "opal/mca/btl/base/base.h"
32 #include "opal/mca/hwloc/base/base.h"
33
34 #include <string.h>
35
36 #include "btl_ofi.h"
37 #include "btl_ofi_endpoint.h"
38 #include "btl_ofi_rdma.h"
39 #include "btl_ofi_frag.h"
40
/* Capability bits a provider must advertise for each mode of operation. */
#define MCA_BTL_OFI_TWO_SIDED_REQUIRED_CAPS (FI_MSG)
#define MCA_BTL_OFI_ONE_SIDED_REQUIRED_CAPS (FI_ATOMIC | FI_RMA)

/* Memory-registration mode bits placed in the fi_getinfo() hints. */
#define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_VIRT_ADDR | FI_MR_PROV_KEY | FI_MR_ALLOCATED)

/* Backing storage for MCA variables registered in mca_btl_ofi_component_register(). */
static bool disable_sep;         /* "disable_sep" */
static char *ofi_progress_mode;  /* "progress_mode" */
static char *prov_include;       /* "provider_include" */

/* Forward declaration: defined after mca_btl_ofi_component_init(), which calls it. */
static int mca_btl_ofi_init_device(struct fi_info *info);
50
51
52
53 static int validate_info(struct fi_info *info, uint64_t required_caps)
54 {
55 int mr_mode;
56
57 BTL_VERBOSE(("validating device: %s", info->domain_attr->name));
58
59
60 if ((info->caps & required_caps) != required_caps) {
61 BTL_VERBOSE(("unsupported caps"));
62 return OPAL_ERROR;
63 }
64
65
66 if (info->ep_attr->type != FI_EP_RDM) {
67 BTL_VERBOSE(("unsupported EP type"));
68 return OPAL_ERROR;
69 }
70
71 mr_mode = info->domain_attr->mr_mode;
72
73 if (!(mr_mode == FI_MR_BASIC || mr_mode == FI_MR_SCALABLE ||
74 (mr_mode & ~(FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY)) == 0)) {
75 BTL_VERBOSE(("unsupported MR mode"));
76 return OPAL_ERROR;
77 }
78
79 if (!(info->tx_attr->op_flags | FI_DELIVERY_COMPLETE)) {
80 BTL_VERBOSE(("the endpoint tx_ctx does not support FI_DELIVERY_COMPLETE"));
81 return OPAL_ERROR;
82 }
83
84 BTL_VERBOSE(("device: %s is good to go.", info->domain_attr->name));
85 return OPAL_SUCCESS;
86 }
87
88
89 static int mca_btl_ofi_component_register(void)
90 {
91 char *msg;
92 mca_btl_ofi_module_t *module = &mca_btl_ofi_module_template;
93
94 opal_asprintf(&msg, "BTL OFI mode of operation. Valid values are: %d = One-Sided only, %d=Two-Sided only, "
95 "%d = Both one and two sided. BTL OFI is only optimized for one-sided communication",
96 MCA_BTL_OFI_MODE_ONE_SIDED,
97 MCA_BTL_OFI_MODE_TWO_SIDED,
98 MCA_BTL_OFI_MODE_FULL_SUPPORT);
99 if (NULL == msg) {
100 return OPAL_ERR_OUT_OF_RESOURCE;
101 }
102
103 mca_btl_ofi_component.mode = MCA_BTL_OFI_MODE_ONE_SIDED;
104 (void)mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
105 "mode",
106 msg,
107 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
108 OPAL_INFO_LVL_5,
109 MCA_BASE_VAR_SCOPE_READONLY,
110 &mca_btl_ofi_component.mode);
111
112
113
114
115 prov_include = NULL;
116 (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
117 "provider_include",
118 "OFI provider that ofi btl will query for. This parameter only "
119 "accept ONE provider name. "
120 "(e.g., \"psm2\"; an empty value means that all providers will "
121 "be considered.",
122 MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
123 OPAL_INFO_LVL_4,
124 MCA_BASE_VAR_SCOPE_READONLY,
125 &prov_include);
126
127 mca_btl_ofi_component.num_cqe_read = MCA_BTL_OFI_NUM_CQE_READ;
128 (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
129 "num_cq_read",
130 "Number of completion entries to read from a single cq_read. ",
131 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
132 OPAL_INFO_LVL_5,
133 MCA_BASE_VAR_SCOPE_READONLY,
134 &mca_btl_ofi_component.num_cqe_read);
135
136 ofi_progress_mode = "unspec";
137 (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
138 "progress_mode",
139 "requested provider progress mode. [unspec, auto, manual]"
140 "(default: unspec)",
141 MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
142 OPAL_INFO_LVL_5,
143 MCA_BASE_VAR_SCOPE_READONLY,
144 &ofi_progress_mode);
145
146 mca_btl_ofi_component.num_contexts_per_module = 1;
147 (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
148 "num_contexts_per_module",
149 "number of communication context per module to create. "
150 "This should increase multithreaded performance but it is "
151 "advised that this number should be lower than total cores.",
152 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
153 OPAL_INFO_LVL_5,
154 MCA_BASE_VAR_SCOPE_READONLY,
155 &mca_btl_ofi_component.num_contexts_per_module);
156
157 disable_sep = false;
158 (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
159 "disable_sep",
160 "force btl/ofi to never use scalable endpoint.",
161 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
162 OPAL_INFO_LVL_5,
163 MCA_BASE_VAR_SCOPE_READONLY,
164 &disable_sep);
165
166 mca_btl_ofi_component.progress_threshold = MCA_BTL_OFI_DEFAULT_PROGRESS_THRESHOLD;
167 (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
168 "progress_threshold",
169 "number of outstanding operation before btl will progress "
170 "automatically. Tuning this might improve performance on "
171 "certain type of application.",
172 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
173 OPAL_INFO_LVL_5,
174 MCA_BASE_VAR_SCOPE_READONLY,
175 &mca_btl_ofi_component.progress_threshold);
176
177 mca_btl_ofi_component.rd_num = MCA_BTL_OFI_DEFAULT_RD_NUM;
178 (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
179 "rd_num",
180 "Number of receive descriptor posted per context.",
181 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
182 OPAL_INFO_LVL_5,
183 MCA_BASE_VAR_SCOPE_READONLY,
184 &mca_btl_ofi_component.rd_num);
185
186
187
188 module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50;
189
190 return mca_btl_base_param_register (&mca_btl_ofi_component.super.btl_version,
191 &module->super);
192 }
193
194 static int mca_btl_ofi_component_open(void)
195 {
196 mca_btl_ofi_component.module_count = 0;
197 return OPAL_SUCCESS;
198 }
199
200
201
202
203 static int mca_btl_ofi_component_close(void)
204 {
205
206 sleep(1);
207 return OPAL_SUCCESS;
208 }
209
/**
 * Report a fatal BTL OFI condition and terminate the process.
 *
 * Logs through BTL_ERROR and then calls exit() with status 1, so this
 * function does not return to its caller.
 */
void mca_btl_ofi_exit(void)
{
    BTL_ERROR(("BTL OFI will now abort."));

    /* terminate with a failure status */
    exit(1);
}
215
216
217
218
219
220
221
222 static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules, bool enable_progress_threads,
223 bool enable_mpi_threads)
224 {
225
226 int rc;
227 uint64_t progress_mode;
228 unsigned resource_count = 0;
229 struct mca_btl_base_module_t **base_modules;
230
231 BTL_VERBOSE(("initializing ofi btl"));
232
233
234 uint32_t libfabric_api;
235 libfabric_api = fi_version();
236
237
238 if (libfabric_api < FI_VERSION(1, 5)) {
239 BTL_VERBOSE(("ofi btl disqualified because OFI version < 1.5."));
240 return NULL;
241 }
242
243 struct fi_info *info, *info_list;
244 struct fi_info hints = {0};
245 struct fi_ep_attr ep_attr = {0};
246 struct fi_rx_attr rx_attr = {0};
247 struct fi_tx_attr tx_attr = {0};
248 struct fi_fabric_attr fabric_attr = {0};
249 struct fi_domain_attr domain_attr = {0};
250 uint64_t required_caps;
251
252 switch (mca_btl_ofi_component.mode) {
253
254 case MCA_BTL_OFI_MODE_TWO_SIDED:
255 mca_btl_ofi_component.two_sided_enabled = true;
256 required_caps = MCA_BTL_OFI_TWO_SIDED_REQUIRED_CAPS;
257 break;
258
259 case MCA_BTL_OFI_MODE_FULL_SUPPORT:
260 mca_btl_ofi_component.two_sided_enabled = true;
261 required_caps = MCA_BTL_OFI_ONE_SIDED_REQUIRED_CAPS |
262 MCA_BTL_OFI_TWO_SIDED_REQUIRED_CAPS;
263 break;
264
265 default:
266
267 required_caps = MCA_BTL_OFI_ONE_SIDED_REQUIRED_CAPS;
268 break;
269 }
270
271
272 fabric_attr.prov_name = prov_include;
273
274 domain_attr.mr_mode = MCA_BTL_OFI_REQUESTED_MR_MODE;
275
276
277 if (!strcmp(ofi_progress_mode, "auto")) {
278 progress_mode = FI_PROGRESS_AUTO;
279 } else if (!strcmp(ofi_progress_mode, "manual")) {
280 progress_mode = FI_PROGRESS_MANUAL;
281 } else {
282 progress_mode = FI_PROGRESS_UNSPEC;
283 }
284
285 domain_attr.control_progress = progress_mode;
286 domain_attr.data_progress = progress_mode;
287
288
289 ep_attr.type = FI_EP_RDM;
290
291
292
293 hints.caps = required_caps;
294 hints.mode = FI_CONTEXT;
295
296
297 hints.mode = FI_CONTEXT;
298
299 hints.fabric_attr = &fabric_attr;
300 hints.domain_attr = &domain_attr;
301 hints.ep_attr = &ep_attr;
302 hints.tx_attr = &tx_attr;
303 hints.rx_attr = &rx_attr;
304
305
306 tx_attr.iov_limit = 1;
307 rx_attr.iov_limit = 1;
308
309 tx_attr.op_flags = FI_DELIVERY_COMPLETE;
310
311 mca_btl_ofi_component.module_count = 0;
312
313
314 rc = fi_getinfo(FI_VERSION(1, 5), NULL, NULL, 0, &hints, &info_list);
315 if (0 != rc) {
316 BTL_VERBOSE(("fi_getinfo failed with code %d: %s",rc, fi_strerror(-rc)));
317 return NULL;
318 }
319
320
321 info = info_list;
322 while(info) {
323 resource_count++;
324 info = info->next;
325 }
326 BTL_VERBOSE(("ofi btl found %d possible resources.", resource_count));
327
328 info = info_list;
329
330 while(info) {
331 rc = validate_info(info, required_caps);
332 if (OPAL_SUCCESS == rc) {
333
334
335 rc = mca_btl_ofi_init_device(info);
336 if (OPAL_SUCCESS == rc)
337 break;
338 }
339 info = info->next;
340 }
341
342
343 fi_freeinfo(info_list);
344
345
346 base_modules = calloc (mca_btl_ofi_component.module_count, sizeof (*base_modules));
347 if (NULL == base_modules) {
348 return NULL;
349 }
350
351 memcpy(base_modules, mca_btl_ofi_component.modules,
352 mca_btl_ofi_component.module_count *sizeof (mca_btl_ofi_component.modules[0]));
353
354 BTL_VERBOSE(("ofi btl initialization complete. found %d suitable transports",
355 mca_btl_ofi_component.module_count));
356
357 *num_btl_modules = mca_btl_ofi_component.module_count;
358
359 return base_modules;
360 }
361
362 static int mca_btl_ofi_init_device(struct fi_info *info)
363 {
364 int rc;
365 int *module_count = &mca_btl_ofi_component.module_count;
366 size_t namelen;
367 size_t num_contexts_to_create;
368
369 char *linux_device_name;
370 char ep_name[FI_NAME_MAX];
371
372 struct fi_info *ofi_info;
373 struct fi_ep_attr *ep_attr;
374 struct fi_domain_attr *domain_attr;
375 struct fi_av_attr av_attr = {0};
376 struct fid_fabric *fabric = NULL;
377 struct fid_domain *domain = NULL;
378 struct fid_ep *ep = NULL;
379 struct fid_av *av = NULL;
380
381 mca_btl_ofi_module_t *module;
382
383 module = mca_btl_ofi_module_alloc(mca_btl_ofi_component.mode);
384 if (NULL == module) {
385 BTL_VERBOSE(("failed allocating ofi module"));
386 goto fail;
387 }
388
389
390
391 module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT;
392
393
394 ofi_info = fi_dupinfo(info);
395 ep_attr = ofi_info->ep_attr;
396 domain_attr = ofi_info->domain_attr;
397
398 linux_device_name = info->domain_attr->name;
399 BTL_VERBOSE(("initializing dev:%s provider:%s",
400 linux_device_name,
401 info->fabric_attr->prov_name));
402
403
404 rc = fi_fabric(ofi_info->fabric_attr, &fabric, NULL);
405 if (0 != rc) {
406 BTL_VERBOSE(("%s failed fi_fabric with err=%s",
407 linux_device_name,
408 fi_strerror(-rc)
409 ));
410 goto fail;
411 }
412
413
414 rc = fi_domain(fabric, ofi_info, &domain, NULL);
415 if (0 != rc) {
416 BTL_VERBOSE(("%s failed fi_domain with err=%s",
417 linux_device_name,
418 fi_strerror(-rc)
419 ));
420 goto fail;
421 }
422
423
424 av_attr.type = FI_AV_MAP;
425 rc = fi_av_open(domain, &av_attr, &av, NULL);
426 if (0 != rc) {
427 BTL_VERBOSE(("%s failed fi_av_open with err=%s",
428 linux_device_name,
429 fi_strerror(-rc)
430 ));
431 goto fail;
432 }
433
434 num_contexts_to_create = mca_btl_ofi_component.num_contexts_per_module;
435
436
437 if (domain_attr->max_ep_tx_ctx > 1 && !disable_sep) {
438
439 BTL_VERBOSE(("btl/ofi using scalable endpoint."));
440
441 if (num_contexts_to_create > domain_attr->max_ep_tx_ctx) {
442 BTL_VERBOSE(("cannot create requested %u contexts. (node max=%zu)",
443 module->num_contexts,
444 domain_attr->max_ep_tx_ctx));
445 goto fail;
446 }
447
448
449 ep_attr->tx_ctx_cnt = num_contexts_to_create;
450 ep_attr->rx_ctx_cnt = num_contexts_to_create;
451
452
453 rc = fi_scalable_ep(domain, ofi_info, &ep, NULL);
454 if (0 != rc) {
455 BTL_VERBOSE(("%s failed fi_scalable_ep with err=%s",
456 linux_device_name,
457 fi_strerror(-rc)
458 ));
459 goto fail;
460 }
461
462 module->num_contexts = num_contexts_to_create;
463 module->is_scalable_ep = true;
464
465
466 module->contexts = mca_btl_ofi_context_alloc_scalable(ofi_info,
467 domain, ep, av,
468 num_contexts_to_create);
469
470 } else {
471
472 if (num_contexts_to_create > 1) {
473 BTL_ERROR(("cannot create %zu contexts as the provider does not support "
474 "scalable endpoint. Falling back to single context endpoint.",
475 num_contexts_to_create));
476 }
477
478 BTL_VERBOSE(("btl/ofi using normal endpoint."));
479
480 rc = fi_endpoint(domain, ofi_info, &ep, NULL);
481 if (0 != rc) {
482 BTL_VERBOSE(("%s failed fi_endpoint with err=%s",
483 linux_device_name,
484 fi_strerror(-rc)
485 ));
486 goto fail;
487 }
488
489 module->num_contexts = 1;
490 module->is_scalable_ep = false;
491
492
493 module->contexts = mca_btl_ofi_context_alloc_normal(ofi_info,
494 domain, ep, av);
495 }
496
497 if (NULL == module->contexts) {
498
499 goto fail;
500 }
501
502
503 rc = fi_enable(ep);
504 if (0 != rc) {
505 BTL_VERBOSE(("%s failed fi_enable with err=%s",
506 linux_device_name,
507 fi_strerror(-rc)
508 ));
509 goto fail;
510 }
511
512
513
514 module->fabric_info = ofi_info;
515 module->fabric = fabric;
516 module->domain = domain;
517 module->av = av;
518 module->ofi_endpoint = ep;
519 module->linux_device_name = linux_device_name;
520 module->outstanding_rdma = 0;
521 module->use_virt_addr = false;
522
523 if (ofi_info->domain_attr->mr_mode == FI_MR_BASIC ||
524 ofi_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) {
525 module->use_virt_addr = true;
526 }
527
528
529 mca_btl_ofi_rcache_init(module);
530
531
532 OBJ_CONSTRUCT(&module->endpoints, opal_list_t);
533 OBJ_CONSTRUCT(&module->module_lock, opal_mutex_t);
534 OBJ_CONSTRUCT(&module->id_to_endpoint, opal_hash_table_t);
535
536 rc = opal_hash_table_init (&module->id_to_endpoint, 512);
537 if (OPAL_SUCCESS != rc) {
538 BTL_ERROR(("error initializing hash table."));
539 goto fail;
540 }
541
542
543 namelen = sizeof(ep_name);
544 rc = fi_getname((fid_t)ep, &ep_name[0], &namelen);
545 if (0 != rc) {
546 BTL_VERBOSE(("%s failed fi_getname with err=%s",
547 linux_device_name,
548 fi_strerror(-rc)
549 ));
550 goto fail;
551 }
552
553
554
555 if (TWO_SIDED_ENABLED) {
556
557
558 for (int i=0; i < module->num_contexts; i++) {
559 rc = mca_btl_ofi_post_recvs((mca_btl_base_module_t*) module,
560 &module->contexts[i],
561 mca_btl_ofi_component.rd_num);
562 if (OPAL_SUCCESS != rc) {
563 goto fail;
564 }
565 }
566 }
567
568
569 OPAL_MODEX_SEND(rc,
570 OPAL_PMIX_GLOBAL,
571 &mca_btl_ofi_component.super.btl_version,
572 &ep_name,
573 namelen);
574 mca_btl_ofi_component.namelen = namelen;
575
576
577 mca_btl_ofi_component.modules[(*module_count)++] = module;
578
579 return OPAL_SUCCESS;
580
581 fail:
582
583
584
585
586 for (int i=0; i < module->num_contexts; i++) {
587 mca_btl_ofi_context_finalize(&module->contexts[i], module->is_scalable_ep);
588 }
589 free(module->contexts);
590
591 if (NULL != av) {
592 fi_close(&av->fid);
593 }
594
595 if (NULL != ep) {
596 fi_close(&ep->fid);
597 }
598
599 if (NULL != domain) {
600 fi_close(&domain->fid);
601 }
602
603 if (NULL != fabric) {
604 fi_close(&fabric->fid);
605 }
606 free(module);
607
608
609 return OPAL_ERR_OUT_OF_RESOURCE;
610 }
611
612
613
614
615
616
617 static int mca_btl_ofi_component_progress (void)
618 {
619 int events = 0;
620 mca_btl_ofi_context_t *context;
621
622 for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) {
623 mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i];
624
625
626 context = get_ofi_context(module);
627
628 if (mca_btl_ofi_context_trylock(context)) {
629 events += mca_btl_ofi_context_progress(context);
630 mca_btl_ofi_context_unlock(context);
631 }
632
633
634 if (events == 0) {
635 for (int j = 0 ; j < module->num_contexts ; j++ ) {
636
637 context = get_ofi_context_rr(module);
638
639 if (mca_btl_ofi_context_trylock(context)) {
640 events += mca_btl_ofi_context_progress(context);
641 mca_btl_ofi_context_unlock(context);
642 }
643
644
645
646 if (events > 0) {
647 break;
648 }
649 }
650 }
651 }
652
653 return events;
654 }
655
656
657 mca_btl_ofi_component_t mca_btl_ofi_component = {
658 .super = {
659 .btl_version = {
660 MCA_BTL_DEFAULT_VERSION("ofi"),
661 .mca_open_component = mca_btl_ofi_component_open,
662 .mca_close_component = mca_btl_ofi_component_close,
663 .mca_register_component_params = mca_btl_ofi_component_register,
664 },
665 .btl_data = {
666
667 .param_field = MCA_BASE_METADATA_PARAM_NONE
668 },
669
670 .btl_init = mca_btl_ofi_component_init,
671 .btl_progress = mca_btl_ofi_component_progress,
672 },
673 };