This source file includes the following definitions.
- ompi_mtl_ofi_component_register
- ompi_mtl_ofi_component_open
- ompi_mtl_ofi_component_query
- ompi_mtl_ofi_component_close
- ompi_mtl_ofi_progress_no_inline
- is_in_list
- select_ofi_provider
- ompi_mtl_ofi_check_fi_remote_cq_data
- ompi_mtl_ofi_define_tag_mode
- ompi_mtl_ofi_init_sep
- ompi_mtl_ofi_init_regular_ep
- ompi_mtl_ofi_component_init
- ompi_mtl_ofi_finalize
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 #include "mtl_ofi.h"
17 #include "opal/util/argv.h"
18 #include "opal/util/printf.h"
19
20 static int ompi_mtl_ofi_component_open(void);
21 static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
22 static int ompi_mtl_ofi_component_close(void);
23 static int ompi_mtl_ofi_component_register(void);
24
25 static mca_mtl_base_module_t*
26 ompi_mtl_ofi_component_init(bool enable_progress_threads,
27 bool enable_mpi_threads);
28
/*
 * Backing storage for the MCA variables registered by
 * ompi_mtl_ofi_component_register().  The name of the MCA variable each
 * one backs is given alongside.
 */
static int   param_priority;    /* "priority"         */
static char *prov_include;      /* "provider_include" */
static char *prov_exclude;      /* "provider_exclude" */
static int   control_progress;  /* "control_progress" */
static int   data_progress;     /* "data_progress"    */
static int   av_type;           /* "av"               */
static int   ofi_tag_mode;      /* "tag_mode"         */

#if OPAL_HAVE_THREAD_LOCAL
/* Per-thread state; only defined when thread-local storage is available. */
opal_thread_local int per_thread_ctx;
opal_thread_local struct fi_cq_tagged_entry wc[MTL_OFI_MAX_PROG_EVENT_COUNT];
#endif
41
42
43
44
45
46 enum {
47 MTL_OFI_PROG_AUTO=1,
48 MTL_OFI_PROG_MANUAL,
49 MTL_OFI_PROG_UNSPEC,
50 };
51
52 mca_base_var_enum_value_t control_prog_type[] = {
53 {MTL_OFI_PROG_AUTO, "auto"},
54 {MTL_OFI_PROG_MANUAL, "manual"},
55 {MTL_OFI_PROG_UNSPEC, "unspec"},
56 {0, NULL}
57 };
58
59 mca_base_var_enum_value_t data_prog_type[] = {
60 {MTL_OFI_PROG_AUTO, "auto"},
61 {MTL_OFI_PROG_MANUAL, "manual"},
62 {MTL_OFI_PROG_UNSPEC, "unspec"},
63 {0, NULL}
64 };
65
66 enum {
67 MTL_OFI_AV_MAP=1,
68 MTL_OFI_AV_TABLE,
69 MTL_OFI_AV_UNKNOWN,
70 };
71
72 mca_base_var_enum_value_t av_table_type[] = {
73 {MTL_OFI_AV_MAP, "map"},
74 {MTL_OFI_AV_TABLE, "table"},
75 {0, NULL}
76 };
77
78 enum {
79 MTL_OFI_TAG_AUTO=1,
80 MTL_OFI_TAG_1,
81 MTL_OFI_TAG_2,
82 MTL_OFI_TAG_FULL,
83 };
84
85 mca_base_var_enum_value_t ofi_tag_mode_type[] = {
86 {MTL_OFI_TAG_AUTO, "auto"},
87 {MTL_OFI_TAG_1, "ofi_tag_1"},
88 {MTL_OFI_TAG_2, "ofi_tag_2"},
89 {MTL_OFI_TAG_FULL, "ofi_tag_full"},
90 {0, NULL}
91 };
92
93 mca_mtl_ofi_component_t mca_mtl_ofi_component = {
94 {
95
96
97
98
99 .mtl_version = {
100 MCA_MTL_BASE_VERSION_2_0_0,
101
102 .mca_component_name = "ofi",
103 OFI_COMPAT_MCA_VERSION,
104 .mca_open_component = ompi_mtl_ofi_component_open,
105 .mca_close_component = ompi_mtl_ofi_component_close,
106 .mca_query_component = ompi_mtl_ofi_component_query,
107 .mca_register_component_params = ompi_mtl_ofi_component_register,
108 },
109 .mtl_data = {
110
111 MCA_BASE_METADATA_PARAM_NONE
112 },
113
114 .mtl_init = ompi_mtl_ofi_component_init,
115 }
116 };
117
118 static int
119 ompi_mtl_ofi_component_register(void)
120 {
121 int ret;
122 mca_base_var_enum_t *new_enum = NULL;
123 char *desc;
124
125 param_priority = 25;
126 mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
127 "priority", "Priority of the OFI MTL component",
128 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
129 OPAL_INFO_LVL_9,
130 MCA_BASE_VAR_SCOPE_READONLY,
131 ¶m_priority);
132
133 prov_include = NULL;
134 mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
135 "provider_include",
136 "Comma-delimited list of OFI providers that are considered for use (e.g., \"psm,psm2\"; an empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_exclude.",
137 MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
138 OPAL_INFO_LVL_1,
139 MCA_BASE_VAR_SCOPE_READONLY,
140 &prov_include);
141
142 prov_exclude = "shm,sockets,tcp,udp,rstream";
143 mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
144 "provider_exclude",
145 "Comma-delimited list of OFI providers that are not considered for use (default: \"sockets,mxm\"; empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_include.",
146 MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
147 OPAL_INFO_LVL_1,
148 MCA_BASE_VAR_SCOPE_READONLY,
149 &prov_exclude);
150
151 ompi_mtl_ofi.ofi_progress_event_count = MTL_OFI_MAX_PROG_EVENT_COUNT;
152 opal_asprintf(&desc, "Max number of events to read each call to OFI progress (default: %d events will be read per OFI progress call)", ompi_mtl_ofi.ofi_progress_event_count);
153 mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
154 "progress_event_cnt",
155 desc,
156 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
157 OPAL_INFO_LVL_6,
158 MCA_BASE_VAR_SCOPE_READONLY,
159 &ompi_mtl_ofi.ofi_progress_event_count);
160
161 free(desc);
162
163 ret = mca_base_var_enum_create ("ofi_tag_mode_type", ofi_tag_mode_type , &new_enum);
164 if (OPAL_SUCCESS != ret) {
165 return ret;
166 }
167
168 ofi_tag_mode = MTL_OFI_TAG_AUTO;
169 opal_asprintf(&desc, "Mode specifying how many bits to use for various MPI values in OFI/Libfabric"
170 " communications. Some Libfabric provider network types can support most of Open MPI"
171 " needs; others can only supply a limited number of bits, which then must be split"
172 " across the MPI communicator ID, MPI source rank, and MPI tag. Three different"
173 " splitting schemes are available: ofi_tag_full (%d bits for the communicator, %d bits"
174 " for the source rank, and %d bits for the tag), ofi_tag_1 (%d bits for the communicator"
175 ", %d bits source rank, %d bits tag), ofi_tag_2 (%d bits for the communicator"
176 ", %d bits source rank, %d bits tag). By default, this MCA variable is set to \"auto\","
177 " which will first try to use ofi_tag_full, and if that fails, fall back to ofi_tag_1.",
178 MTL_OFI_CID_BIT_COUNT_DATA, 32, MTL_OFI_TAG_BIT_COUNT_DATA,
179 MTL_OFI_CID_BIT_COUNT_1, MTL_OFI_SOURCE_BIT_COUNT_1, MTL_OFI_TAG_BIT_COUNT_1,
180 MTL_OFI_CID_BIT_COUNT_2, MTL_OFI_SOURCE_BIT_COUNT_2, MTL_OFI_TAG_BIT_COUNT_2);
181
182 mca_base_component_var_register (&mca_mtl_ofi_component.super.mtl_version,
183 "tag_mode",
184 desc,
185 MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
186 OPAL_INFO_LVL_6,
187 MCA_BASE_VAR_SCOPE_READONLY,
188 &ofi_tag_mode);
189
190 free(desc);
191 OBJ_RELEASE(new_enum);
192
193 ret = mca_base_var_enum_create ("control_prog_type", control_prog_type, &new_enum);
194 if (OPAL_SUCCESS != ret) {
195 return ret;
196 }
197
198 control_progress = MTL_OFI_PROG_UNSPEC;
199 mca_base_component_var_register (&mca_mtl_ofi_component.super.mtl_version,
200 "control_progress",
201 "Specify control progress model (default: unspecificed, use provider's default). Set to auto or manual for auto or manual progress respectively.",
202 MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
203 OPAL_INFO_LVL_3,
204 MCA_BASE_VAR_SCOPE_READONLY,
205 &control_progress);
206 OBJ_RELEASE(new_enum);
207
208 ret = mca_base_var_enum_create ("data_prog_type", data_prog_type, &new_enum);
209 if (OPAL_SUCCESS != ret) {
210 return ret;
211 }
212
213 data_progress = MTL_OFI_PROG_UNSPEC;
214 mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
215 "data_progress",
216 "Specify data progress model (default: unspecified, use provider's default). Set to auto or manual for auto or manual progress respectively.",
217 MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
218 OPAL_INFO_LVL_3,
219 MCA_BASE_VAR_SCOPE_READONLY,
220 &data_progress);
221 OBJ_RELEASE(new_enum);
222
223 ret = mca_base_var_enum_create ("av_type", av_table_type, &new_enum);
224 if (OPAL_SUCCESS != ret) {
225 return ret;
226 }
227
228 av_type = MTL_OFI_AV_MAP;
229 mca_base_component_var_register (&mca_mtl_ofi_component.super.mtl_version,
230 "av",
231 "Specify AV type to use (default: map). Set to table for FI_AV_TABLE AV type.",
232 MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
233 OPAL_INFO_LVL_3,
234 MCA_BASE_VAR_SCOPE_READONLY,
235 &av_type);
236 OBJ_RELEASE(new_enum);
237
238 ompi_mtl_ofi.enable_sep = 0;
239 mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
240 "enable_sep",
241 "Enable SEP feature",
242 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
243 OPAL_INFO_LVL_3,
244 MCA_BASE_VAR_SCOPE_READONLY,
245 &ompi_mtl_ofi.enable_sep);
246
247 ompi_mtl_ofi.thread_grouping = 0;
248 mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
249 "thread_grouping",
250 "Enable/Disable Thread Grouping feature",
251 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
252 OPAL_INFO_LVL_3,
253 MCA_BASE_VAR_SCOPE_READONLY,
254 &ompi_mtl_ofi.thread_grouping);
255
256
257
258
259
260
261 ompi_mtl_ofi.num_ofi_contexts = 1;
262 mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
263 "num_ctxts",
264 "Specify number of OFI contexts to create",
265 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
266 OPAL_INFO_LVL_4,
267 MCA_BASE_VAR_SCOPE_READONLY,
268 &ompi_mtl_ofi.num_ofi_contexts);
269
270 return OMPI_SUCCESS;
271 }
272
273
274
275 static int
276 ompi_mtl_ofi_component_open(void)
277 {
278 ompi_mtl_ofi.base.mtl_request_size =
279 sizeof(ompi_mtl_ofi_request_t) - sizeof(struct mca_mtl_request_t);
280
281 ompi_mtl_ofi.domain = NULL;
282 ompi_mtl_ofi.av = NULL;
283 ompi_mtl_ofi.sep = NULL;
284
285
286
287
288
289 if (OMPI_SUCCESS !=
290 mca_base_var_check_exclusive("ompi",
291 mca_mtl_ofi_component.super.mtl_version.mca_type_name,
292 mca_mtl_ofi_component.super.mtl_version.mca_component_name,
293 "provider_include",
294 mca_mtl_ofi_component.super.mtl_version.mca_type_name,
295 mca_mtl_ofi_component.super.mtl_version.mca_component_name,
296 "provider_exclude")) {
297 return OMPI_ERR_NOT_AVAILABLE;
298 }
299
300 return OMPI_SUCCESS;
301 }
302
303 static int
304 ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority)
305 {
306 *priority = param_priority;
307 *module = (mca_base_module_t *)&ompi_mtl_ofi.base;
308 return OMPI_SUCCESS;
309 }
310
311 static int
312 ompi_mtl_ofi_component_close(void)
313 {
314 return OMPI_SUCCESS;
315 }
316
/**
 * Out-of-line wrapper that forwards to ompi_mtl_ofi_progress(), giving the
 * progress routine an address that can be handed to
 * opal_progress_unregister() (see ompi_mtl_ofi_finalize()).
 *
 * @return Whatever ompi_mtl_ofi_progress() returns.
 */
int
ompi_mtl_ofi_progress_no_inline(void)
{
    const int rc = ompi_mtl_ofi_progress();

    return rc;
}
322
/**
 * Tell whether @p item matches one of the entries of a NULL-terminated
 * string array.
 *
 * The comparison is a prefix match: an entry matches when @p item starts
 * with it (only strlen(entry) characters are compared), so the entry "psm"
 * also matches the item "psm2".
 *
 * @param list  NULL-terminated array of strings; may be NULL.
 * @param item  String to look up; may be NULL.
 *
 * @return 1 if some entry is a prefix of @p item, 0 otherwise (including
 *         when either argument is NULL).
 */
static int
is_in_list(char **list, char *item)
{
    char **entry;

    if (NULL == list || NULL == item) {
        return 0;
    }

    for (entry = list; NULL != *entry; ++entry) {
        if (0 == strncmp(item, *entry, strlen(*entry))) {
            return 1;
        }
    }

    return 0;
}
342
343 static struct fi_info*
344 select_ofi_provider(struct fi_info *providers)
345 {
346 char **include_list = NULL;
347 char **exclude_list = NULL;
348 struct fi_info *prov = providers;
349
350 opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
351 "%s:%d: mtl:ofi:provider_include = \"%s\"\n",
352 __FILE__, __LINE__, prov_include);
353 opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
354 "%s:%d: mtl:ofi:provider_exclude = \"%s\"\n",
355 __FILE__, __LINE__, prov_exclude);
356
357 if (NULL != prov_include) {
358 include_list = opal_argv_split(prov_include, ',');
359 while ((NULL != prov) &&
360 (!is_in_list(include_list, prov->fabric_attr->prov_name))) {
361 opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
362 "%s:%d: mtl:ofi: \"%s\" not in include list\n",
363 __FILE__, __LINE__,
364 prov->fabric_attr->prov_name);
365 prov = prov->next;
366 }
367 } else if (NULL != prov_exclude) {
368 exclude_list = opal_argv_split(prov_exclude, ',');
369 while ((NULL != prov) &&
370 (is_in_list(exclude_list, prov->fabric_attr->prov_name))) {
371 opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
372 "%s:%d: mtl:ofi: \"%s\" in exclude list\n",
373 __FILE__, __LINE__,
374 prov->fabric_attr->prov_name);
375 prov = prov->next;
376 }
377 }
378
379 opal_argv_free(include_list);
380 opal_argv_free(exclude_list);
381
382 opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
383 "%s:%d: mtl:ofi:prov: %s\n",
384 __FILE__, __LINE__,
385 (prov ? prov->fabric_attr->prov_name : "none"));
386
387 return prov;
388 }
389
390
391
392
393 static int
394 ompi_mtl_ofi_check_fi_remote_cq_data(int fi_version,
395 struct fi_info *hints,
396 struct fi_info *provider,
397 struct fi_info **prov_cq_data)
398 {
399 int ret;
400 char *provider_name;
401 struct fi_info *hints_dup;
402 hints_dup = fi_dupinfo(hints);
403
404 provider_name = strdup(provider->fabric_attr->prov_name);
405 hints_dup->fabric_attr->prov_name = provider_name;
406 hints_dup->caps |= FI_TAGGED | FI_DIRECTED_RECV;
407
408 hints_dup->domain_attr->cq_data_size = sizeof(int);
409 ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints_dup, prov_cq_data);
410
411 if ((0 != ret) && (-FI_ENODATA != ret)) {
412 opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
413 "fi_getinfo",
414 ompi_process_info.nodename, __FILE__, __LINE__,
415 fi_strerror(-ret), -ret);
416 return ret;
417 } else if (-FI_ENODATA == ret) {
418
419 prov_cq_data = NULL;
420 }
421
422 fi_freeinfo(hints_dup);
423 return OMPI_SUCCESS;
424 }
425
426 static void
427 ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode, int *bits_for_cid) {
428 switch (ofi_tag_mode) {
429 case MTL_OFI_TAG_1:
430 *bits_for_cid = (int) MTL_OFI_CID_BIT_COUNT_1;
431 ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_1 - 1)) - 1);
432
433 ompi_mtl_ofi.source_rank_tag_mask = MTL_OFI_SOURCE_TAG_MASK_1;
434 ompi_mtl_ofi.num_bits_source_rank = MTL_OFI_SOURCE_BIT_COUNT_1;
435 ompi_mtl_ofi.source_rank_mask = MTL_OFI_SOURCE_MASK_1;
436
437 ompi_mtl_ofi.mpi_tag_mask = MTL_OFI_TAG_MASK_1;
438 ompi_mtl_ofi.num_bits_mpi_tag = MTL_OFI_TAG_BIT_COUNT_1;
439
440 ompi_mtl_ofi.sync_send = MTL_OFI_SYNC_SEND_1;
441 ompi_mtl_ofi.sync_send_ack = MTL_OFI_SYNC_SEND_ACK_1;
442 ompi_mtl_ofi.sync_proto_mask = MTL_OFI_PROTO_MASK_1;
443 break;
444 case MTL_OFI_TAG_2:
445 *bits_for_cid = (int) MTL_OFI_CID_BIT_COUNT_2;
446 ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_2 - 1)) - 1);
447
448 ompi_mtl_ofi.source_rank_tag_mask = MTL_OFI_SOURCE_TAG_MASK_2;
449 ompi_mtl_ofi.num_bits_source_rank = MTL_OFI_SOURCE_BIT_COUNT_2;
450 ompi_mtl_ofi.source_rank_mask = MTL_OFI_SOURCE_MASK_2;
451
452 ompi_mtl_ofi.mpi_tag_mask = MTL_OFI_TAG_MASK_2;
453 ompi_mtl_ofi.num_bits_mpi_tag = MTL_OFI_TAG_BIT_COUNT_2;
454
455 ompi_mtl_ofi.sync_send = MTL_OFI_SYNC_SEND_2;
456 ompi_mtl_ofi.sync_send_ack = MTL_OFI_SYNC_SEND_ACK_2;
457 ompi_mtl_ofi.sync_proto_mask = MTL_OFI_PROTO_MASK_2;
458 break;
459 default:
460 *bits_for_cid = (int) MTL_OFI_CID_BIT_COUNT_DATA;
461 ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_DATA - 1)) - 1);
462
463 ompi_mtl_ofi.mpi_tag_mask = MTL_OFI_TAG_MASK_DATA;
464
465 ompi_mtl_ofi.sync_send = MTL_OFI_SYNC_SEND_DATA;
466 ompi_mtl_ofi.sync_send_ack = MTL_OFI_SYNC_SEND_ACK_DATA;
467 ompi_mtl_ofi.sync_proto_mask = MTL_OFI_PROTO_MASK_DATA;
468 }
469 }
470
/*
 * Allocate the zero-filled ompi_mtl_ofi.comm_to_context array with
 * `arr_size` int entries.
 *
 * NOTE: on allocation failure this macro logs the error and *returns*
 * OMPI_ERR_OUT_OF_RESOURCE from the enclosing function, so it may only be
 * used inside functions returning int.  (It used to return the caller's
 * `ret`, which is 0 -- i.e. success -- at the call sites.)  The expansion
 * deliberately has no trailing semicolon; write `MACRO(n);` at the call
 * site.
 */
#define MTL_OFI_ALLOC_COMM_TO_CONTEXT(arr_size)                                  \
    do {                                                                         \
        ompi_mtl_ofi.comm_to_context = calloc((arr_size), sizeof(int));          \
        if (OPAL_UNLIKELY(!ompi_mtl_ofi.comm_to_context)) {                      \
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,     \
                                "%s:%d: alloc of comm_to_context array failed: %s\n",\
                                __FILE__, __LINE__, strerror(errno));            \
            return OMPI_ERR_OUT_OF_RESOURCE;                                     \
        }                                                                        \
    } while (0)
481
/*
 * Allocate the ompi_mtl_ofi.ofi_ctxt array with
 * ompi_mtl_ofi.num_ofi_contexts entries.
 *
 * The array is zero-filled (calloc) because cleanup code tests
 * ofi_ctxt[0].cq for non-NULL before closing it; with malloc that field
 * would hold garbage until a CQ is actually opened.
 *
 * NOTE: on allocation failure this macro logs the error and *returns*
 * OMPI_ERR_OUT_OF_RESOURCE from the enclosing function, so it may only be
 * used inside functions returning int.  The expansion deliberately has no
 * trailing semicolon; write `MACRO();` at the call site.
 */
#define MTL_OFI_ALLOC_OFI_CTXTS()                                                \
    do {                                                                         \
        ompi_mtl_ofi.ofi_ctxt = (mca_mtl_ofi_context_t *)                        \
            calloc(ompi_mtl_ofi.num_ofi_contexts,                                \
                   sizeof(mca_mtl_ofi_context_t));                               \
        if (OPAL_UNLIKELY(!ompi_mtl_ofi.ofi_ctxt)) {                             \
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,     \
                                "%s:%d: alloc of ofi_ctxt array failed: %s\n",   \
                                __FILE__, __LINE__, strerror(errno));            \
            return OMPI_ERR_OUT_OF_RESOURCE;                                     \
        }                                                                        \
    } while (0)
493
494 static int ompi_mtl_ofi_init_sep(struct fi_info *prov, int universe_size)
495 {
496 int ret = OMPI_SUCCESS, num_ofi_ctxts;
497 struct fi_av_attr av_attr = {0};
498
499 prov->ep_attr->tx_ctx_cnt = prov->ep_attr->rx_ctx_cnt =
500 ompi_mtl_ofi.num_ofi_contexts;
501
502 ret = fi_scalable_ep(ompi_mtl_ofi.domain, prov, &ompi_mtl_ofi.sep, NULL);
503 if (0 != ret) {
504 opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
505 "fi_scalable_ep",
506 ompi_process_info.nodename, __FILE__, __LINE__,
507 fi_strerror(-ret), -ret);
508 return ret;
509 }
510
511 ompi_mtl_ofi.rx_ctx_bits = 0;
512 while (ompi_mtl_ofi.num_ofi_contexts >> ++ompi_mtl_ofi.rx_ctx_bits);
513
514 av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP;
515 av_attr.rx_ctx_bits = ompi_mtl_ofi.rx_ctx_bits;
516 av_attr.count = ompi_mtl_ofi.num_ofi_contexts * universe_size;
517 ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
518
519 if (0 != ret) {
520 MTL_OFI_LOG_FI_ERR(ret, "fi_av_open failed");
521 return ret;
522 }
523
524 ret = fi_scalable_ep_bind(ompi_mtl_ofi.sep, (fid_t)ompi_mtl_ofi.av, 0);
525 if (0 != ret) {
526 MTL_OFI_LOG_FI_ERR(ret, "fi_bind AV-EP failed");
527 return ret;
528 }
529
530
531
532
533
534
535
536 num_ofi_ctxts = ompi_mtl_ofi.thread_grouping ?
537 ompi_mtl_ofi.num_ofi_contexts + 2 : 1;
538 MTL_OFI_ALLOC_COMM_TO_CONTEXT(num_ofi_ctxts);
539
540 ompi_mtl_ofi.total_ctxts_used = 0;
541 ompi_mtl_ofi.threshold_comm_context_id = 0;
542
543
544 MTL_OFI_ALLOC_OFI_CTXTS();
545
546 return ret;
547 }
548
549 static int ompi_mtl_ofi_init_regular_ep(struct fi_info * prov, int universe_size)
550 {
551 int ret = OMPI_SUCCESS;
552 struct fi_av_attr av_attr = {0};
553 struct fi_cq_attr cq_attr = {0};
554 cq_attr.format = FI_CQ_FORMAT_TAGGED;
555 cq_attr.size = ompi_mtl_ofi.ofi_progress_event_count;
556
557
558 ompi_mtl_ofi.num_ofi_contexts = 1;
559 ret = fi_endpoint(ompi_mtl_ofi.domain,
560 prov,
561 &ompi_mtl_ofi.sep,
562 NULL);
563 if (0 != ret) {
564 opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
565 "fi_endpoint",
566 ompi_process_info.nodename, __FILE__, __LINE__,
567 fi_strerror(-ret), -ret);
568 return ret;
569 }
570
571
572
573
574
575
576 av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP;
577 av_attr.count = universe_size;
578 ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
579 if (ret) {
580 MTL_OFI_LOG_FI_ERR(ret, "fi_av_open failed");
581 return ret;
582 }
583
584 ret = fi_ep_bind(ompi_mtl_ofi.sep,
585 (fid_t)ompi_mtl_ofi.av,
586 0);
587 if (0 != ret) {
588 MTL_OFI_LOG_FI_ERR(ret, "fi_bind AV-EP failed");
589 return ret;
590 }
591
592 MTL_OFI_ALLOC_COMM_TO_CONTEXT(1);
593
594
595 MTL_OFI_ALLOC_OFI_CTXTS();
596
597 ompi_mtl_ofi.ofi_ctxt[0].tx_ep = ompi_mtl_ofi.sep;
598 ompi_mtl_ofi.ofi_ctxt[0].rx_ep = ompi_mtl_ofi.sep;
599
600 ret = fi_cq_open(ompi_mtl_ofi.domain, &cq_attr, &ompi_mtl_ofi.ofi_ctxt[0].cq, NULL);
601 if (ret) {
602 MTL_OFI_LOG_FI_ERR(ret, "fi_cq_open failed");
603 return ret;
604 }
605
606
607 ret = fi_ep_bind(ompi_mtl_ofi.sep, (fid_t)ompi_mtl_ofi.ofi_ctxt[0].cq,
608 FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION);
609 if (0 != ret) {
610 MTL_OFI_LOG_FI_ERR(ret, "fi_bind CQ-EP failed");
611 return ret;
612 }
613
614 return ret;
615 }
616
617 static mca_mtl_base_module_t*
618 ompi_mtl_ofi_component_init(bool enable_progress_threads,
619 bool enable_mpi_threads)
620 {
621 int ret, fi_version;
622 int num_local_ranks, sep_support_in_provider, max_ofi_ctxts;
623 int ofi_tag_leading_zeros, ofi_tag_bits_for_cid;
624 struct fi_info *hints;
625 struct fi_info *providers = NULL;
626 struct fi_info *prov = NULL;
627 struct fi_info *prov_cq_data = NULL;
628 char ep_name[FI_NAME_MAX] = {0};
629 size_t namelen;
630 int universe_size;
631 char *univ_size_str;
632
633
634
635
636
637
638
639
640
641
642
643 hints = fi_allocinfo();
644 if (!hints) {
645 opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
646 "%s:%d: Could not allocate fi_info\n",
647 __FILE__, __LINE__);
648 goto error;
649 }
650 hints->mode = FI_CONTEXT;
651 hints->ep_attr->type = FI_EP_RDM;
652 hints->caps = FI_TAGGED;
653 hints->tx_attr->msg_order = FI_ORDER_SAS;
654 hints->rx_attr->msg_order = FI_ORDER_SAS;
655 hints->rx_attr->op_flags = FI_COMPLETION;
656 hints->tx_attr->op_flags = FI_COMPLETION;
657
658 if (enable_mpi_threads) {
659 ompi_mtl_ofi.mpi_thread_multiple = true;
660 hints->domain_attr->threading = FI_THREAD_SAFE;
661 } else {
662 ompi_mtl_ofi.mpi_thread_multiple = false;
663 hints->domain_attr->threading = FI_THREAD_DOMAIN;
664 }
665
666 switch (control_progress) {
667 case MTL_OFI_PROG_AUTO:
668 hints->domain_attr->control_progress = FI_PROGRESS_AUTO;
669 break;
670 case MTL_OFI_PROG_MANUAL:
671 hints->domain_attr->control_progress = FI_PROGRESS_MANUAL;
672 break;
673 default:
674 hints->domain_attr->control_progress = FI_PROGRESS_UNSPEC;
675 }
676
677 switch (data_progress) {
678 case MTL_OFI_PROG_AUTO:
679 hints->domain_attr->data_progress = FI_PROGRESS_AUTO;
680 break;
681 case MTL_OFI_PROG_MANUAL:
682 hints->domain_attr->data_progress = FI_PROGRESS_MANUAL;
683 break;
684 default:
685 hints->domain_attr->data_progress = FI_PROGRESS_UNSPEC;
686 }
687
688 if (MTL_OFI_AV_TABLE == av_type) {
689 hints->domain_attr->av_type = FI_AV_TABLE;
690 } else {
691 hints->domain_attr->av_type = FI_AV_MAP;
692 }
693
694 hints->domain_attr->resource_mgmt = FI_RM_ENABLED;
695
696
697
698
699
700
701 fi_version = FI_VERSION(1, 0);
702
703
704
705
706
707
708 ret = fi_getinfo(fi_version,
709 NULL,
710 NULL,
711 0ULL,
712 hints,
713 &providers);
714 if (FI_ENODATA == -ret) {
715
716 goto error;
717 } else if (0 != ret) {
718 opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
719 "fi_getinfo",
720 ompi_process_info.nodename, __FILE__, __LINE__,
721 fi_strerror(-ret), -ret);
722 goto error;
723 }
724
725
726
727
728 prov = select_ofi_provider(providers);
729 if (!prov) {
730 opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
731 "%s:%d: select_ofi_provider: no provider found\n",
732 __FILE__, __LINE__);
733 goto error;
734 }
735
736
737
738
739 if ((MTL_OFI_TAG_AUTO == ofi_tag_mode) ||
740 (MTL_OFI_TAG_FULL == ofi_tag_mode)) {
741 ret = ompi_mtl_ofi_check_fi_remote_cq_data(fi_version,
742 hints, prov,
743 &prov_cq_data);
744 if (OMPI_SUCCESS != ret) {
745 goto error;
746 } else if (NULL == prov_cq_data) {
747
748 fi_freeinfo(prov_cq_data);
749 ompi_mtl_ofi.fi_cq_data = false;
750 if (MTL_OFI_TAG_AUTO == ofi_tag_mode) {
751
752 ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_1, &ofi_tag_bits_for_cid);
753 } else {
754 opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
755 "%s:%d: OFI provider %s does not support FI_REMOTE_CQ_DATA\n",
756 __FILE__, __LINE__, prov->fabric_attr->prov_name);
757 goto error;
758 }
759 } else {
760
761 ompi_mtl_ofi.fi_cq_data = true;
762 prov = prov_cq_data;
763 ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_FULL, &ofi_tag_bits_for_cid);
764 }
765 } else {
766 ompi_mtl_ofi.fi_cq_data = false;
767 ompi_mtl_ofi_define_tag_mode(ofi_tag_mode, &ofi_tag_bits_for_cid);
768 }
769
770
771
772
773
774
775 ompi_mtl_ofi_send_symtable_init(&ompi_mtl_ofi.sym_table);
776 ompi_mtl_ofi.base.mtl_send =
777 ompi_mtl_ofi.sym_table.ompi_mtl_ofi_send[ompi_mtl_ofi.fi_cq_data];
778
779 ompi_mtl_ofi_isend_symtable_init(&ompi_mtl_ofi.sym_table);
780 ompi_mtl_ofi.base.mtl_isend =
781 ompi_mtl_ofi.sym_table.ompi_mtl_ofi_isend[ompi_mtl_ofi.fi_cq_data];
782
783 ompi_mtl_ofi_irecv_symtable_init(&ompi_mtl_ofi.sym_table);
784 ompi_mtl_ofi.base.mtl_irecv =
785 ompi_mtl_ofi.sym_table.ompi_mtl_ofi_irecv[ompi_mtl_ofi.fi_cq_data];
786
787 ompi_mtl_ofi_iprobe_symtable_init(&ompi_mtl_ofi.sym_table);
788 ompi_mtl_ofi.base.mtl_iprobe =
789 ompi_mtl_ofi.sym_table.ompi_mtl_ofi_iprobe[ompi_mtl_ofi.fi_cq_data];
790
791 ompi_mtl_ofi_improbe_symtable_init(&ompi_mtl_ofi.sym_table);
792 ompi_mtl_ofi.base.mtl_improbe =
793 ompi_mtl_ofi.sym_table.ompi_mtl_ofi_improbe[ompi_mtl_ofi.fi_cq_data];
794
795
796
797
798
799
800 ofi_tag_leading_zeros = 0;
801 while (!((prov->ep_attr->mem_tag_format << ofi_tag_leading_zeros++) &
802 (uint64_t) MTL_OFI_HIGHEST_TAG_BIT) &&
803
804 (ofi_tag_bits_for_cid >= MTL_OFI_MINIMUM_CID_BITS)){
805 ofi_tag_bits_for_cid--;
806 }
807
808 if (ofi_tag_bits_for_cid < MTL_OFI_MINIMUM_CID_BITS) {
809 opal_show_help("help-mtl-ofi.txt", "Not enough bits for CID", true,
810 prov->fabric_attr->prov_name,
811 prov->fabric_attr->prov_name,
812 ompi_process_info.nodename, __FILE__, __LINE__);
813 goto error;
814 }
815
816
817 ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << ofi_tag_bits_for_cid) - 1);
818 ompi_mtl_ofi.num_peers = 0;
819
820
821 sep_support_in_provider = 0;
822 if ((prov->domain_attr->max_ep_tx_ctx > 1) ||
823 (prov->domain_attr->max_ep_rx_ctx > 1)) {
824 sep_support_in_provider = 1;
825 }
826
827 if (1 == ompi_mtl_ofi.enable_sep) {
828 if (0 == sep_support_in_provider) {
829 opal_show_help("help-mtl-ofi.txt", "SEP unavailable", true,
830 prov->fabric_attr->prov_name,
831 ompi_process_info.nodename, __FILE__, __LINE__);
832 goto error;
833 } else if (1 == sep_support_in_provider) {
834 opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
835 "%s:%d: Scalable EP supported in %s provider. Enabling in MTL.\n",
836 __FILE__, __LINE__, prov->fabric_attr->prov_name);
837 }
838 } else {
839
840
841
842 if (1 == ompi_mtl_ofi.thread_grouping) {
843 opal_show_help("help-mtl-ofi.txt", "SEP required", true,
844 ompi_process_info.nodename, __FILE__, __LINE__);
845 goto error;
846 }
847 }
848
849
850
851
852
853
854
855 ret = fi_fabric(prov->fabric_attr,
856 &ompi_mtl_ofi.fabric,
857 NULL);
858 if (0 != ret) {
859 opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
860 "fi_fabric",
861 ompi_process_info.nodename, __FILE__, __LINE__,
862 fi_strerror(-ret), -ret);
863 goto error;
864 }
865
866
867
868
869
870
871 ret = fi_domain(ompi_mtl_ofi.fabric,
872 prov,
873 &ompi_mtl_ofi.domain,
874 NULL);
875 if (0 != ret) {
876 opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
877 "fi_domain",
878 ompi_process_info.nodename, __FILE__, __LINE__,
879 fi_strerror(-ret), -ret);
880 goto error;
881 }
882
883
884
885
886 ompi_mtl_ofi.max_inject_size = prov->tx_attr->inject_size;
887
888
889
890
891
892
893 if (ompi_mtl_ofi.ofi_progress_event_count > MTL_OFI_MAX_PROG_EVENT_COUNT) {
894 ompi_mtl_ofi.ofi_progress_event_count = MTL_OFI_MAX_PROG_EVENT_COUNT;
895 }
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912 univ_size_str = getenv("OMPI_UNIVERSE_SIZE");
913 if (NULL == univ_size_str ||
914 (universe_size = strtol(univ_size_str, NULL, 0)) <= 0) {
915 universe_size = ompi_proc_world_size();
916 }
917
918 if (1 == ompi_mtl_ofi.enable_sep) {
919 max_ofi_ctxts = (prov->domain_attr->max_ep_tx_ctx <
920 prov->domain_attr->max_ep_rx_ctx) ?
921 prov->domain_attr->max_ep_tx_ctx :
922 prov->domain_attr->max_ep_rx_ctx;
923
924 num_local_ranks = 1 + ompi_process_info.num_local_peers;
925 if (max_ofi_ctxts <= num_local_ranks) {
926 opal_show_help("help-mtl-ofi.txt", "Local ranks exceed ofi contexts",
927 true, prov->fabric_attr->prov_name,
928 ompi_process_info.nodename, __FILE__, __LINE__);
929 goto error;
930 }
931
932
933 max_ofi_ctxts /= num_local_ranks;
934
935
936
937
938
939 if (max_ofi_ctxts < ompi_mtl_ofi.num_ofi_contexts) {
940 opal_show_help("help-mtl-ofi.txt", "Ctxts exceeded available",
941 true, max_ofi_ctxts,
942 ompi_process_info.nodename, __FILE__, __LINE__);
943 ompi_mtl_ofi.num_ofi_contexts = max_ofi_ctxts;
944 }
945
946 ret = ompi_mtl_ofi_init_sep(prov, universe_size);
947 } else {
948 ret = ompi_mtl_ofi_init_regular_ep(prov, universe_size);
949 }
950
951 if (OMPI_SUCCESS != ret) {
952 goto error;
953 }
954
955 ompi_mtl_ofi.total_ctxts_used = 0;
956 ompi_mtl_ofi.threshold_comm_context_id = 0;
957
958
959 ret = fi_enable(ompi_mtl_ofi.sep);
960 if (0 != ret) {
961 MTL_OFI_LOG_FI_ERR(ret, "fi_enable failed");
962 goto error;
963 }
964
965 ompi_mtl_ofi.provider_name = strdup(prov->fabric_attr->prov_name);
966
967
968
969
970 fi_freeinfo(hints);
971 hints = NULL;
972 fi_freeinfo(providers);
973 providers = NULL;
974
975
976
977
978 namelen = sizeof(ep_name);
979 ret = fi_getname((fid_t)ompi_mtl_ofi.sep,
980 &ep_name[0],
981 &namelen);
982 if (ret) {
983 MTL_OFI_LOG_FI_ERR(ret, "fi_getname failed");
984 goto error;
985 }
986
987 OFI_COMPAT_MODEX_SEND(ret,
988 &mca_mtl_ofi_component.super.mtl_version,
989 &ep_name,
990 namelen);
991 if (OMPI_SUCCESS != ret) {
992 opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
993 "%s:%d: modex_send failed: %d\n",
994 __FILE__, __LINE__, ret);
995 goto error;
996 }
997
998 ompi_mtl_ofi.epnamelen = namelen;
999
1000
1001
1002
1003 ompi_mtl_ofi.any_addr = FI_ADDR_UNSPEC;
1004
1005 return &ompi_mtl_ofi.base;
1006
1007 error:
1008 if (providers) {
1009 (void) fi_freeinfo(providers);
1010 }
1011 if (prov_cq_data) {
1012 (void) fi_freeinfo(prov_cq_data);
1013 }
1014 if (hints) {
1015 (void) fi_freeinfo(hints);
1016 }
1017 if (ompi_mtl_ofi.sep) {
1018 (void) fi_close((fid_t)ompi_mtl_ofi.sep);
1019 }
1020 if (ompi_mtl_ofi.av) {
1021 (void) fi_close((fid_t)ompi_mtl_ofi.av);
1022 }
1023 if ((0 == ompi_mtl_ofi.enable_sep) &&
1024 ompi_mtl_ofi.ofi_ctxt != NULL &&
1025 ompi_mtl_ofi.ofi_ctxt[0].cq) {
1026
1027 (void) fi_close((fid_t)ompi_mtl_ofi.ofi_ctxt[0].cq);
1028 }
1029 if (ompi_mtl_ofi.domain) {
1030 (void) fi_close((fid_t)ompi_mtl_ofi.domain);
1031 }
1032 if (ompi_mtl_ofi.fabric) {
1033 (void) fi_close((fid_t)ompi_mtl_ofi.fabric);
1034 }
1035 if (ompi_mtl_ofi.comm_to_context) {
1036 free(ompi_mtl_ofi.comm_to_context);
1037 }
1038 if (ompi_mtl_ofi.ofi_ctxt) {
1039 free(ompi_mtl_ofi.ofi_ctxt);
1040 }
1041
1042 return NULL;
1043 }
1044
1045 int
1046 ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
1047 {
1048 ssize_t ret;
1049
1050 opal_progress_unregister(ompi_mtl_ofi_progress_no_inline);
1051
1052
1053 if ((ret = fi_close((fid_t)ompi_mtl_ofi.sep))) {
1054 goto finalize_err;
1055 }
1056
1057 if ((ret = fi_close((fid_t)ompi_mtl_ofi.av))) {
1058 goto finalize_err;
1059 }
1060
1061 if (0 == ompi_mtl_ofi.enable_sep) {
1062
1063
1064
1065
1066
1067
1068 if ((ret = fi_close((fid_t)ompi_mtl_ofi.ofi_ctxt[0].cq))) {
1069 goto finalize_err;
1070 }
1071 }
1072
1073 if ((ret = fi_close((fid_t)ompi_mtl_ofi.domain))) {
1074 goto finalize_err;
1075 }
1076
1077 if ((ret = fi_close((fid_t)ompi_mtl_ofi.fabric))) {
1078 goto finalize_err;
1079 }
1080
1081
1082 free(ompi_mtl_ofi.comm_to_context);
1083 free(ompi_mtl_ofi.ofi_ctxt);
1084
1085 return OMPI_SUCCESS;
1086
1087 finalize_err:
1088 opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
1089 "fi_close",
1090 ompi_process_info.nodename, __FILE__, __LINE__,
1091 fi_strerror(-ret), -ret);
1092
1093 return OMPI_ERROR;
1094 }
1095
1096
1097