This source file includes the following definitions.
- channel_addr2str
- add_procs_block_create_endpoints
- add_procs_warn_unreachable
- add_procs_block_reap_fi_av_inserts
- add_procs_create_endpoints
- usnic_add_procs
- usnic_del_procs
- usnic_register_pml_err_cb
- usnic_alloc
- usnic_free
- pack_chunk_seg_from_frag
- usnic_finalize
- get_send_credits
- usnic_do_resends
- usnic_handle_large_send
- opal_btl_usnic_module_progress_sends
- usnic_send
- usnic_sendi
- usnic_reg_mr
- usnic_dereg_mr
- module_async_event_callback
- create_ep
- finalize_one_channel
- init_one_channel
- get_initial_seq_no
- init_module_globals
- init_local_modex_part1
- init_find_transport_header_len
- init_queue_lengths
- init_payload_lengths
- init_pml_values
- init_senders
- init_connectivity_checker
- init_hwloc
- init_procs
- init_mpool
- init_channels
- init_local_modex_part2
- init_async_event
- init_random_objects
- init_freelists
- opal_btl_usnic_module_init
- usnic_ft_event
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 #include "opal_config.h"
28
29 #include <errno.h>
30 #include <string.h>
31 #include <unistd.h>
32 #include <stdlib.h>
33 #include <time.h>
34
35 #include "opal_stdint.h"
36 #include "opal/class/opal_bitmap.h"
37 #include "opal/prefetch.h"
38 #include "opal/util/output.h"
39 #include "opal/datatype/opal_convertor.h"
40 #include "opal/util/show_help.h"
41 #include "opal/util/printf.h"
42 #include "opal/mca/memchecker/base/base.h"
43
44 #include "opal/mca/btl/btl.h"
45 #include "opal/mca/btl/base/btl_base_error.h"
46 #include "opal/mca/mpool/base/base.h"
47 #include "opal/mca/mpool/mpool.h"
48 #include "opal/mca/rcache/base/base.h"
49 #include "opal/mca/rcache/rcache.h"
50
51 #include "btl_usnic_compat.h"
52 #include "btl_usnic.h"
53 #include "btl_usnic_connectivity.h"
54 #include "btl_usnic_frag.h"
55 #include "btl_usnic_proc.h"
56 #include "btl_usnic_endpoint.h"
57 #include "btl_usnic_module.h"
58 #include "btl_usnic_util.h"
59 #include "btl_usnic_send.h"
60 #include "btl_usnic_ack.h"
61 #include "btl_usnic_hwloc.h"
62 #include "btl_usnic_stats.h"
63
64 static void finalize_one_channel(opal_btl_usnic_module_t *module,
65 struct opal_btl_usnic_channel_t *channel);
66
67 static int channel_addr2str(opal_btl_usnic_module_t *module, int channel,
68 char *str, size_t len_param)
69 {
70 size_t len;
71
72 len = len_param;
73 fi_av_straddr(module->av, module->mod_channels[channel].info->src_addr,
74 str, &len);
75 if (len > len_param) {
76 opal_show_help("help-mpi-btl-usnic.txt",
77 "libfabric API failed",
78 true,
79 opal_process_info.nodename,
80 module->linux_device_name,
81 "fi_av_straddr", __FILE__, __LINE__,
82 FI_ENODATA,
83 "Failed to convert address to string: buffer too short");
84
85 return OPAL_ERR_OUT_OF_RESOURCE;
86 }
87
88 return OPAL_SUCCESS;
89 }
90
91
92
93
94
95
96 static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
97 size_t block_offset,
98 size_t block_len,
99 opal_proc_t **procs,
100 mca_btl_base_endpoint_t **endpoints)
101 {
102 int rc;
103 opal_proc_t* my_proc;
104 size_t num_created = 0;
105
106
107 my_proc = opal_proc_local_get();
108 if (NULL == my_proc) {
109 return OPAL_ERR_OUT_OF_RESOURCE;
110 }
111
112
113 for (size_t i = block_offset; i < (block_offset + block_len); i++) {
114 struct opal_proc_t* opal_proc = procs[i];
115 opal_btl_usnic_proc_t* usnic_proc;
116 mca_btl_base_endpoint_t* usnic_endpoint;
117
118 endpoints[i] = NULL;
119
120
121 if (opal_proc == my_proc) {
122 opal_output_verbose(75, USNIC_OUT,
123 "btl:usnic:add_procs:%s: not connecting to self",
124 module->linux_device_name);
125 continue;
126 }
127
128
129 if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) {
130 opal_output_verbose(75, USNIC_OUT,
131 "btl:usnic:add_procs:%s: not connecting to %s on same server",
132 module->linux_device_name,
133 usnic_compat_proc_name_print(&opal_proc->proc_name));
134 continue;
135 }
136
137
138
139
140
141 usnic_proc = NULL;
142 rc = opal_btl_usnic_proc_match(opal_proc, module, &usnic_proc);
143 if (OPAL_ERR_UNREACH == rc) {
144
145
146 opal_output_verbose(75, USNIC_OUT,
147 "btl:usnic:add_procs:%s: peer %s on %s does not have usnic modex info; skipping",
148 module->linux_device_name,
149 usnic_compat_proc_name_print(&opal_proc->proc_name),
150 opal_get_proc_hostname(opal_proc));
151 continue;
152 } else if (OPAL_SUCCESS != rc) {
153 return OPAL_ERR_OUT_OF_RESOURCE;
154 }
155
156
157
158 usnic_endpoint = NULL;
159 rc = opal_btl_usnic_create_endpoint(module, usnic_proc,
160 &usnic_endpoint);
161 if (OPAL_SUCCESS != rc) {
162 opal_output_verbose(5, USNIC_OUT,
163 "btl:usnic:add_procs:%s: unable to create endpoint to peer %s on %s",
164 module->linux_device_name,
165 usnic_compat_proc_name_print(&opal_proc->proc_name),
166 opal_get_proc_hostname(opal_proc));
167 OBJ_RELEASE(usnic_proc);
168 continue;
169 }
170
171
172 opal_pointer_array_add(&module->all_procs, usnic_proc);
173
174 char str[IPV4STRADDRLEN];
175 struct opal_btl_usnic_modex_t *modex =
176 &usnic_endpoint->endpoint_remote_modex;
177 opal_btl_usnic_snprintf_ipv4_addr(str, sizeof(str),
178 modex->ipv4_addr,
179 modex->netmask);
180
181 char local_pri_addr[64] = {0};
182 rc = channel_addr2str(module, USNIC_PRIORITY_CHANNEL,
183 local_pri_addr, sizeof(local_pri_addr));
184 if (OPAL_SUCCESS != rc) {
185 OBJ_RELEASE(usnic_proc);
186 continue;
187 }
188
189 char local_data_addr[64] = {0};
190 rc = channel_addr2str(module, USNIC_DATA_CHANNEL,
191 local_data_addr, sizeof(local_data_addr));
192 if (OPAL_SUCCESS != rc) {
193 OBJ_RELEASE(usnic_proc);
194 continue;
195 }
196
197 opal_output_verbose(5, USNIC_OUT,
198 "btl:usnic:add_procs:%s: new usnic peer endpoint: pri=%s:%d, data=%s:%d (local: pri=%s, data=%s)",
199 module->linux_device_name,
200 str, modex->ports[USNIC_PRIORITY_CHANNEL],
201 str, modex->ports[USNIC_DATA_CHANNEL],
202 local_pri_addr,
203 local_data_addr);
204
205 endpoints[i] = usnic_endpoint;
206 ++num_created;
207 }
208
209 opal_output_verbose(5, USNIC_OUT,
210 "btl:usnic: made %" PRIsize_t " endpoints",
211 num_created);
212 return OPAL_SUCCESS;
213 }
214
215
216
217
218
219
220
221 static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
222 opal_btl_usnic_endpoint_t *endpoint)
223 {
224
225 if (!mca_btl_usnic_component.show_route_failures) {
226 return;
227 }
228
229 char remote[IPV4STRADDRLEN];
230 opal_btl_usnic_snprintf_ipv4_addr(remote, sizeof(remote),
231 endpoint->endpoint_remote_modex.ipv4_addr,
232 endpoint->endpoint_remote_modex.netmask);
233
234 opal_output_verbose(15, USNIC_OUT,
235 "btl:usnic: %s (which is %s) couldn't reach peer %s",
236 module->linux_device_name,
237 module->if_ipv4_addr_str,
238 remote);
239 opal_show_help("help-mpi-btl-usnic.txt", "unreachable peer IP",
240 true,
241 opal_process_info.nodename,
242 module->if_ipv4_addr_str,
243 module->linux_device_name,
244 opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
245 remote);
246 }
247
248
249
250
251 static int
252 add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
253 size_t block_offset,
254 size_t block_len,
255 struct mca_btl_base_endpoint_t **endpoints)
256 {
257 int ret = OPAL_SUCCESS;
258 int num_left;
259 size_t i, channel;
260 uint32_t event;
261 struct fi_eq_entry entry;
262 struct fi_eq_err_entry err_entry;
263 bool error_occurred = false;
264
265
266 num_left = 0;
267 for (i = block_offset; i < (block_offset + block_len); ++i) {
268 if (NULL != endpoints[i]) {
269 num_left += USNIC_NUM_CHANNELS;
270 }
271 }
272
273
274
275 while (num_left > 0) {
276 opal_btl_usnic_addr_context_t *context;
277
278 ret = fi_eq_sread(module->av_eq, &event, &entry, sizeof(entry), -1, 0);
279
280
281
282
283
284 if (sizeof(entry) == ret) {
285 context = entry.context;
286 free(context);
287 --num_left;
288 ret = 0;
289 }
290
291
292
293
294
295
296
297
298
299
300
301
302 else if (-FI_EAVAIL == ret) {
303 ret = fi_eq_readerr(module->av_eq, &err_entry, 0);
304 if (sizeof(err_entry) == ret) {
305 context = err_entry.context;
306
307
308
309
310
311 if (EADDRNOTAVAIL == err_entry.err ||
312 EHOSTUNREACH == err_entry.err) {
313
314
315
316
317
318
319
320
321
322
323 for (i = block_offset; i < (block_offset + block_len); ++i) {
324 if (endpoints[i] == context->endpoint) {
325 add_procs_warn_unreachable(module,
326 context->endpoint);
327 OBJ_RELEASE(context->endpoint);
328 endpoints[i] = NULL;
329 break;
330 }
331 }
332 ret = 0;
333 }
334
335
336
337 else {
338 opal_show_help("help-mpi-btl-usnic.txt",
339 "libfabric API failed",
340 true,
341 opal_process_info.nodename,
342 module->linux_device_name,
343 "async insertion result", __FILE__, __LINE__,
344 err_entry.err,
345 "Failed to insert address to AV");
346 ret = OPAL_ERR_OUT_OF_RESOURCE;
347 error_occurred = true;
348
349
350
351 }
352
353
354
355
356
357 } else {
358
359
360
361
362 opal_show_help("help-mpi-btl-usnic.txt",
363 "internal error during init",
364 true,
365 opal_process_info.nodename,
366 module->linux_device_name,
367 "fi_eq_readerr()", __FILE__, __LINE__,
368 ret,
369 "Returned != sizeof(err_entry)");
370 ret = OPAL_ERR_OUT_OF_RESOURCE;
371 error_occurred = true;
372
373
374
375 opal_btl_usnic_exit(module);
376 }
377 } else {
378
379
380
381
382
383 opal_show_help("help-mpi-btl-usnic.txt",
384 "internal error during init",
385 true,
386 opal_process_info.nodename,
387 module->linux_device_name,
388 "fi_eq_sread()", __FILE__, __LINE__,
389 ret,
390 "Returned != (sizeof(entry) or -FI_EAVAIL)");
391 ret = OPAL_ERR_OUT_OF_RESOURCE;
392 error_occurred = true;
393
394
395
396 opal_btl_usnic_exit(module);
397 }
398 }
399
400
401
402
403
404 size_t num_endpoints_created = 0;
405 for (i = block_offset; i < (block_offset + block_len); i++) {
406 if (NULL != endpoints[i]) {
407 bool happy;
408
409 happy = true;
410 if (error_occurred) {
411 happy = false;
412 } else {
413 for (channel = 0; channel < USNIC_NUM_CHANNELS; ++channel) {
414 if (FI_ADDR_NOTAVAIL ==
415 endpoints[i]->endpoint_remote_addrs[channel]) {
416 happy = false;
417 break;
418 }
419 }
420 }
421
422 if (happy) {
423 ++num_endpoints_created;
424 } else {
425 OBJ_RELEASE(endpoints[i]);
426 endpoints[i] = NULL;
427 }
428 }
429 }
430
431
432 opal_output_verbose(5, USNIC_OUT,
433 "btl:usnic: created destinations for %" PRIsize_t
434 " endpoints",
435 num_endpoints_created);
436 return ret;
437 }
438
439
440
441
442 static int add_procs_create_endpoints(struct opal_btl_usnic_module_t* module,
443 size_t nprocs,
444 struct opal_proc_t **procs,
445 struct mca_btl_base_endpoint_t** endpoints)
446 {
447
448
449
450
451
452
453
454
455 if (module->av_eq_size < 8) {
456 opal_show_help("help-mpi-btl-usnic.txt", "fi_av_eq too small",
457 true,
458 opal_process_info.nodename,
459 module->av_eq_size,
460 8);
461 return OPAL_ERR_OUT_OF_RESOURCE;
462 }
463
464 size_t eq_size = module->av_eq_size - 8;
465 size_t block_len = eq_size;
466 size_t num_av_inserts = nprocs * USNIC_NUM_CHANNELS;
467 size_t num_blocks = num_av_inserts / block_len;
468 if (num_av_inserts % block_len != 0) {
469 ++num_blocks;
470 }
471
472
473
474
475 block_len /= USNIC_NUM_CHANNELS;
476
477
478
479 int rc;
480 for (size_t block_offset = 0, block = 0; block < num_blocks;
481 block_offset += block_len, ++block) {
482
483 if (block_len > (nprocs - block_offset)) {
484 block_len = nprocs - block_offset;
485 }
486
487
488
489 rc = add_procs_block_create_endpoints(module,
490 block_offset, block_len,
491 procs, endpoints);
492 if (OPAL_SUCCESS != rc) {
493 return rc;
494 }
495
496
497
498
499
500
501 rc = add_procs_block_reap_fi_av_inserts(module,
502 block_offset, block_len,
503 endpoints);
504 if (OPAL_SUCCESS != rc) {
505 return rc;
506 }
507 }
508
509 return OPAL_SUCCESS;
510 }
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529 static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
530 size_t nprocs,
531 struct opal_proc_t **procs,
532 struct mca_btl_base_endpoint_t** endpoints,
533 opal_bitmap_t* reachable)
534 {
535 opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*) base_module;
536 int rc;
537
538
539
540 rc = add_procs_create_endpoints(module, nprocs, procs, endpoints);
541 if (OPAL_SUCCESS != rc) {
542 goto fail;
543 }
544
545
546
547 for (size_t i = 0; NULL != reachable && i < nprocs; ++i) {
548 if (NULL != endpoints[i]) {
549 bool happy = true;
550 for (int channel = 0; channel < USNIC_NUM_CHANNELS; ++channel) {
551 if (FI_ADDR_NOTAVAIL ==
552 endpoints[i]->endpoint_remote_addrs[channel]) {
553 happy = false;
554 break;
555 }
556 }
557
558 if (happy) {
559 opal_bitmap_set_bit(reachable, i);
560 }
561 }
562 }
563
564
565
566
567
568
569
570
571 static int num_times_add_procs_called = 0;
572 ++num_times_add_procs_called;
573 if (0 == (num_times_add_procs_called %
574 mca_btl_usnic_component.num_modules)) {
575 opal_btl_usnic_connectivity_map();
576 }
577
578 return OPAL_SUCCESS;
579
580 fail:
581
582
583
584 for (size_t i = 0; i < nprocs; ++i) {
585 if (NULL != endpoints[i]) {
586 OBJ_RELEASE(endpoints[i]);
587 endpoints[i] = NULL;
588 }
589 }
590
591 return rc;
592 }
593
594
595
596
597
598
599
600 static int usnic_del_procs(struct mca_btl_base_module_t *base_module,
601 size_t nprocs,
602 struct opal_proc_t **procs,
603 struct mca_btl_base_endpoint_t **peers)
604 {
605 size_t i, j;
606 opal_btl_usnic_module_t *module;
607 opal_btl_usnic_endpoint_t *endpoint;
608 int index;
609
610 module = (struct opal_btl_usnic_module_t *)base_module;
611
612 for (i = 0; i < nprocs; i++) {
613 opal_btl_usnic_proc_t* proc =
614 opal_btl_usnic_proc_lookup_ompi(procs[i]);
615 if (NULL != proc) {
616
617
618 for (j = 0; j < proc->proc_endpoint_count; ++j) {
619 endpoint = proc->proc_endpoints[j];
620 if (NULL != endpoint && endpoint->endpoint_module == module) {
621
622
623
624
625
626
627
628
629
630 if (!ENDPOINT_DRAINED(endpoint)) {
631 opal_btl_usnic_flush_endpoint(endpoint);
632 }
633
634
635 OBJ_RELEASE(endpoint);
636
637 break;
638 }
639 }
640
641
642 for (index = 0; index < module->all_procs.size; ++index) {
643 if (opal_pointer_array_get_item(&module->all_procs, index) ==
644 proc) {
645 OBJ_RELEASE(proc);
646 opal_pointer_array_set_item(&module->all_procs, index,
647 NULL);
648 break;
649 }
650 }
651 }
652 }
653
654 return OPAL_SUCCESS;
655 }
656
657
658
659
660
661 static int usnic_register_pml_err_cb(struct mca_btl_base_module_t* btl,
662 mca_btl_base_module_error_cb_fn_t cbfunc)
663 {
664 opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) btl;
665
666 module->pml_error_callback = cbfunc;
667
668 return OPAL_SUCCESS;
669 }
670
671
672
673
674
675
676
677
678 static mca_btl_base_descriptor_t*
679 usnic_alloc(struct mca_btl_base_module_t* btl,
680 struct mca_btl_base_endpoint_t* endpoint,
681 uint8_t order,
682 size_t size,
683 uint32_t flags)
684 {
685 opal_btl_usnic_send_frag_t *frag;
686 opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) btl;
687 mca_btl_base_descriptor_t *desc;
688
689
690
691 if (OPAL_LIKELY(size <= module->max_frag_payload)) {
692 opal_btl_usnic_small_send_frag_t *sfrag;
693
694 sfrag = opal_btl_usnic_small_send_frag_alloc(module);
695 if (NULL == sfrag) {
696 return NULL;
697 }
698 frag = &sfrag->ssf_base;
699
700
701
702
703
704 } else {
705 opal_btl_usnic_large_send_frag_t *lfrag;
706
707
708 if (OPAL_UNLIKELY(size > module->super.btl_eager_limit)) {
709 size = module->super.btl_eager_limit;
710 }
711
712 lfrag = opal_btl_usnic_large_send_frag_alloc(module);
713 if (OPAL_UNLIKELY(NULL == lfrag)) {
714 return NULL;
715 }
716 frag = &lfrag->lsf_base;
717
718 assert(size > 0);
719 lfrag->lsf_buffer = malloc(size);
720 if (OPAL_UNLIKELY(NULL == lfrag->lsf_buffer)) {
721 opal_btl_usnic_frag_return(module, &lfrag->lsf_base.sf_base);
722 return NULL;
723 }
724
725
726 frag->sf_base.uf_base.USNIC_SEND_LOCAL[0].seg_addr.pval =
727 lfrag->lsf_buffer;
728
729 MSGDEBUG1_OUT("usnic_alloc: packing frag %p on the fly", (void *)frag);
730 lfrag->lsf_pack_on_the_fly = true;
731 }
732
733 #if MSGDEBUG2
734 opal_output(0, "usnic_alloc: %s frag=%p, size=%d, flags=0x%x\n",
735 (size <= module->max_frag_payload)?"small":"large",
736 (void *)frag, (int)size, flags);
737 #endif
738
739
740 frag->sf_endpoint = endpoint;
741
742
743 desc = &frag->sf_base.uf_base;
744 desc->des_flags = flags;
745 desc->USNIC_SEND_LOCAL[0].seg_len = size;
746 desc->USNIC_SEND_LOCAL_COUNT = 1;
747
748 return desc;
749 }
750
751
752
753
754
755
756
757 static int usnic_free(struct mca_btl_base_module_t* btl,
758 mca_btl_base_descriptor_t* des)
759 {
760 opal_btl_usnic_frag_t* frag = (opal_btl_usnic_frag_t*)des;
761
762 #if MSGDEBUG2
763 opal_output(0, "usnic_free: %p (%s)\n", (void*)frag,
764 usnic_frag_type(frag->uf_type));
765 #endif
766
767
768
769 frag->uf_base.des_flags |= MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
770
771 opal_btl_usnic_frag_return_cond((struct opal_btl_usnic_module_t *)btl,
772 frag);
773
774 return OPAL_SUCCESS;
775 }
776
777
778
779
780
781
782
783 static
784 opal_btl_usnic_chunk_segment_t *
785 pack_chunk_seg_from_frag(
786 struct opal_btl_usnic_module_t* module,
787 opal_btl_usnic_large_send_frag_t *lfrag)
788 {
789 opal_btl_usnic_chunk_segment_t *seg;
790 uint8_t *copyptr;
791 size_t copylen;
792 size_t seg_space;
793 size_t max_data;
794
795 assert(NULL != lfrag);
796
797 assert(lfrag->lsf_pack_bytes_left > 0);
798
799 seg = opal_btl_usnic_chunk_segment_alloc(module);
800 if (OPAL_UNLIKELY(NULL == seg)) {
801
802
803
804 opal_btl_usnic_util_abort("chunk segment allocation error",
805 __FILE__, __LINE__);
806 }
807
808 seg_space = module->max_chunk_payload;
809 copyptr = seg->ss_base.us_payload.raw;
810
811
812
813
814 while (seg_space > 0 &&
815 lfrag->lsf_pack_bytes_left > 0 &&
816 NULL != lfrag->lsf_cur_ptr) {
817 if (seg_space > lfrag->lsf_bytes_left_in_sge) {
818 copylen = lfrag->lsf_bytes_left_in_sge;
819 } else {
820 copylen = seg_space;
821 }
822
823 memcpy(copyptr, lfrag->lsf_cur_ptr, copylen);
824 seg_space -= copylen;
825 copyptr += copylen;
826 lfrag->lsf_bytes_left_in_sge -= copylen;
827 lfrag->lsf_pack_bytes_left -= copylen;
828 if (lfrag->lsf_bytes_left_in_sge > 0) {
829 lfrag->lsf_cur_ptr += copylen;
830 } else {
831 ++lfrag->lsf_cur_sge;
832 lfrag->lsf_cur_ptr =
833 lfrag->lsf_des_src[lfrag->lsf_cur_sge].seg_addr.pval;
834 lfrag->lsf_bytes_left_in_sge =
835 lfrag->lsf_des_src[lfrag->lsf_cur_sge].seg_len;
836 }
837 }
838
839 if (seg_space > 0 && lfrag->lsf_pack_bytes_left > 0) {
840
841 assert(NULL == lfrag->lsf_cur_ptr);
842 assert(1 == lfrag->lsf_cur_sge);
843
844 copylen = lfrag->lsf_pack_bytes_left;
845 if (copylen > seg_space) {
846 copylen = seg_space;
847 }
848 usnic_convertor_pack_simple(&lfrag->lsf_base.sf_convertor, copyptr,
849 copylen, &max_data);
850 seg_space -= max_data;
851 lfrag->lsf_bytes_left_in_sge -= max_data;
852 lfrag->lsf_pack_bytes_left -= max_data;
853 }
854
855 MSGDEBUG1_OUT("%s: packed seg=%p, frag=%p, payload=%zd\n",
856 __func__, (void *)seg, (void *)lfrag,
857 (module->max_chunk_payload - seg_space));
858
859 assert(lfrag->lsf_cur_sge <= 2);
860 assert(seg_space < module->max_chunk_payload);
861
862 seg->ss_parent_frag = &lfrag->lsf_base;
863 seg->ss_len = module->max_chunk_payload - seg_space;
864
865 return seg;
866 }
867
868 static int usnic_finalize(struct mca_btl_base_module_t* btl)
869 {
870 opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*)btl;
871
872 if (module->device_async_event_active) {
873 opal_event_del(&(module->device_async_event));
874 module->device_async_event_active = false;
875 }
876
877 opal_btl_usnic_connectivity_unlisten(module);
878
879 finalize_one_channel(module,
880 &module->mod_channels[USNIC_DATA_CHANNEL]);
881 finalize_one_channel(module,
882 &module->mod_channels[USNIC_PRIORITY_CHANNEL]);
883
884
885 opal_btl_usnic_stats_finalize(module);
886
887
888
889
890 opal_mutex_lock(&module->all_endpoints_lock);
891 OBJ_DESTRUCT(&(module->all_endpoints));
892 module->all_endpoints_constructed = false;
893 opal_mutex_unlock(&module->all_endpoints_lock);
894
895
896 assert(opal_list_is_empty(&(module->pending_resend_segs)));
897 OBJ_DESTRUCT(&module->pending_resend_segs);
898
899
900
901
902 while (!opal_list_is_empty(&(module->endpoints_that_need_acks))) {
903 (void) opal_list_remove_first(&(module->endpoints_that_need_acks));
904 }
905 OBJ_DESTRUCT(&module->endpoints_that_need_acks);
906
907
908
909
910 OBJ_DESTRUCT(&module->all_procs);
911
912 for (int i = module->first_pool; i <= module->last_pool; ++i) {
913 OBJ_DESTRUCT(&module->module_recv_buffers[i]);
914 }
915 free(module->module_recv_buffers);
916
917 OBJ_DESTRUCT(&module->ack_segs);
918 OBJ_DESTRUCT(&module->endpoints_with_sends);
919 OBJ_DESTRUCT(&module->small_send_frags);
920 OBJ_DESTRUCT(&module->large_send_frags);
921 OBJ_DESTRUCT(&module->put_dest_frags);
922 OBJ_DESTRUCT(&module->chunk_segs);
923 OBJ_DESTRUCT(&module->senders);
924
925 mca_rcache_base_module_destroy(module->rcache);
926
927 if (NULL != module->av) {
928 fi_close(&module->av->fid);
929 }
930 if (NULL != module->av_eq) {
931 fi_close(&module->av_eq->fid);
932 }
933 if (NULL != module->dom_eq) {
934 fi_close(&module->dom_eq->fid);
935 }
936 fi_close(&module->domain->fid);
937 fi_close(&module->fabric->fid);
938
939 free(module->linux_device_name);
940
941 return OPAL_SUCCESS;
942 }
943
944 static inline unsigned
945 get_send_credits(struct opal_btl_usnic_channel_t *chan)
946 {
947 return chan->credits;
948 }
949
950 static void
951 usnic_do_resends(
952 opal_btl_usnic_module_t *module)
953 {
954 opal_btl_usnic_send_segment_t *sseg;
955 opal_btl_usnic_endpoint_t *endpoint;
956 struct opal_btl_usnic_channel_t *data_channel;
957 int ret;
958
959 data_channel = &module->mod_channels[USNIC_DATA_CHANNEL];
960
961 while ((get_send_credits(data_channel) > 1) &&
962 !opal_list_is_empty(&module->pending_resend_segs)) {
963
964
965
966
967
968 sseg = (opal_btl_usnic_send_segment_t *)
969 opal_list_remove_first(&module->pending_resend_segs);
970 endpoint = sseg->ss_parent_frag->sf_endpoint;
971
972
973 sseg->ss_base.us_btl_header->ack_present = 0;
974
975
976 if (sseg->ss_send_posted == 0) {
977
978
979 sseg->ss_channel = USNIC_DATA_CHANNEL;
980
981
982 opal_btl_usnic_post_segment(module, endpoint, sseg);
983
984
985
986
987
988
989
990
991
992 --endpoint->endpoint_send_credits;
993 ++module->stats.num_resends;
994 }
995
996
997 ret = opal_hotel_checkin(&endpoint->endpoint_hotel,
998 sseg, &sseg->ss_hotel_room);
999 if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
1000 opal_btl_usnic_util_abort("hotel checkin failed\n", __FILE__, __LINE__);
1001 }
1002 }
1003 }
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014 static void
1015 usnic_handle_large_send(
1016 opal_btl_usnic_module_t *module,
1017 opal_btl_usnic_endpoint_t *endpoint,
1018 opal_btl_usnic_send_frag_t *frag)
1019 {
1020 opal_btl_usnic_large_send_frag_t *lfrag;
1021 opal_btl_usnic_btl_chunk_header_t *chp;
1022 opal_btl_usnic_send_segment_t *sseg;
1023 size_t payload_len;
1024
1025 assert(frag->sf_base.uf_type == OPAL_BTL_USNIC_FRAG_LARGE_SEND);
1026 lfrag = (opal_btl_usnic_large_send_frag_t *)frag;
1027 if (lfrag->lsf_cur_offset == 0) {
1028
1029 do {
1030 lfrag->lsf_frag_id = endpoint->endpoint_next_frag_id++;
1031 } while (lfrag->lsf_frag_id == 0);
1032 }
1033
1034 if (lfrag->lsf_pack_on_the_fly) {
1035 assert(opal_list_is_empty(&lfrag->lsf_seg_chain));
1036
1037
1038 sseg = pack_chunk_seg_from_frag(module, lfrag);
1039 } else {
1040
1041 sseg = (opal_btl_usnic_send_segment_t *)
1042 opal_list_remove_first(&lfrag->lsf_seg_chain);
1043 }
1044
1045 assert(NULL != sseg);
1046 payload_len = sseg->ss_len;
1047
1048 assert(payload_len > 0);
1049 assert(payload_len <= module->max_chunk_payload);
1050 assert(lfrag->lsf_bytes_left >= payload_len);
1051
1052
1053 sseg->ss_len = sizeof(opal_btl_usnic_btl_chunk_header_t) + payload_len;
1054 lfrag->lsf_bytes_left -= payload_len;
1055
1056
1057 chp = sseg->ss_base.us_btl_chunk_header;
1058 chp->ch_frag_id = lfrag->lsf_frag_id;
1059 chp->ch_frag_size = lfrag->lsf_base.sf_size;
1060 chp->ch_frag_offset = lfrag->lsf_cur_offset;
1061 chp->ch_hdr.tag = lfrag->lsf_tag;
1062
1063
1064 sseg->ss_base.us_btl_header->payload_len = payload_len;
1065
1066
1067
1068 opal_btl_usnic_endpoint_send_segment(module, sseg);
1069
1070
1071 lfrag->lsf_cur_offset += payload_len;
1072
1073 #if MSGDEBUG1
1074 opal_output(0, "%s: payload_len=%zd, bytes_left=%zd on_the_fly=%s\n",
1075 __func__, payload_len, lfrag->lsf_bytes_left,
1076 lfrag->lsf_pack_on_the_fly?"true":"false");
1077 #endif
1078
1079 if (lfrag->lsf_bytes_left == 0) {
1080
1081
1082
1083
1084 opal_list_remove_item(&endpoint->endpoint_frag_send_queue,
1085 &frag->sf_base.uf_base.super.super);
1086
1087
1088
1089
1090 if (frag->sf_base.uf_remote_seg[0].seg_addr.pval == NULL &&
1091 (frag->sf_base.uf_base.des_flags &
1092 MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
1093
1094 OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "large");
1095 }
1096 }
1097 }
1098
1099
1100
1101
1102
1103
1104 void
1105 opal_btl_usnic_module_progress_sends(
1106 opal_btl_usnic_module_t *module)
1107 {
1108 opal_btl_usnic_send_frag_t *frag;
1109 opal_btl_usnic_send_segment_t *sseg;
1110 opal_btl_usnic_endpoint_t *endpoint;
1111 struct opal_btl_usnic_channel_t *data_channel;
1112 struct opal_btl_usnic_channel_t *prio_channel;
1113
1114
1115
1116
1117
1118
1119
1120 data_channel = &module->mod_channels[USNIC_DATA_CHANNEL];
1121 prio_channel = &module->mod_channels[USNIC_PRIORITY_CHANNEL];
1122
1123
1124
1125
1126 OPAL_THREAD_LOCK(&btl_usnic_lock);
1127 if (OPAL_UNLIKELY(!opal_list_is_empty(&module->pending_resend_segs))) {
1128 usnic_do_resends(module);
1129 }
1130
1131
1132
1133
1134 while ((get_send_credits(data_channel) > 1) &&
1135 !opal_list_is_empty(&module->endpoints_with_sends)) {
1136 opal_btl_usnic_small_send_frag_t *sfrag;
1137 size_t payload_len;
1138
1139
1140
1141
1142
1143
1144
1145
1146 endpoint = (opal_btl_usnic_endpoint_t *)
1147 opal_list_get_first(&module->endpoints_with_sends);
1148 frag = (opal_btl_usnic_send_frag_t *)
1149 opal_list_get_first(&endpoint->endpoint_frag_send_queue);
1150
1151
1152
1153
1154
1155 if (frag->sf_base.uf_type == OPAL_BTL_USNIC_FRAG_SMALL_SEND) {
1156
1157
1158
1159
1160 opal_list_remove_item(&endpoint->endpoint_frag_send_queue,
1161 &frag->sf_base.uf_base.super.super);
1162
1163 sfrag = (opal_btl_usnic_small_send_frag_t *)frag;
1164 sseg = &sfrag->ssf_segment;
1165
1166
1167 payload_len = sfrag->ssf_base.sf_size;
1168 sseg->ss_base.us_btl_header->payload_len = payload_len;
1169
1170 #if MSGDEBUG1
1171 opal_output(0, "progress send small, frag=%p, ptr=%p, payload=%zd, len=%"PRIu32", ep=%p, tag=%d\n",
1172 (void *)frag,
1173 (void *)sseg->ss_ptr, payload_len,
1174 sseg->ss_len,
1175 (void *)frag->sf_endpoint,
1176 sseg->ss_base.us_btl_header->tag);
1177 #endif
1178
1179
1180 opal_btl_usnic_endpoint_send_segment(module, sseg);
1181
1182
1183 if (frag->sf_base.uf_remote_seg[0].seg_addr.pval == NULL) {
1184
1185
1186
1187
1188
1189
1190 if ((frag->sf_base.uf_base.des_flags &
1191 (MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
1192 MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) ==
1193 (MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
1194 MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
1195 OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "small");
1196 }
1197 }
1198
1199
1200 } else {
1201 usnic_handle_large_send(module, endpoint, frag);
1202 }
1203
1204
1205
1206
1207 if (opal_list_is_empty(&endpoint->endpoint_frag_send_queue) ||
1208 endpoint->endpoint_send_credits <= 0 ||
1209 !WINDOW_OPEN(endpoint)) {
1210
1211 opal_list_remove_item(&module->endpoints_with_sends,
1212 &endpoint->super);
1213 endpoint->endpoint_ready_to_send = false;
1214 }
1215 }
1216
1217
1218
1219
1220 endpoint = opal_btl_usnic_get_first_endpoint_needing_ack(module);
1221 while (get_send_credits(prio_channel) > 1 && endpoint != NULL) {
1222 opal_btl_usnic_endpoint_t *next_endpoint;
1223
1224
1225 next_endpoint = opal_btl_usnic_get_next_endpoint_needing_ack(endpoint);
1226
1227
1228 if (endpoint->endpoint_acktime == 0 ||
1229 endpoint->endpoint_acktime <= get_nsec()) {
1230 if (OPAL_LIKELY(opal_btl_usnic_ack_send(module, endpoint) == OPAL_SUCCESS)) {
1231 opal_btl_usnic_remove_from_endpoints_needing_ack(endpoint);
1232 } else {
1233
1234
1235 break;
1236 }
1237 }
1238
1239 endpoint = next_endpoint;
1240 }
1241 OPAL_THREAD_UNLOCK(&btl_usnic_lock);
1242 }
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262 static int
1263 usnic_send(
1264 struct mca_btl_base_module_t* base_module,
1265 struct mca_btl_base_endpoint_t* base_endpoint,
1266 struct mca_btl_base_descriptor_t* descriptor,
1267 mca_btl_base_tag_t tag)
1268 {
1269 int rc;
1270 opal_btl_usnic_send_frag_t *frag;
1271 opal_btl_usnic_small_send_frag_t *sfrag;
1272 opal_btl_usnic_endpoint_t *endpoint;
1273 opal_btl_usnic_module_t *module;
1274 opal_btl_usnic_send_segment_t *sseg;
1275
1276 OPAL_THREAD_LOCK(&btl_usnic_lock);
1277 endpoint = (opal_btl_usnic_endpoint_t *)base_endpoint;
1278 module = (opal_btl_usnic_module_t *)base_module;
1279 frag = (opal_btl_usnic_send_frag_t*) descriptor;
1280
1281 assert(frag->sf_endpoint == endpoint);
1282 frag->sf_base.uf_remote_seg[0].seg_addr.pval = NULL;
1283
1284 opal_btl_usnic_compute_sf_size(frag);
1285 frag->sf_ack_bytes_left = frag->sf_size;
1286
1287 #if MSGDEBUG2
1288 opal_output(0, "usnic_send: frag=%p, endpoint=%p, tag=%d, sf_size=%d\n",
1289 (void *)frag, (void *)endpoint,
1290 tag, (int)frag->sf_size);
1291 #if MSGDEBUG1
1292 { unsigned i;
1293 opal_output(0, " descriptor->des_flags=0x%x\n", descriptor->des_flags);
1294 for (i=0; i<descriptor->USNIC_SEND_LOCAL_COUNT; ++i) {
1295 opal_output(0, " %d: ptr:%p len:%d\n", i,
1296 descriptor->USNIC_SEND_LOCAL[i].seg_addr.pval,
1297 descriptor->USNIC_SEND_LOCAL[i].seg_len);
1298 }
1299 }
1300 #endif
1301 #endif
1302
1303
1304
1305
1306
1307
1308 if (frag->sf_base.uf_type == OPAL_BTL_USNIC_FRAG_SMALL_SEND &&
1309 frag->sf_ack_bytes_left < module->max_tiny_payload &&
1310 WINDOW_OPEN(endpoint) &&
1311 (get_send_credits(&module->mod_channels[USNIC_DATA_CHANNEL]) >=
1312 module->mod_channels[USNIC_DATA_CHANNEL].fastsend_wqe_thresh)) {
1313 size_t payload_len;
1314
1315 sfrag = (opal_btl_usnic_small_send_frag_t *)frag;
1316 sseg = &sfrag->ssf_segment;
1317
1318 payload_len = frag->sf_ack_bytes_left;
1319 sseg->ss_base.us_btl_header->payload_len = payload_len;
1320
1321
1322
1323 if (frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT > 1) {
1324 memcpy(((char *)(intptr_t)frag->sf_base.uf_local_seg[0].seg_addr.lval +
1325 frag->sf_base.uf_local_seg[0].seg_len),
1326 frag->sf_base.uf_local_seg[1].seg_addr.pval,
1327 frag->sf_base.uf_local_seg[1].seg_len);
1328
1329
1330 frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 1;
1331 frag->sf_base.uf_local_seg[0].seg_len +=
1332 frag->sf_base.uf_local_seg[1].seg_len;
1333 }
1334
1335
1336 sseg->ss_len = sizeof(opal_btl_usnic_btl_header_t) + frag->sf_size;
1337
1338 sseg->ss_channel = USNIC_DATA_CHANNEL;
1339 sseg->ss_base.us_btl_header->tag = tag;
1340 #if MSGDEBUG1
1341 opal_output(0, "INLINE send, sseg=%p", (void *)sseg);
1342 #endif
1343
1344
1345 opal_btl_usnic_endpoint_send_segment(module, sseg);
1346
1347
1348
1349
1350
1351
1352 if (descriptor->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) {
1353 if (descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
1354 OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "immediate small");
1355 rc = 0;
1356 } else {
1357 #if MSGDEBUG1
1358 opal_output(0, "skipping callback for frag %p, returning 1\n", (void *)frag);
1359 #endif
1360 rc = 1;
1361 ++module->stats.pml_send_callbacks;
1362 }
1363 } else {
1364 #if MSGDEBUG1
1365 opal_output(0, "don't own descriptor, defer callback for frag %p\n", (void *)frag);
1366 #endif
1367 descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
1368 rc = 0;
1369 }
1370 } else {
1371
1372
1373
1374
1375
1376
1377
1378 rc = opal_btl_usnic_finish_put_or_send(module, endpoint, frag, tag);
1379
1380 frag->sf_base.uf_base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
1381 }
1382
1383 ++module->stats.pml_module_sends;
1384
1385 OPAL_THREAD_UNLOCK(&btl_usnic_lock);
1386 return rc;
1387 }
1388
#if 0
/*
 * Placeholder for a "send immediate" entry point.  This whole
 * definition is compiled out by the surrounding "#if 0"; if it were
 * enabled, the body would unconditionally return OPAL_ERROR without
 * touching any of its arguments.
 */
static int usnic_sendi(struct mca_btl_base_module_t* btl,
                       struct mca_btl_base_endpoint_t* endpoint,
                       struct opal_convertor_t* convertor,
                       void* header,
                       size_t header_size,
                       size_t payload_size,
                       uint8_t order,
                       uint32_t flags,
                       mca_btl_base_tag_t tag,
                       mca_btl_base_descriptor_t** descriptor)
{
    /* Not implemented: always report failure */
    return OPAL_ERROR;
}
#endif
1408
1409
1410
1411
1412
1413 static int usnic_reg_mr(void* reg_data, void* base, size_t size,
1414 mca_rcache_base_registration_t* reg)
1415 {
1416 opal_btl_usnic_module_t* mod = (opal_btl_usnic_module_t*)reg_data;
1417 opal_btl_usnic_reg_t* ur = (opal_btl_usnic_reg_t*)reg;
1418 int rc;
1419
1420 rc = fi_mr_reg(mod->domain, base, size, 0, 0, 0, 0, &ur->ur_mr, NULL);
1421 if (0 != rc) {
1422 return OPAL_ERR_OUT_OF_RESOURCE;
1423 }
1424
1425 return OPAL_SUCCESS;
1426 }
1427
1428 static int usnic_dereg_mr(void* reg_data,
1429 mca_rcache_base_registration_t* reg)
1430 {
1431 opal_btl_usnic_reg_t* ur = (opal_btl_usnic_reg_t*)reg;
1432
1433 if (ur->ur_mr != NULL) {
1434 if (0 != fi_close(&ur->ur_mr->fid)) {
1435 opal_output(0, "%s: error unpinning USD memory mr=%p: %s\n",
1436 __func__, (void*) ur->ur_mr, strerror(errno));
1437 return OPAL_ERROR;
1438 }
1439 }
1440
1441 ur->ur_mr = NULL;
1442 return OPAL_SUCCESS;
1443 }
1444
1445
1446
1447
1448
1449 static void module_async_event_callback(int fd, short flags, void *arg)
1450 {
1451 char *str = NULL;
1452 bool fatal = false;
1453 opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) arg;
1454 uint32_t event;
1455 struct fi_eq_entry entry;
1456
1457
1458 int ret = fi_eq_read(module->dom_eq, &event, &entry, sizeof(entry), 0);
1459 if (-FI_EAGAIN == ret) {
1460
1461 return;
1462 }
1463
1464 else if (ret != 0) {
1465 opal_show_help("help-mpi-btl-usnic.txt", "libfabric API failed",
1466 true,
1467 opal_process_info.nodename,
1468 module->linux_device_name,
1469 "fi_eq_read()", __FILE__, __LINE__,
1470 ret,
1471 "Failed to get domain event");
1472 fatal = true;
1473 }
1474
1475 else if (event == 42 ) {
1476 opal_memchecker_base_mem_defined(&event, sizeof(event));
1477 opal_memchecker_base_mem_defined(&entry, sizeof(entry));
1478 switch (entry.data) {
1479 case 0:
1480
1481
1482
1483
1484
1485 opal_output_verbose(10, USNIC_OUT,
1486 "btl:usnic: got LINK_UP on %s",
1487 module->linux_device_name);
1488 break;
1489
1490 case 1:
1491 str = "link down";
1492
1493
1494 default:
1495 if (NULL == str) {
1496 str = "unknown event";
1497 }
1498
1499
1500
1501
1502
1503 opal_show_help("help-mpi-btl-usnic.txt", "async event",
1504 true,
1505 opal_process_info.nodename,
1506 module->linux_device_name,
1507 str, entry.data);
1508 fatal = true;
1509 }
1510 }
1511
1512
1513
1514 if (fatal) {
1515 opal_btl_usnic_exit(module);
1516
1517 }
1518 }
1519
1520
1521
1522
/*
 * Create, bind and enable the libfabric endpoint for one channel.
 *
 * Steps, in order: duplicate the module's fabric info as a hint and
 * set the requested RX/TX queue sizes and UDP port in it; call
 * fi_getinfo() to obtain channel->info; open the endpoint; verify the
 * granted queue sizes; bind the channel's CQ (send and receive) and
 * the module's AV; enable the endpoint; and, if no port was
 * requested, query the port that was actually assigned.
 *
 * Returns OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE after showing a
 * help message.  On failure channel->info / channel->ep may be left
 * set; nothing is released here.
 */
static int create_ep(opal_btl_usnic_module_t* module,
                     struct opal_btl_usnic_channel_t *channel)
{
    int rc;
    struct sockaddr_in *sin;
    size_t addrlen;
    struct fi_info *hint;

    hint = fi_dupinfo(module->fabric_info);
    if (NULL == hint) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_dupinfo() failed", __FILE__, __LINE__,
                       -1, "Unknown");
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Ask for the queue depths configured for this channel */
    hint->rx_attr->size = channel->chan_rd_num;
    hint->tx_attr->size = channel->chan_sd_num;

    /* Port 0 when no base port is configured; otherwise the base port
       offset by this process' local rank. */
    sin = hint->src_addr;
    if (0 == mca_btl_usnic_component.udp_port_base) {
        sin->sin_port = 0;
    } else {
        sin->sin_port = htons(mca_btl_usnic_component.udp_port_base +
                              opal_process_info.my_local_rank);
    }

    /* The hint is no longer needed once fi_getinfo() has returned,
       whether it succeeded or not. */
    rc = fi_getinfo(module->libfabric_api, NULL, 0, 0, hint, &channel->info);
    fi_freeinfo(hint);
    if (0 != rc) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_getinfo() failed", __FILE__, __LINE__,
                       rc, fi_strerror(-rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    /* Clear the top capability bit on every channel except the
       priority channel.
       NOTE(review): the meaning of bit 63 is not visible here --
       confirm against the usNIC provider. */
    if (channel->chan_index != USNIC_PRIORITY_CHANNEL) {
        channel->info->caps &= ~(1ULL << 63);
    }

    /* Debug-only sanity checks: the address must be IPv4, reported
       either as FI_SOCKADDR_IN or as FI_SOCKADDR with AF_INET. */
#if !defined(NDEBUG)
    assert(FI_SOCKADDR_IN == channel->info->addr_format ||
           FI_SOCKADDR == channel->info->addr_format);
    if (FI_SOCKADDR == channel->info->addr_format) {
        struct sockaddr *sa;
        sa = (struct sockaddr *)channel->info->src_addr;
        assert(AF_INET == sa->sa_family);
    }
#endif

    sin = (struct sockaddr_in *)channel->info->src_addr;
    assert(sizeof(struct sockaddr_in) == channel->info->src_addrlen);

    /* NOTE(review): this expects port 0 in the returned info even
       though a non-zero port may have been placed in the hint above
       when udp_port_base is set -- confirm this holds in that
       configuration. */
    assert(0 == sin->sin_port);

    rc = fi_endpoint(module->domain, channel->info, &channel->ep, NULL);
    if (0 != rc || NULL == channel->ep) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_endpoint() failed", __FILE__, __LINE__,
                       rc, fi_strerror(-rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Check that the RX/TX queue lengths are at least as long as
       requested.  rc is set to the positive FI_ETOOSMALL here, hence
       fi_strerror(rc) rather than fi_strerror(-rc). */
    if ((int) channel->info->rx_attr->size < channel->chan_rd_num) {
        rc = FI_ETOOSMALL;
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "endpoint RX queue length is too short", __FILE__, __LINE__,
                       rc, fi_strerror(rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    if ((int) channel->info->tx_attr->size < channel->chan_sd_num) {
        rc = FI_ETOOSMALL;
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "endpoint TX queue length is too short", __FILE__, __LINE__,
                       rc, fi_strerror(rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Attach the channel's single CQ for both send and receive
       completions, then the module-wide address vector. */
    rc = fi_ep_bind(channel->ep, &channel->cq->fid, FI_SEND);
    if (0 != rc) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_ep_bind() SCQ to EP failed", __FILE__, __LINE__,
                       rc, fi_strerror(-rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    rc = fi_ep_bind(channel->ep, &channel->cq->fid, FI_RECV);
    if (0 != rc) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_ep_bind() RCQ to EP failed", __FILE__, __LINE__,
                       rc, fi_strerror(-rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    rc = fi_ep_bind(channel->ep, &module->av->fid, 0);
    if (0 != rc) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_ep_bind() AV to EP failed", __FILE__, __LINE__,
                       rc, fi_strerror(-rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Enable the endpoint */
    rc = fi_enable(channel->ep);
    if (0 != rc) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_enable() failed", __FILE__, __LINE__,
                       rc, fi_strerror(-rc));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* If no specific port was recorded, ask the enabled endpoint for
       its address; the result overwrites channel->info->src_addr
       (which sin points at), so sin->sin_port then holds the assigned
       port. */
    if (0 == sin->sin_port) {
        addrlen = sizeof(struct sockaddr_in);
        rc = fi_getname(&channel->ep->fid, channel->info->src_addr, &addrlen);
        if (0 != rc) {
            opal_show_help("help-mpi-btl-usnic.txt",
                           "internal error during init",
                           true,
                           opal_process_info.nodename,
                           module->linux_device_name,
                           "fi_getname() failed", __FILE__, __LINE__,
                           rc, fi_strerror(-rc));
            return OPAL_ERR_OUT_OF_RESOURCE;
        }
        assert(0 != sin->sin_port);
    }

    /* Human-readable channel name, used only for the verbose log */
    char *str;
    if (USNIC_PRIORITY_CHANNEL == channel->chan_index) {
        str = "priority";
    } else if (USNIC_DATA_CHANNEL == channel->chan_index) {
        str = "data";
    } else {
        str = "UNKNOWN";
    }
    opal_output_verbose(15, USNIC_OUT,
                        "btl:usnic:create_ep:%s: new usnic local endpoint channel %s: %s:%d",
                        module->linux_device_name,
                        str,
                        inet_ntoa(sin->sin_addr),
                        ntohs(sin->sin_port));

    return OPAL_SUCCESS;
}
1718
1719
1720
1721
1722
1723 static void finalize_one_channel(opal_btl_usnic_module_t *module,
1724 struct opal_btl_usnic_channel_t *channel)
1725 {
1726 if (NULL != channel->ep) {
1727 fi_close(&channel->ep->fid);
1728 channel->ep = NULL;
1729 }
1730
1731
1732 if (NULL != channel->cq) {
1733 fi_close(&channel->cq->fid);
1734 channel->cq = NULL;
1735 }
1736
1737 if (NULL != channel->info) {
1738 fi_freeinfo(channel->info);
1739 channel->info = NULL;
1740 }
1741
1742
1743
1744
1745 if (channel->recv_segs.ctx == module) {
1746 assert(NULL == channel->ep && NULL == channel->cq);
1747 OBJ_DESTRUCT(&channel->recv_segs);
1748 }
1749 }
1750
1751
1752
1753
1754 static int init_one_channel(opal_btl_usnic_module_t *module,
1755 int index,
1756 int max_msg_size,
1757 int rd_num,
1758 int sd_num,
1759 int cq_num)
1760 {
1761 int i;
1762 int rc;
1763 uint32_t segsize;
1764 opal_btl_usnic_recv_segment_t *rseg;
1765 opal_free_list_item_t* item;
1766 struct opal_btl_usnic_channel_t *channel;
1767 struct fi_cq_attr cq_attr;
1768
1769 channel = &module->mod_channels[index];
1770 channel->chan_max_msg_size = max_msg_size;
1771 channel->chan_rd_num = rd_num;
1772 channel->chan_sd_num = sd_num;
1773 channel->chan_index = index;
1774 channel->chan_deferred_recv = NULL;
1775 channel->chan_error = false;
1776
1777 channel->fastsend_wqe_thresh = sd_num - 10;
1778
1779 channel->credits = sd_num;
1780 channel->rx_post_cnt = 0;
1781
1782
1783
1784
1785
1786 memset(&cq_attr, 0, sizeof(cq_attr));
1787 cq_attr.format = FI_CQ_FORMAT_CONTEXT;
1788 cq_attr.wait_obj = FI_WAIT_NONE;
1789 cq_attr.size = cq_num;
1790 rc = fi_cq_open(module->domain, &cq_attr, &channel->cq, NULL);
1791 if (0 != rc) {
1792 opal_show_help("help-mpi-btl-usnic.txt",
1793 "internal error during init",
1794 true,
1795 opal_process_info.nodename,
1796 module->linux_device_name,
1797 "failed to create CQ", __FILE__, __LINE__,
1798 rc, fi_strerror(-rc));
1799 goto error;
1800 }
1801
1802
1803
1804 if ((int) cq_attr.size < cq_num) {
1805 rc = FI_ETOOSMALL;
1806 opal_show_help("help-mpi-btl-usnic.txt",
1807 "internal error during init",
1808 true,
1809 opal_process_info.nodename,
1810 module->linux_device_name,
1811 "created CQ is too small", __FILE__, __LINE__,
1812 rc, fi_strerror(rc));
1813 goto error;
1814 }
1815
1816
1817 rc = create_ep(module, channel);
1818 if (OPAL_SUCCESS != rc) {
1819 goto error;
1820 }
1821
1822 assert(channel->info->ep_attr->msg_prefix_size ==
1823 (uint32_t) mca_btl_usnic_component.transport_header_len);
1824
1825 opal_output_verbose(15, USNIC_OUT,
1826 "btl:usnic:init_one_channel:%s: channel %s, rx queue size=%" PRIsize_t ", tx queue size=%" PRIsize_t ", cq size=%" PRIsize_t ", send credits=%d",
1827 module->linux_device_name,
1828 (index == USNIC_PRIORITY_CHANNEL) ? "priority" : "data",
1829 channel->info->rx_attr->size,
1830 channel->info->tx_attr->size,
1831 cq_attr.size,
1832 channel->credits);
1833
1834
1835
1836
1837
1838
1839 segsize = (max_msg_size + channel->info->ep_attr->msg_prefix_size +
1840 opal_cache_line_size - 1) & ~(opal_cache_line_size - 1);
1841 OBJ_CONSTRUCT(&channel->recv_segs, opal_free_list_t);
1842 rc =
1843 usnic_compat_free_list_init(&channel->recv_segs,
1844 sizeof(opal_btl_usnic_recv_segment_t) ,
1845 opal_cache_line_size ,
1846 OBJ_CLASS(opal_btl_usnic_recv_segment_t),
1847 segsize ,
1848 opal_cache_line_size ,
1849 rd_num ,
1850 rd_num ,
1851 rd_num ,
1852 module->super.btl_mpool ,
1853 0 ,
1854 module->rcache ,
1855 NULL ,
1856 NULL );
1857 channel->recv_segs.ctx = module;
1858
1859
1860
1861 if (OPAL_SUCCESS != rc) {
1862 goto error;
1863 }
1864
1865
1866 for (i = 0; i < rd_num; i++) {
1867 USNIC_COMPAT_FREE_LIST_GET(&channel->recv_segs, item);
1868 assert(NULL != item);
1869 rseg = (opal_btl_usnic_recv_segment_t*)item;
1870
1871 if (NULL == rseg) {
1872 opal_show_help("help-mpi-btl-usnic.txt",
1873 "internal error during init",
1874 true,
1875 opal_process_info.nodename,
1876 module->linux_device_name,
1877 "Failed to get receive buffer from freelist",
1878 __FILE__, __LINE__);
1879 goto error;
1880 }
1881
1882
1883 rseg->rs_len = segsize;
1884
1885 rc = fi_recv(channel->ep, rseg->rs_protocol_header, segsize,
1886 NULL, FI_ADDR_UNSPEC, rseg);
1887 if (0 != rc) {
1888 opal_show_help("help-mpi-btl-usnic.txt",
1889 "internal error during init",
1890 true,
1891 opal_process_info.nodename,
1892 module->linux_device_name,
1893 "Failed to post receive buffer",
1894 __FILE__, __LINE__);
1895 goto error;
1896 }
1897 }
1898
1899 return OPAL_SUCCESS;
1900
1901 error:
1902 finalize_one_channel(module, channel);
1903 return OPAL_ERROR;
1904 }
1905
1906
1907
1908
1909 static opal_btl_usnic_seq_t
1910 get_initial_seq_no(void)
1911 {
1912 opal_btl_usnic_seq_t isn;
1913
1914 isn = (opal_btl_usnic_seq_t)opal_rand(&opal_btl_usnic_rand_buff);
1915
1916 return isn;
1917 }
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927 static void init_module_globals(opal_btl_usnic_module_t *module)
1928 {
1929 OBJ_CONSTRUCT(&module->all_endpoints_lock, opal_mutex_t);
1930 }
1931
1932
1933
1934
1935
1936 static void init_local_modex_part1(opal_btl_usnic_module_t *module)
1937 {
1938
1939
1940
1941 opal_btl_usnic_modex_t *modex = &module->local_modex;
1942 struct fi_info *info = module->fabric_info;
1943 struct fi_usnic_info *uip = &module->usnic_info;
1944 struct sockaddr_in *sin;
1945
1946 sin = info->src_addr;
1947 modex->ipv4_addr = sin->sin_addr.s_addr;
1948 modex->netmask = uip->ui.v1.ui_netmask_be;
1949 modex->max_msg_size = info->ep_attr->max_msg_size;
1950 modex->link_speed_mbps = uip->ui.v1.ui_link_speed;
1951
1952 opal_btl_usnic_snprintf_ipv4_addr(module->if_ipv4_addr_str,
1953 sizeof(module->if_ipv4_addr_str),
1954 modex->ipv4_addr,
1955 modex->netmask);
1956
1957 opal_output_verbose(5, USNIC_OUT,
1958 "btl:usnic: %s IP charactertics: %s, %u Mbps",
1959 module->linux_device_name,
1960 module->if_ipv4_addr_str,
1961 modex->link_speed_mbps);
1962 }
1963
1964
1965
1966
1967
1968
1969
1970
1971 static void init_find_transport_header_len(opal_btl_usnic_module_t *module)
1972 {
1973 mca_btl_usnic_component.transport_header_len =
1974 module->fabric_info->ep_attr->msg_prefix_size;
1975 mca_btl_usnic_component.transport_protocol =
1976 module->fabric_info->ep_attr->protocol;
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995 uint32_t libfabric_api;
1996 libfabric_api = fi_version();
1997 if (1 == FI_MAJOR(libfabric_api) &&
1998 0 == FI_MINOR(libfabric_api)) {
1999 mca_btl_usnic_component.prefix_send_offset = 0;
2000 } else {
2001 mca_btl_usnic_component.prefix_send_offset =
2002 module->fabric_info->ep_attr->msg_prefix_size;
2003 }
2004 }
2005
2006
2007
2008
2009 static void init_queue_lengths(opal_btl_usnic_module_t *module)
2010 {
2011 bool cq_is_sum = false;
2012 if (-1 == mca_btl_usnic_component.cq_num) {
2013 cq_is_sum = true;
2014 }
2015
2016 if (-1 == mca_btl_usnic_component.sd_num) {
2017 module->sd_num = module->fabric_info->tx_attr->size;
2018 } else {
2019 module->sd_num = mca_btl_usnic_component.sd_num;
2020 }
2021 if (-1 == mca_btl_usnic_component.rd_num) {
2022 module->rd_num = module->fabric_info->rx_attr->size;
2023 } else {
2024 module->rd_num = mca_btl_usnic_component.rd_num;
2025 }
2026 if (cq_is_sum) {
2027 module->cq_num = module->rd_num + module->sd_num;
2028 } else {
2029 module->cq_num = mca_btl_usnic_component.cq_num;
2030 }
2031 module->av_eq_num = mca_btl_usnic_component.av_eq_num;
2032
2033
2034
2035
2036
2037
2038
2039
2040 if (-1 == mca_btl_usnic_component.prio_sd_num) {
2041 module->prio_sd_num = max(128, 32 * USNIC_MCW_SIZE) - 1;
2042 } else {
2043 module->prio_sd_num = mca_btl_usnic_component.prio_sd_num;
2044 }
2045 if (module->prio_sd_num > 0 &&
2046 (unsigned) module->prio_sd_num >
2047 module->fabric_info->tx_attr->size) {
2048 module->prio_sd_num = module->fabric_info->tx_attr->size;
2049 }
2050 if (-1 == mca_btl_usnic_component.prio_rd_num) {
2051 module->prio_rd_num =
2052 max(128, 32 * USNIC_MCW_SIZE) - 1;
2053 } else {
2054 module->prio_rd_num = mca_btl_usnic_component.prio_rd_num;
2055 }
2056 if (module->prio_rd_num > 0 &&
2057 (unsigned) module->prio_rd_num >
2058 module->fabric_info->rx_attr->size) {
2059 module->prio_rd_num = module->fabric_info->rx_attr->size;
2060 }
2061 if (cq_is_sum) {
2062 module->prio_cq_num = module->prio_rd_num + module->prio_sd_num;
2063 } else {
2064 module->prio_cq_num = module->cq_num;
2065 }
2066 }
2067
2068 static void init_payload_lengths(opal_btl_usnic_module_t *module)
2069 {
2070
2071 module->max_frag_payload =
2072 module->local_modex.max_msg_size -
2073 sizeof(opal_btl_usnic_btl_header_t) -
2074
2075 mca_btl_usnic_component.prefix_send_offset;
2076
2077
2078 module->max_chunk_payload =
2079 module->local_modex.max_msg_size -
2080 sizeof(opal_btl_usnic_btl_chunk_header_t) -
2081 mca_btl_usnic_component.prefix_send_offset;
2082
2083
2084 if (0 == module->max_tiny_msg_size) {
2085 module->max_tiny_msg_size = 768;
2086 }
2087 module->max_tiny_payload = module->max_tiny_msg_size -
2088 sizeof(opal_btl_usnic_btl_header_t);
2089 }
2090
2091 static void init_pml_values(opal_btl_usnic_module_t *module)
2092 {
2093 module->super.btl_bandwidth = module->local_modex.link_speed_mbps;
2094
2095
2096 if (0 == module->super.btl_rndv_eager_limit) {
2097 module->super.btl_rndv_eager_limit = USNIC_DFLT_RNDV_EAGER_LIMIT;
2098 }
2099
2100
2101 if (0 == module->super.btl_eager_limit) {
2102
2103 if (1 == mca_btl_usnic_component.num_modules) {
2104 module->super.btl_eager_limit =
2105 USNIC_DFLT_EAGER_LIMIT_1DEVICE;
2106 } else {
2107 module->super.btl_eager_limit =
2108 USNIC_DFLT_EAGER_LIMIT_NDEVICES;
2109 }
2110 }
2111
2112
2113
2114 module->super.btl_max_send_size =
2115 module->super.btl_eager_limit;
2116
2117 #if BTL_VERSION == 30
2118 module->super.btl_put_limit =
2119 module->super.btl_eager_limit;
2120 #endif
2121 }
2122
2123 static void init_senders(opal_btl_usnic_module_t *module)
2124 {
2125
2126 OBJ_CONSTRUCT(&module->senders, opal_hash_table_t);
2127
2128
2129
2130
2131 opal_hash_table_init(&module->senders, 4096);
2132 }
2133
2134 static void init_connectivity_checker(opal_btl_usnic_module_t *module)
2135 {
2136
2137 int rc = opal_btl_usnic_connectivity_listen(module);
2138 if (OPAL_SUCCESS != rc) {
2139 OPAL_ERROR_LOG(rc);
2140 opal_btl_usnic_util_abort("Failed to notify connectivity agent to listen",
2141 __FILE__, __LINE__);
2142 }
2143 }
2144
2145 static void init_hwloc(opal_btl_usnic_module_t *module)
2146 {
2147
2148
2149 if (mca_btl_usnic_component.want_numa_device_assignment) {
2150 opal_btl_usnic_hwloc_distance(module);
2151 } else {
2152 opal_output_verbose(5, USNIC_OUT,
2153 "btl:usnic: not sorting devices by NUMA distance (MCA btl_usnic_want_numa_device_assignment)");
2154 }
2155 }
2156
2157 static void init_procs(opal_btl_usnic_module_t *module)
2158 {
2159
2160
2161 OBJ_CONSTRUCT(&module->all_procs, opal_pointer_array_t);
2162 opal_pointer_array_init(&module->all_procs, USNIC_MCW_SIZE, INT_MAX, 32);
2163 }
2164
2165
2166
2167
2168 static int init_mpool(opal_btl_usnic_module_t *module)
2169 {
2170 struct mca_rcache_base_resources_t rcache_resources;
2171
2172 rcache_resources.reg_data = (void*)module;
2173 rcache_resources.sizeof_reg = sizeof(opal_btl_usnic_reg_t);
2174 rcache_resources.register_mem = usnic_reg_mr;
2175 rcache_resources.deregister_mem = usnic_dereg_mr;
2176 rcache_resources.cache_name = mca_btl_usnic_component.usnic_rcache_name;
2177 module->rcache =
2178 mca_rcache_base_module_create (mca_btl_usnic_component.usnic_rcache_name,
2179 &module->super, &rcache_resources);
2180 if (NULL == module->rcache) {
2181 opal_show_help("help-mpi-btl-usnic.txt",
2182 "internal error during init",
2183 true,
2184 opal_process_info.nodename,
2185 module->linux_device_name,
2186 "create rcache", __FILE__, __LINE__);
2187 return OPAL_ERROR;
2188 }
2189 module->super.btl_mpool =
2190 mca_mpool_base_module_lookup (mca_btl_usnic_component.usnic_mpool_hints);
2191 if (NULL == module->super.btl_mpool) {
2192 opal_show_help("help-mpi-btl-usnic.txt",
2193 "internal error during init",
2194 true,
2195 opal_process_info.nodename,
2196 module->linux_device_name,
2197 "create mpool", __FILE__, __LINE__);
2198 return OPAL_ERROR;
2199 }
2200
2201 return OPAL_SUCCESS;
2202 }
2203
/*
 * Create the module-wide libfabric objects (address vector, AV event
 * queue, domain event queue), bind them, and then initialize the
 * priority and data channels.
 *
 * Returns OPAL_SUCCESS, or the failing call's return code after
 * finalizing both channels.
 */
static int init_channels(opal_btl_usnic_module_t *module)
{
    int rc;
    struct fi_av_attr av_attr;
    struct fi_eq_attr eq_attr;

    /* Start from zeroed channels so the cleanup path below can be run
       safely no matter how far initialization got. */
    memset(&module->mod_channels[0], 0,
           sizeof(module->mod_channels[0]));
    memset(&module->mod_channels[1], 0,
           sizeof(module->mod_channels[1]));

    /* Address vector, with FI_EVENT set in its flags */
    memset(&av_attr, 0, sizeof(av_attr));
    av_attr.type = FI_AV_MAP;
    av_attr.flags = FI_EVENT;
    rc = fi_av_open(module->domain, &av_attr, &module->av, NULL);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }

    /* usNIC-specific AV operations */
    rc = fi_open_ops(&module->av->fid, FI_USNIC_AV_OPS_1, 0,
                     (void **)&module->usnic_av_ops, NULL);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }

    /* Event queue for the AV */
    memset(&eq_attr, 0, sizeof(eq_attr));
    eq_attr.size = module->av_eq_num;
    eq_attr.wait_obj = FI_WAIT_UNSPEC;
    rc = fi_eq_open(module->fabric, &eq_attr, &module->av_eq, NULL);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }

    /* Remember the size value left in the attribute after the open */
    module->av_eq_size = eq_attr.size;

    /* Event queue for the domain; it uses a file descriptor as its
       wait object (init_async_event() later fetches that fd). */
    eq_attr.wait_obj = FI_WAIT_FD;
    rc = fi_eq_open(module->fabric, &eq_attr, &module->dom_eq, NULL);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }

    rc = fi_av_bind(module->av, &module->av_eq->fid, 0);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }

    rc = fi_domain_bind(module->domain, &module->dom_eq->fid, 0);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }

    /* Initialize the channels: priority first, then data */
    rc = init_one_channel(module,
                          USNIC_PRIORITY_CHANNEL,
                          module->max_tiny_msg_size,
                          module->prio_rd_num, module->prio_sd_num, module->prio_cq_num);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }
    rc = init_one_channel(module,
                          USNIC_DATA_CHANNEL,
                          module->fabric_info->ep_attr->max_msg_size,
                          module->rd_num, module->sd_num, module->cq_num);
    if (rc != OPAL_SUCCESS) {
        goto destroy;
    }

    return OPAL_SUCCESS;

    /* NOTE(review): only the channels are torn down here; the AV and
       the two event queues opened above are not closed on this path --
       confirm they are released elsewhere. */
destroy:
    finalize_one_channel(module,
                         &module->mod_channels[USNIC_DATA_CHANNEL]);
    finalize_one_channel(module,
                         &module->mod_channels[USNIC_PRIORITY_CHANNEL]);

    return rc;
}
2281
2282
2283
2284 static void init_local_modex_part2(opal_btl_usnic_module_t *module)
2285 {
2286 module->local_modex.isn = get_initial_seq_no();
2287
2288
2289 for (int id = 0; id < USNIC_NUM_CHANNELS; ++id) {
2290 opal_btl_usnic_channel_t *channel = &module->mod_channels[id];
2291 struct sockaddr_in *sin;
2292 sin = channel->info->src_addr;
2293 module->local_modex.ports[id] = ntohs(sin->sin_port);
2294 module->local_modex.protocol = channel->info->ep_attr->protocol;
2295 }
2296 }
2297
2298 static void init_async_event(opal_btl_usnic_module_t *module)
2299 {
2300 int fd;
2301 int ret;
2302
2303 ret = fi_control(&module->dom_eq->fid, FI_GETWAIT, &fd);
2304 if (ret != 0) {
2305 opal_show_help("help-mpi-btl-usnic.txt",
2306 "libfabric API failed",
2307 true,
2308 opal_process_info.nodename,
2309 module->linux_device_name,
2310 "fi_control(eq, FI_GETWAIT)", __FILE__, __LINE__,
2311 ret,
2312 fi_strerror(-ret));
2313 return;
2314 }
2315
2316
2317
2318 opal_event_set(opal_sync_event_base, &(module->device_async_event), fd,
2319 OPAL_EV_READ | OPAL_EV_PERSIST,
2320 module_async_event_callback, module);
2321 opal_event_add(&(module->device_async_event), NULL);
2322 module->device_async_event_active = true;
2323 }
2324
2325 static void init_random_objects(opal_btl_usnic_module_t *module)
2326 {
2327
2328 opal_mutex_lock(&module->all_endpoints_lock);
2329 OBJ_CONSTRUCT(&(module->all_endpoints), opal_list_t);
2330 module->all_endpoints_constructed = true;
2331 opal_mutex_unlock(&module->all_endpoints_lock);
2332
2333
2334 OBJ_CONSTRUCT(&module->pending_resend_segs, opal_list_t);
2335 OBJ_CONSTRUCT(&module->endpoints_that_need_acks, opal_list_t);
2336
2337
2338 OBJ_CONSTRUCT(&module->endpoints_with_sends, opal_list_t);
2339 }
2340
2341 static void init_freelists(opal_btl_usnic_module_t *module)
2342 {
2343 int rc __opal_attribute_unused__;
2344 uint32_t segsize;
2345
2346 segsize = (module->local_modex.max_msg_size +
2347 opal_cache_line_size - 1) &
2348 ~(opal_cache_line_size - 1);
2349
2350
2351 OBJ_CONSTRUCT(&module->small_send_frags, opal_free_list_t);
2352 rc = usnic_compat_free_list_init(&module->small_send_frags,
2353 sizeof(opal_btl_usnic_small_send_frag_t) +
2354 mca_btl_usnic_component.prefix_send_offset,
2355 opal_cache_line_size,
2356 OBJ_CLASS(opal_btl_usnic_small_send_frag_t),
2357 segsize,
2358 opal_cache_line_size,
2359 module->sd_num * 4,
2360 -1,
2361 module->sd_num / 2,
2362 module->super.btl_mpool,
2363 0 ,
2364 module->rcache,
2365 NULL ,
2366 NULL );
2367 assert(OPAL_SUCCESS == rc);
2368
2369 OBJ_CONSTRUCT(&module->large_send_frags, opal_free_list_t);
2370 rc = usnic_compat_free_list_init(&module->large_send_frags,
2371 sizeof(opal_btl_usnic_large_send_frag_t) +
2372 mca_btl_usnic_component.prefix_send_offset,
2373 opal_cache_line_size,
2374 OBJ_CLASS(opal_btl_usnic_large_send_frag_t),
2375 0,
2376 0,
2377 module->sd_num / 8,
2378 -1,
2379 module->sd_num / 8,
2380 NULL,
2381 0 ,
2382 NULL ,
2383 NULL ,
2384 NULL );
2385 assert(OPAL_SUCCESS == rc);
2386
2387 OBJ_CONSTRUCT(&module->put_dest_frags, opal_free_list_t);
2388 rc = usnic_compat_free_list_init(&module->put_dest_frags,
2389 sizeof(opal_btl_usnic_put_dest_frag_t) +
2390 mca_btl_usnic_component.prefix_send_offset,
2391 opal_cache_line_size,
2392 OBJ_CLASS(opal_btl_usnic_put_dest_frag_t),
2393 0,
2394 0,
2395 module->sd_num / 8,
2396 -1,
2397 module->sd_num / 8,
2398 NULL,
2399 0 ,
2400 NULL ,
2401 NULL ,
2402 NULL );
2403 assert(OPAL_SUCCESS == rc);
2404
2405
2406 OBJ_CONSTRUCT(&module->chunk_segs, opal_free_list_t);
2407 rc = usnic_compat_free_list_init(&module->chunk_segs,
2408 sizeof(opal_btl_usnic_chunk_segment_t) +
2409 mca_btl_usnic_component.prefix_send_offset,
2410 opal_cache_line_size,
2411 OBJ_CLASS(opal_btl_usnic_chunk_segment_t),
2412 segsize,
2413 opal_cache_line_size,
2414 module->sd_num * 4,
2415 -1,
2416 module->sd_num / 2,
2417 module->super.btl_mpool,
2418 0 ,
2419 module->rcache,
2420 NULL ,
2421 NULL );
2422 assert(OPAL_SUCCESS == rc);
2423
2424
2425 uint32_t ack_segment_len;
2426 ack_segment_len = (sizeof(opal_btl_usnic_btl_header_t) +
2427 opal_cache_line_size - 1) & ~(opal_cache_line_size - 1);
2428 OBJ_CONSTRUCT(&module->ack_segs, opal_free_list_t);
2429 rc = usnic_compat_free_list_init(&module->ack_segs,
2430 sizeof(opal_btl_usnic_ack_segment_t) +
2431 mca_btl_usnic_component.prefix_send_offset,
2432 opal_cache_line_size,
2433 OBJ_CLASS(opal_btl_usnic_ack_segment_t),
2434 ack_segment_len,
2435 opal_cache_line_size,
2436 module->sd_num * 4,
2437 -1,
2438 module->sd_num / 2,
2439 module->super.btl_mpool,
2440 0 ,
2441 module->rcache,
2442 NULL ,
2443 NULL );
2444 assert(OPAL_SUCCESS == rc);
2445
2446
2447
2448
2449
2450
2451
2452 module->first_pool = 16;
2453 module->last_pool = usnic_fls(module->super.btl_eager_limit-1);
2454 module->module_recv_buffers = calloc(module->last_pool+1,
2455 sizeof(opal_free_list_t));
2456 assert(module->module_recv_buffers != NULL);
2457 for (int i = module->first_pool; i <= module->last_pool; ++i) {
2458 size_t elt_size = sizeof(opal_btl_usnic_rx_buf_t) - 1 + (1 << i);
2459 OBJ_CONSTRUCT(&module->module_recv_buffers[i], opal_free_list_t);
2460 rc = usnic_compat_free_list_init(&module->module_recv_buffers[i],
2461 elt_size,
2462 opal_cache_line_size,
2463 OBJ_CLASS(opal_btl_usnic_rx_buf_t),
2464 0,
2465 0,
2466 128,
2467 128,
2468 128,
2469 NULL ,
2470 0 ,
2471 NULL ,
2472 NULL ,
2473 NULL );
2474 assert(OPAL_SUCCESS == rc);
2475 }
2476 }
2477
2478
2479
2480
2481
2482 int opal_btl_usnic_module_init(opal_btl_usnic_module_t *module)
2483 {
2484 init_module_globals(module);
2485 init_local_modex_part1(module);
2486 init_find_transport_header_len(module);
2487 init_queue_lengths(module);
2488 init_payload_lengths(module);
2489 init_pml_values(module);
2490 init_senders(module);
2491 init_connectivity_checker(module);
2492 init_hwloc(module);
2493 init_procs(module);
2494
2495 int ret;
2496 if (OPAL_SUCCESS != (ret = init_mpool(module)) ||
2497 OPAL_SUCCESS != (ret = init_channels(module))) {
2498 mca_rcache_base_module_destroy (module->rcache);
2499 return ret;
2500 }
2501
2502 init_local_modex_part2(module);
2503 init_async_event(module);
2504 init_random_objects(module);
2505 init_freelists(module);
2506 opal_btl_usnic_stats_init(module);
2507
2508
2509
2510 if (mca_btl_usnic_component.connectivity_enabled) {
2511 int rc = opal_btl_usnic_connectivity_listen(module);
2512 if (OPAL_SUCCESS != rc) {
2513 OPAL_ERROR_LOG(rc);
2514 opal_btl_usnic_util_abort("Failed to notify connectivity agent to listen",
2515 __FILE__, __LINE__);
2516 }
2517 } else {
2518
2519
2520 module->local_modex.connectivity_udp_port = 0;
2521 }
2522
2523 return OPAL_SUCCESS;
2524 }
2525
2526
2527 static int usnic_ft_event(int state)
2528 {
2529 return OPAL_SUCCESS;
2530 }
2531
2532
/*
 * Template for usNIC modules: the base BTL fields and function
 * pointers that this file provides.  Fields not named here are
 * zero-initialized.
 */
opal_btl_usnic_module_t opal_btl_usnic_module_template = {
    .super = {
        .btl_component = &mca_btl_usnic_component.super,

        /* Fields that exist only in one BTL interface version */
#if BTL_VERSION == 20
        .btl_prepare_dst = opal_btl_usnic_prepare_dst,
        .btl_seg_size = sizeof(mca_btl_base_segment_t),
#elif BTL_VERSION == 30
        /* No atomics and no registration handles */
        .btl_atomic_flags = 0,
        .btl_registration_handle_size = 0,

        /* The put limit is overwritten with the eager limit in
           init_pml_values() */
        .btl_get_limit = 0,
        .btl_get_alignment = 0,
        .btl_put_limit = 0,
        .btl_put_alignment = 0,

        .btl_atomic_op = NULL,
        .btl_atomic_fop = NULL,
        .btl_atomic_cswap = NULL,
#endif

        .btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT,
        /* Advertised capabilities */
        .btl_flags =
            MCA_BTL_FLAGS_SEND |
            MCA_BTL_FLAGS_SEND_INPLACE |
            MCA_BTL_FLAGS_SINGLE_ADD_PROCS,

        /* Proc management and teardown */
        .btl_add_procs = usnic_add_procs,
        .btl_del_procs = usnic_del_procs,
        .btl_register = NULL,
        .btl_finalize = usnic_finalize,

        /* Descriptor management and data movement; no sendi and no
           get entry points are provided */
        .btl_alloc = usnic_alloc,
        .btl_free = usnic_free,
        .btl_prepare_src = opal_btl_usnic_prepare_src,
        .btl_send = usnic_send,
        .btl_sendi = NULL,
        .btl_put = opal_btl_usnic_put,
        .btl_get = NULL,
        .btl_dump = mca_btl_base_dump,

        /* The mpool is filled in per module by init_mpool() */
        .btl_mpool = NULL,
        .btl_register_error = usnic_register_pml_err_cb,
        .btl_ft_event = usnic_ft_event
    }
};