This source file includes the following definitions.
- mca_pml_ob1_enable
- mca_pml_ob1_add_comm
- mca_pml_ob1_del_comm
- mca_pml_ob1_add_procs
- mca_pml_ob1_del_procs
- mca_pml_ob1_dump_hdr
- mca_pml_ob1_dump_frag_list
- mca_pml_ob1_dump_cant_match
- mca_pml_ob1_dump
- mca_pml_ob1_fin_completion
- mca_pml_ob1_send_fin
- mca_pml_ob1_process_pending_packets
- mca_pml_ob1_process_pending_rdma
- mca_pml_ob1_error_handler
- mca_pml_ob1_ft_event
- mca_pml_ob1_ft_event
- mca_pml_ob1_com_btl_comp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 #include "ompi_config.h"
31
32 #include <stdlib.h>
33 #include <string.h>
34
35 #include "opal/class/opal_bitmap.h"
36 #include "opal/util/output.h"
37 #include "opal/util/show_help.h"
38 #include "opal_stdint.h"
39 #include "opal/mca/btl/btl.h"
40 #include "opal/mca/btl/base/base.h"
41
42 #include "ompi/mca/pml/pml.h"
43 #include "ompi/mca/pml/base/base.h"
44 #include "ompi/mca/pml/base/base.h"
45 #include "ompi/mca/bml/base/base.h"
46 #include "opal/mca/pmix/pmix.h"
47 #include "ompi/runtime/ompi_cr.h"
48
49 #include "pml_ob1.h"
50 #include "pml_ob1_component.h"
51 #include "pml_ob1_comm.h"
52 #include "pml_ob1_hdr.h"
53 #include "pml_ob1_recvfrag.h"
54 #include "pml_ob1_sendreq.h"
55 #include "pml_ob1_recvreq.h"
56 #include "pml_ob1_rdmafrag.h"
57
58 mca_pml_ob1_t mca_pml_ob1 = {
59 {
60 mca_pml_ob1_add_procs,
61 mca_pml_ob1_del_procs,
62 mca_pml_ob1_enable,
63 NULL,
64 mca_pml_ob1_add_comm,
65 mca_pml_ob1_del_comm,
66 mca_pml_ob1_irecv_init,
67 mca_pml_ob1_irecv,
68 mca_pml_ob1_recv,
69 mca_pml_ob1_isend_init,
70 mca_pml_ob1_isend,
71 mca_pml_ob1_send,
72 mca_pml_ob1_iprobe,
73 mca_pml_ob1_probe,
74 mca_pml_ob1_start,
75 mca_pml_ob1_improbe,
76 mca_pml_ob1_mprobe,
77 mca_pml_ob1_imrecv,
78 mca_pml_ob1_mrecv,
79 mca_pml_ob1_dump,
80 mca_pml_ob1_ft_event,
81 65535,
82 INT_MAX
83 }
84 };
85
86 #if OPAL_CUDA_SUPPORT
87 extern void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl,
88 int32_t flags, ompi_proc_t* errproc,
89 char* btlinfo);
90 #endif
91
92 void mca_pml_ob1_error_handler( struct mca_btl_base_module_t* btl,
93 int32_t flags, opal_proc_t* errproc,
94 char* btlinfo );
95
96 int mca_pml_ob1_enable(bool enable)
97 {
98 if( false == enable ) {
99 return OMPI_SUCCESS;
100 }
101
102 OBJ_CONSTRUCT(&mca_pml_ob1.lock, opal_mutex_t);
103
104
105 OBJ_CONSTRUCT(&mca_pml_ob1.rdma_frags, opal_free_list_t);
106 opal_free_list_init ( &mca_pml_ob1.rdma_frags,
107 sizeof(mca_pml_ob1_rdma_frag_t),
108 opal_cache_line_size,
109 OBJ_CLASS(mca_pml_ob1_rdma_frag_t),
110 0,opal_cache_line_size,
111 mca_pml_ob1.free_list_num,
112 mca_pml_ob1.free_list_max,
113 mca_pml_ob1.free_list_inc,
114 NULL, 0, NULL, NULL, NULL);
115
116 OBJ_CONSTRUCT(&mca_pml_ob1.recv_frags, opal_free_list_t);
117
118 opal_free_list_init ( &mca_pml_ob1.recv_frags,
119 sizeof(mca_pml_ob1_recv_frag_t) + mca_pml_ob1.unexpected_limit,
120 opal_cache_line_size,
121 OBJ_CLASS(mca_pml_ob1_recv_frag_t),
122 0,opal_cache_line_size,
123 mca_pml_ob1.free_list_num,
124 mca_pml_ob1.free_list_max,
125 mca_pml_ob1.free_list_inc,
126 NULL, 0, NULL, NULL, NULL);
127
128 OBJ_CONSTRUCT(&mca_pml_ob1.pending_pckts, opal_free_list_t);
129 opal_free_list_init ( &mca_pml_ob1.pending_pckts,
130 sizeof(mca_pml_ob1_pckt_pending_t),
131 opal_cache_line_size,
132 OBJ_CLASS(mca_pml_ob1_pckt_pending_t),
133 0,opal_cache_line_size,
134 mca_pml_ob1.free_list_num,
135 mca_pml_ob1.free_list_max,
136 mca_pml_ob1.free_list_inc,
137 NULL, 0, NULL, NULL, NULL);
138
139
140 OBJ_CONSTRUCT(&mca_pml_ob1.buffers, opal_free_list_t);
141 OBJ_CONSTRUCT(&mca_pml_ob1.send_ranges, opal_free_list_t);
142 opal_free_list_init ( &mca_pml_ob1.send_ranges,
143 sizeof(mca_pml_ob1_send_range_t) +
144 sizeof(mca_pml_ob1_com_btl_t[mca_pml_ob1.max_send_per_range]),
145 opal_cache_line_size,
146 OBJ_CLASS(mca_pml_ob1_send_range_t),
147 0,opal_cache_line_size,
148 mca_pml_ob1.free_list_num,
149 mca_pml_ob1.free_list_max,
150 mca_pml_ob1.free_list_inc,
151 NULL, 0, NULL, NULL, NULL);
152
153
154 OBJ_CONSTRUCT(&mca_pml_ob1.send_pending, opal_list_t);
155 OBJ_CONSTRUCT(&mca_pml_ob1.recv_pending, opal_list_t);
156 OBJ_CONSTRUCT(&mca_pml_ob1.pckt_pending, opal_list_t);
157 OBJ_CONSTRUCT(&mca_pml_ob1.rdma_pending, opal_list_t);
158
159
160 OBJ_CONSTRUCT(&mca_pml_ob1.non_existing_communicator_pending, opal_list_t);
161
162
163
164
165
166
167 opal_free_list_init ( &mca_pml_base_send_requests,
168 sizeof(mca_pml_ob1_send_request_t) +
169 sizeof(mca_pml_ob1_com_btl_t[mca_pml_ob1.max_rdma_per_request]),
170 opal_cache_line_size,
171 OBJ_CLASS(mca_pml_ob1_send_request_t),
172 0,opal_cache_line_size,
173 mca_pml_ob1.free_list_num,
174 mca_pml_ob1.free_list_max,
175 mca_pml_ob1.free_list_inc,
176 NULL, 0, NULL, NULL, NULL);
177
178 opal_free_list_init ( &mca_pml_base_recv_requests,
179 sizeof(mca_pml_ob1_recv_request_t) +
180 sizeof(mca_pml_ob1_com_btl_t[mca_pml_ob1.max_rdma_per_request]),
181 opal_cache_line_size,
182 OBJ_CLASS(mca_pml_ob1_recv_request_t),
183 0,opal_cache_line_size,
184 mca_pml_ob1.free_list_num,
185 mca_pml_ob1.free_list_max,
186 mca_pml_ob1.free_list_inc,
187 NULL, 0, NULL, NULL, NULL);
188
189 mca_pml_ob1.enabled = true;
190 return OMPI_SUCCESS;
191 }
192
193 int mca_pml_ob1_add_comm(ompi_communicator_t* comm)
194 {
195
196 mca_pml_ob1_comm_t* pml_comm = OBJ_NEW(mca_pml_ob1_comm_t);
197 mca_pml_ob1_recv_frag_t *frag, *next_frag;
198 mca_pml_ob1_comm_proc_t* pml_proc;
199 mca_pml_ob1_match_hdr_t* hdr;
200
201 if (NULL == pml_comm) {
202 return OMPI_ERR_OUT_OF_RESOURCE;
203 }
204
205
206 if (comm->c_contextid > mca_pml_ob1.super.pml_max_contextid) {
207 OBJ_RELEASE(pml_comm);
208 return OMPI_ERR_OUT_OF_RESOURCE;
209 }
210
211 ompi_comm_assert_subscribe (comm, OMPI_COMM_ASSERT_NO_ANY_SOURCE);
212 ompi_comm_assert_subscribe (comm, OMPI_COMM_ASSERT_ALLOW_OVERTAKE);
213
214 mca_pml_ob1_comm_init_size(pml_comm, comm->c_remote_group->grp_proc_count);
215 comm->c_pml_comm = pml_comm;
216
217
218 OPAL_LIST_FOREACH_SAFE(frag, next_frag, &mca_pml_ob1.non_existing_communicator_pending, mca_pml_ob1_recv_frag_t) {
219 hdr = &frag->hdr.hdr_match;
220
221
222 if( frag->hdr.hdr_match.hdr_ctx != comm->c_contextid )
223 continue;
224
225
226
227
228 opal_list_remove_item (&mca_pml_ob1.non_existing_communicator_pending,
229 (opal_list_item_t *) frag);
230
231
232
233
234
235
236
237 PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm,
238 hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
239
240
241
242
243
244
245
246 pml_proc = mca_pml_ob1_peer_lookup(comm, hdr->hdr_src);
247
248 if (OMPI_COMM_CHECK_ASSERT_ALLOW_OVERTAKE(comm)) {
249 #if !MCA_PML_OB1_CUSTOM_MATCH
250 opal_list_append( &pml_proc->unexpected_frags, (opal_list_item_t*)frag );
251 #else
252 custom_match_umq_append(pml_comm->umq, hdr->hdr_tag, hdr->hdr_src, frag);
253 #endif
254 PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm,
255 hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
256 continue;
257 }
258
259 if (((uint16_t)hdr->hdr_seq) == ((uint16_t)pml_proc->expected_sequence) ) {
260
261 add_fragment_to_unexpected:
262
263 pml_proc->expected_sequence++;
264 #if !MCA_PML_OB1_CUSTOM_MATCH
265 opal_list_append( &pml_proc->unexpected_frags, (opal_list_item_t*)frag );
266 #else
267 custom_match_umq_append(pml_comm->umq, hdr->hdr_tag, hdr->hdr_src, frag);
268 #endif
269 PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm,
270 hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
271
272
273
274
275
276
277 if( NULL != pml_proc->frags_cant_match ) {
278 frag = check_cantmatch_for_match(pml_proc);
279 if( NULL != frag ) {
280 hdr = &frag->hdr.hdr_match;
281 goto add_fragment_to_unexpected;
282 }
283 }
284 } else {
285 append_frag_to_ordered_list(&pml_proc->frags_cant_match, frag,
286 pml_proc->expected_sequence);
287 }
288 }
289 return OMPI_SUCCESS;
290 }
291
292 int mca_pml_ob1_del_comm(ompi_communicator_t* comm)
293 {
294 OBJ_RELEASE(comm->c_pml_comm);
295 comm->c_pml_comm = NULL;
296 return OMPI_SUCCESS;
297 }
298
299
300
301
302
303
304
305
306 int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs)
307 {
308 mca_btl_base_selected_module_t *sm;
309 opal_bitmap_t reachable;
310 int rc;
311
312 if(nprocs == 0)
313 return OMPI_SUCCESS;
314
315 OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
316 rc = opal_bitmap_init(&reachable, (int)nprocs);
317 if(OMPI_SUCCESS != rc)
318 return rc;
319
320
321
322
323
324
325
326 #if OPAL_ENABLE_FT_CR == 0
327
328 if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("ob1",
329 procs,
330 nprocs))) {
331 return rc;
332 }
333 #endif
334
335 rc = mca_bml.bml_add_procs( nprocs,
336 procs,
337 &reachable );
338 if(OMPI_SUCCESS != rc)
339 goto cleanup_and_return;
340
341
342
343
344
345
346
347
348
349
350
351
352 OPAL_LIST_FOREACH(sm, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) {
353 if ((MCA_BTL_FLAGS_SEND & sm->btl_module->btl_flags) && sm->btl_module->btl_eager_limit < sizeof(mca_pml_ob1_hdr_t)) {
354 opal_show_help("help-mpi-pml-ob1.txt", "eager_limit_too_small",
355 true,
356 sm->btl_component->btl_version.mca_component_name,
357 ompi_process_info.nodename,
358 sm->btl_component->btl_version.mca_component_name,
359 sm->btl_module->btl_eager_limit,
360 sm->btl_component->btl_version.mca_component_name,
361 sizeof(mca_pml_ob1_hdr_t),
362 sm->btl_component->btl_version.mca_component_name);
363 rc = OMPI_ERR_BAD_PARAM;
364 goto cleanup_and_return;
365 }
366 #if OPAL_CUDA_GDR_SUPPORT
367
368
369 if (SIZE_MAX == sm->btl_module->btl_cuda_eager_limit) {
370 sm->btl_module->btl_cuda_eager_limit = sizeof(mca_pml_ob1_hdr_t);
371 }
372
373
374 if (0 != sm->btl_module->btl_cuda_eager_limit) {
375 if (sm->btl_module->btl_cuda_eager_limit < sizeof(mca_pml_ob1_hdr_t)) {
376 opal_show_help("help-mpi-pml-ob1.txt", "cuda_eager_limit_too_small",
377 true,
378 sm->btl_component->btl_version.mca_component_name,
379 ompi_process_info.nodename,
380 sm->btl_component->btl_version.mca_component_name,
381 sm->btl_module->btl_cuda_eager_limit,
382 sm->btl_component->btl_version.mca_component_name,
383 sizeof(mca_pml_ob1_hdr_t),
384 sm->btl_component->btl_version.mca_component_name);
385 rc = OMPI_ERR_BAD_PARAM;
386 goto cleanup_and_return;
387 }
388 }
389 if (0 == sm->btl_module->btl_cuda_rdma_limit) {
390
391 sm->btl_module->btl_cuda_rdma_limit = SIZE_MAX;
392 } else {
393 if (sm->btl_module->btl_cuda_rdma_limit < sm->btl_module->btl_cuda_eager_limit) {
394 opal_show_help("help-mpi-pml-ob1.txt", "cuda_rdma_limit_too_small",
395 true,
396 sm->btl_component->btl_version.mca_component_name,
397 ompi_process_info.nodename,
398 sm->btl_component->btl_version.mca_component_name,
399 sm->btl_module->btl_cuda_rdma_limit,
400 sm->btl_component->btl_version.mca_component_name,
401 sm->btl_module->btl_cuda_eager_limit,
402 sm->btl_component->btl_version.mca_component_name);
403 rc = OMPI_ERR_BAD_PARAM;
404 goto cleanup_and_return;
405 }
406 }
407 #endif
408 }
409
410
411
412 rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_MATCH,
413 mca_pml_ob1_recv_frag_callback_match,
414 NULL );
415 if(OMPI_SUCCESS != rc)
416 goto cleanup_and_return;
417
418 rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_RNDV,
419 mca_pml_ob1_recv_frag_callback_rndv,
420 NULL );
421 if(OMPI_SUCCESS != rc)
422 goto cleanup_and_return;
423
424 rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_RGET,
425 mca_pml_ob1_recv_frag_callback_rget,
426 NULL );
427 if(OMPI_SUCCESS != rc)
428 goto cleanup_and_return;
429
430 rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_ACK,
431 mca_pml_ob1_recv_frag_callback_ack,
432 NULL );
433 if(OMPI_SUCCESS != rc)
434 goto cleanup_and_return;
435
436 rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_FRAG,
437 mca_pml_ob1_recv_frag_callback_frag,
438 NULL );
439 if(OMPI_SUCCESS != rc)
440 goto cleanup_and_return;
441
442 rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_PUT,
443 mca_pml_ob1_recv_frag_callback_put,
444 NULL );
445 if(OMPI_SUCCESS != rc)
446 goto cleanup_and_return;
447
448 rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_FIN,
449 mca_pml_ob1_recv_frag_callback_fin,
450 NULL );
451 if(OMPI_SUCCESS != rc)
452 goto cleanup_and_return;
453
454
455 rc = mca_bml.bml_register_error(mca_pml_ob1_error_handler);
456 if(OMPI_SUCCESS != rc)
457 goto cleanup_and_return;
458
459 cleanup_and_return:
460 OBJ_DESTRUCT(&reachable);
461
462 return rc;
463 }
464
465
466
467
468
469
470 int mca_pml_ob1_del_procs(ompi_proc_t** procs, size_t nprocs)
471 {
472 return mca_bml.bml_del_procs(nprocs, procs);
473 }
474
475
476
477
478
479 static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
480 {
481 char *type, header[128];
482
483 switch(hdr->hdr_common.hdr_type) {
484 case MCA_PML_OB1_HDR_TYPE_MATCH:
485 type = "MATCH";
486 snprintf( header, 128, "ctx %5d src %d tag %d seq %d",
487 hdr->hdr_match.hdr_ctx, hdr->hdr_match.hdr_src,
488 hdr->hdr_match.hdr_tag, hdr->hdr_match.hdr_seq);
489 break;
490 case MCA_PML_OB1_HDR_TYPE_RNDV:
491 type = "RNDV";
492 snprintf( header, 128, "ctx %5d src %d tag %d seq %d msg_length %" PRIu64,
493 hdr->hdr_rndv.hdr_match.hdr_ctx, hdr->hdr_rndv.hdr_match.hdr_src,
494 hdr->hdr_rndv.hdr_match.hdr_tag, hdr->hdr_rndv.hdr_match.hdr_seq,
495 hdr->hdr_rndv.hdr_msg_length);
496 break;
497 case MCA_PML_OB1_HDR_TYPE_RGET:
498 type = "RGET";
499 snprintf( header, 128, "ctx %5d src %d tag %d seq %d msg_length %" PRIu64
500 "frag %" PRIu64 " src_ptr %" PRIu64,
501 hdr->hdr_rndv.hdr_match.hdr_ctx, hdr->hdr_rndv.hdr_match.hdr_src,
502 hdr->hdr_rndv.hdr_match.hdr_tag, hdr->hdr_rndv.hdr_match.hdr_seq,
503 hdr->hdr_rndv.hdr_msg_length, hdr->hdr_rget.hdr_frag.lval,
504 hdr->hdr_rget.hdr_src_ptr);
505 break;
506 case MCA_PML_OB1_HDR_TYPE_ACK:
507 type = "ACK";
508 snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64 " size %" PRIu64,
509 hdr->hdr_ack.hdr_src_req.pval, hdr->hdr_ack.hdr_dst_req.pval,
510 hdr->hdr_ack.hdr_send_offset, hdr->hdr_ack.hdr_send_size);
511 break;
512 case MCA_PML_OB1_HDR_TYPE_FRAG:
513 type = "FRAG";
514 snprintf( header, 128, "offset %" PRIu64 " src_req %p dst_req %p",
515 hdr->hdr_frag.hdr_frag_offset,
516 hdr->hdr_frag.hdr_src_req.pval, hdr->hdr_frag.hdr_dst_req.pval);
517 break;
518 case MCA_PML_OB1_HDR_TYPE_PUT:
519 type = "PUT";
520 snprintf( header, 128, "dst_req %p src_frag %p recv_req %p offset %" PRIu64
521 " dst_ptr %" PRIu64 " dst_size %" PRIu64,
522 hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_frag.pval,
523 hdr->hdr_rdma.hdr_recv_req.pval, hdr->hdr_rdma.hdr_rdma_offset,
524 hdr->hdr_rdma.hdr_dst_ptr, hdr->hdr_rdma.hdr_dst_size);
525 break;
526 case MCA_PML_OB1_HDR_TYPE_FIN:
527 type = "FIN";
528 header[0] = '\0';
529 break;
530 default:
531 type = "UNKWN";
532 header[0] = '\0';
533 break;
534 }
535 opal_output(0,"hdr %s [%s] %s", type,
536 (hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NBO ? "nbo" : " "),
537 header);
538 }
539
540 #if !MCA_PML_OB1_CUSTOM_MATCH
541 static void mca_pml_ob1_dump_frag_list(opal_list_t* queue, bool is_req)
542 {
543 opal_list_item_t* item;
544 char cpeer[64], ctag[64];
545
546 for( item = opal_list_get_first(queue);
547 item != opal_list_get_end(queue);
548 item = opal_list_get_next(item) ) {
549
550 if( is_req ) {
551 mca_pml_base_request_t *req = &(((mca_pml_ob1_recv_request_t*)item)->req_recv.req_base);
552
553 if( OMPI_ANY_SOURCE == req->req_peer ) snprintf(cpeer, 64, "%s", "ANY_SOURCE");
554 else snprintf(cpeer, 64, "%d", req->req_peer);
555
556 if( OMPI_ANY_TAG == req->req_tag ) snprintf(ctag, 64, "%s", "ANY_TAG");
557 else snprintf(ctag, 64, "%d", req->req_tag);
558
559 opal_output(0, "req %p peer %s tag %s addr %p count %lu datatype %s [%p] [%s %s] req_seq %" PRIu64,
560 (void*) req, cpeer, ctag,
561 (void*) req->req_addr, req->req_count,
562 (0 != req->req_count ? req->req_datatype->name : "N/A"),
563 (void*) req->req_datatype,
564 (req->req_pml_complete ? "pml_complete" : ""),
565 (req->req_free_called ? "freed" : ""),
566 req->req_sequence);
567 } else {
568 mca_pml_ob1_recv_frag_t* frag = (mca_pml_ob1_recv_frag_t*)item;
569 mca_pml_ob1_dump_hdr( &frag->hdr );
570 }
571 }
572 }
573 #endif
574
575 void mca_pml_ob1_dump_cant_match(mca_pml_ob1_recv_frag_t* queue)
576 {
577 mca_pml_ob1_recv_frag_t* item = queue;
578
579 do {
580 mca_pml_ob1_dump_hdr( &item->hdr );
581 if( NULL != item->range ) {
582 mca_pml_ob1_recv_frag_t* frag = item->range;
583 do {
584 mca_pml_ob1_dump_hdr( &frag->hdr );
585 frag = (mca_pml_ob1_recv_frag_t*)frag->super.super.opal_list_next;
586 } while( frag != item->range );
587 }
588 item = (mca_pml_ob1_recv_frag_t*)item->super.super.opal_list_next;
589 } while( item != queue );
590 }
591
592 int mca_pml_ob1_dump(struct ompi_communicator_t* comm, int verbose)
593 {
594 struct mca_pml_comm_t* pml_comm = comm->c_pml_comm;
595 int i;
596
597
598
599 opal_output(0, "Communicator %s [%p](%d) rank %d recv_seq %d num_procs %lu last_probed %lu\n",
600 comm->c_name, (void*) comm, comm->c_contextid, comm->c_my_rank,
601 pml_comm->recv_sequence, pml_comm->num_procs, pml_comm->last_probed);
602
603 #if !MCA_PML_OB1_CUSTOM_MATCH
604 if( opal_list_get_size(&pml_comm->wild_receives) ) {
605 opal_output(0, "expected MPI_ANY_SOURCE fragments\n");
606 mca_pml_ob1_dump_frag_list(&pml_comm->wild_receives, true);
607 }
608 #endif
609
610 #if MCA_PML_OB1_CUSTOM_MATCH
611 opal_output(0, "expected receives\n");
612 custom_match_prq_dump(pml_comm->prq);
613 opal_output(0, "unexpected frag\n");
614 custom_match_umq_dump(pml_comm->umq);
615 #endif
616
617
618 for( i = 0; i < (int)pml_comm->num_procs; i++ ) {
619 mca_pml_ob1_comm_proc_t* proc = pml_comm->procs[i];
620
621 if (NULL == proc) {
622 continue;
623 }
624
625 mca_bml_base_endpoint_t* ep = mca_bml_base_get_endpoint(proc->ompi_proc);
626 size_t n;
627
628 opal_output(0, "[Rank %d] expected_seq %d ompi_proc %p send_seq %d\n",
629 i, proc->expected_sequence, (void*) proc->ompi_proc,
630 proc->send_sequence);
631
632
633 #if !MCA_PML_OB1_CUSTOM_MATCH
634 if( opal_list_get_size(&proc->specific_receives) ) {
635 opal_output(0, "expected specific receives\n");
636 mca_pml_ob1_dump_frag_list(&proc->specific_receives, true);
637 }
638 #endif
639 if( NULL != proc->frags_cant_match ) {
640 opal_output(0, "out of sequence\n");
641 mca_pml_ob1_dump_cant_match(proc->frags_cant_match);
642 }
643 #if !MCA_PML_OB1_CUSTOM_MATCH
644 if( opal_list_get_size(&proc->unexpected_frags) ) {
645 opal_output(0, "unexpected frag\n");
646 mca_pml_ob1_dump_frag_list(&proc->unexpected_frags, false);
647 }
648 #endif
649
650 for( n = 0; n < ep->btl_eager.arr_size; n++ ) {
651 mca_bml_base_btl_t* bml_btl = &ep->btl_eager.bml_btls[n];
652 bml_btl->btl->btl_dump(bml_btl->btl, bml_btl->btl_endpoint, verbose);
653 }
654 }
655 return OMPI_SUCCESS;
656 }
657
658 static void mca_pml_ob1_fin_completion( mca_btl_base_module_t* btl,
659 struct mca_btl_base_endpoint_t* ep,
660 struct mca_btl_base_descriptor_t* des,
661 int status )
662 {
663
664 mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
665
666
667 MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
668 }
669
670
671
672
673
674
675
676 int mca_pml_ob1_send_fin( ompi_proc_t* proc,
677 mca_bml_base_btl_t* bml_btl,
678 opal_ptr_t hdr_frag,
679 uint64_t rdma_size,
680 uint8_t order,
681 int status )
682 {
683 mca_btl_base_descriptor_t* fin;
684 int rc;
685
686 mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t),
687 MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_FLAGS_SIGNAL);
688
689 if(NULL == fin) {
690 MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
691 return OMPI_ERR_OUT_OF_RESOURCE;
692 }
693 fin->des_cbfunc = mca_pml_ob1_fin_completion;
694 fin->des_cbdata = NULL;
695
696
697 mca_pml_ob1_fin_hdr_prepare ((mca_pml_ob1_fin_hdr_t *) fin->des_segments->seg_addr.pval,
698 0, hdr_frag.lval, status ? status : (int64_t) rdma_size);
699
700 ob1_hdr_hton((mca_pml_ob1_hdr_t *) fin->des_segments->seg_addr.pval, MCA_PML_OB1_HDR_TYPE_FIN, proc);
701
702
703 rc = mca_bml_base_send( bml_btl, fin, MCA_PML_OB1_HDR_TYPE_FIN );
704 if( OPAL_LIKELY( rc >= 0 ) ) {
705 if( OPAL_LIKELY( 1 == rc ) ) {
706 MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
707 }
708 return OMPI_SUCCESS;
709 }
710 mca_bml_base_free(bml_btl, fin);
711 MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
712 return OMPI_ERR_OUT_OF_RESOURCE;
713 }
714
715 void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
716 {
717 mca_pml_ob1_pckt_pending_t *pckt;
718 int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_ob1.pckt_pending);
719
720 for(i = 0; i < s; i++) {
721 mca_bml_base_btl_t *send_dst = NULL;
722 OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
723 pckt = (mca_pml_ob1_pckt_pending_t*)
724 opal_list_remove_first(&mca_pml_ob1.pckt_pending);
725 OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
726 if(NULL == pckt)
727 break;
728 if(pckt->bml_btl != NULL &&
729 pckt->bml_btl->btl == bml_btl->btl) {
730 send_dst = pckt->bml_btl;
731 } else {
732 mca_bml_base_endpoint_t* endpoint =
733 (mca_bml_base_endpoint_t*) pckt->proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
734 send_dst = mca_bml_base_btl_array_find(
735 &endpoint->btl_eager, bml_btl->btl);
736 }
737 if(NULL == send_dst) {
738 OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
739 opal_list_append(&mca_pml_ob1.pckt_pending,
740 (opal_list_item_t*)pckt);
741 OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
742 continue;
743 }
744
745 switch(pckt->hdr.hdr_common.hdr_type) {
746 case MCA_PML_OB1_HDR_TYPE_ACK:
747 rc = mca_pml_ob1_recv_request_ack_send_btl(pckt->proc,
748 send_dst,
749 pckt->hdr.hdr_ack.hdr_src_req.lval,
750 pckt->hdr.hdr_ack.hdr_dst_req.pval,
751 pckt->hdr.hdr_ack.hdr_send_offset,
752 pckt->hdr.hdr_ack.hdr_send_size,
753 pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA);
754 if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
755 OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
756 opal_list_append(&mca_pml_ob1.pckt_pending,
757 (opal_list_item_t*)pckt);
758 OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
759 return;
760 }
761 break;
762 case MCA_PML_OB1_HDR_TYPE_FIN:
763 rc = mca_pml_ob1_send_fin(pckt->proc, send_dst,
764 pckt->hdr.hdr_fin.hdr_frag,
765 pckt->hdr.hdr_fin.hdr_size,
766 pckt->order,
767 pckt->status);
768 if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
769 MCA_PML_OB1_PCKT_PENDING_RETURN(pckt);
770 return;
771 }
772 break;
773 default:
774 opal_output(0, "[%s:%d] wrong header type\n",
775 __FILE__, __LINE__);
776 break;
777 }
778
779 MCA_PML_OB1_PCKT_PENDING_RETURN(pckt);
780 }
781 }
782
783 void mca_pml_ob1_process_pending_rdma(void)
784 {
785 mca_pml_ob1_rdma_frag_t* frag;
786 int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_ob1.rdma_pending);
787
788 for(i = 0; i < s; i++) {
789 OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
790 frag = (mca_pml_ob1_rdma_frag_t*)
791 opal_list_remove_first(&mca_pml_ob1.rdma_pending);
792 OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
793 if(NULL == frag)
794 break;
795
796 frag->retries++;
797
798 if(frag->rdma_state == MCA_PML_OB1_RDMA_PUT) {
799 rc = mca_pml_ob1_send_request_put_frag(frag);
800 } else {
801 rc = mca_pml_ob1_recv_request_get_frag(frag);
802 }
803 if(OMPI_ERR_OUT_OF_RESOURCE == rc)
804 break;
805 }
806 }
807
808
809 void mca_pml_ob1_error_handler(
810 struct mca_btl_base_module_t* btl, int32_t flags,
811 opal_proc_t* errproc, char* btlinfo ) {
812 #if OPAL_CUDA_SUPPORT
813 if (flags & MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC) {
814 mca_pml_ob1_cuda_add_ipc_support(btl, flags, (struct ompi_proc_t*)errproc, btlinfo);
815 return;
816 }
817 #endif
818 ompi_rte_abort(-1, btlinfo);
819 }
820
821 #if OPAL_ENABLE_FT_CR == 0
822 int mca_pml_ob1_ft_event( int state ) {
823 return OMPI_SUCCESS;
824 }
825 #else
826 int mca_pml_ob1_ft_event( int state )
827 {
828 static bool first_continue_pass = false;
829 ompi_proc_t** procs = NULL;
830 size_t num_procs;
831 int ret, p;
832
833 if(OPAL_CRS_CHECKPOINT == state) {
834 if( opal_cr_timing_barrier_enabled ) {
835 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
836 if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
837 opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
838 return ret;
839 }
840 }
841
842 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
843 }
844 else if(OPAL_CRS_CONTINUE == state) {
845 first_continue_pass = !first_continue_pass;
846
847 if( !first_continue_pass ) {
848 if( opal_cr_timing_barrier_enabled ) {
849 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
850 if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
851 opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
852 return ret;
853 }
854 }
855 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
856 }
857
858 if (opal_cr_continue_like_restart && !first_continue_pass) {
859
860
861
862 procs = ompi_proc_all(&num_procs);
863 if(NULL == procs) {
864 return OMPI_ERR_OUT_OF_RESOURCE;
865 }
866
867
868
869
870
871
872
873
874 if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
875 opal_output(0,
876 "pml:ob1: ft_event(Restart): proc_refresh Failed %d",
877 ret);
878 for(p = 0; p < (int)num_procs; ++p) {
879 OBJ_RELEASE(procs[p]);
880 }
881 free (procs);
882 return ret;
883 }
884 }
885 }
886 else if(OPAL_CRS_RESTART_PRE == state ) {
887
888 }
889 else if(OPAL_CRS_RESTART == state ) {
890
891
892
893 procs = ompi_proc_all(&num_procs);
894 if(NULL == procs) {
895 return OMPI_ERR_OUT_OF_RESOURCE;
896 }
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913 if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
914 opal_output(0,
915 "pml:ob1: ft_event(Restart): proc_refresh Failed %d",
916 ret);
917 for(p = 0; p < (int)num_procs; ++p) {
918 OBJ_RELEASE(procs[p]);
919 }
920 free (procs);
921 return ret;
922 }
923 }
924 else if(OPAL_CRS_TERM == state ) {
925 ;
926 }
927 else {
928 ;
929 }
930
931
932
933
934
935
936 if( OMPI_SUCCESS != (ret = mca_bml.bml_ft_event(state))) {
937 opal_output(0, "pml:base: ft_event: BML ft_event function failed: %d\n",
938 ret);
939 }
940
941 if(OPAL_CRS_CHECKPOINT == state) {
942 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P1);
943
944 if( opal_cr_timing_barrier_enabled ) {
945 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR0);
946
947 }
948 }
949 else if(OPAL_CRS_CONTINUE == state) {
950 if( !first_continue_pass ) {
951 if( opal_cr_timing_barrier_enabled ) {
952 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
953 if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
954 opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
955 return ret;
956 }
957 }
958 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
959 }
960
961 if (opal_cr_continue_like_restart && !first_continue_pass) {
962 if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
963 opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
964 return ret;
965 }
966
967
968
969
970
971 if( OMPI_SUCCESS != (ret = mca_pml_ob1_add_procs(procs, num_procs) ) ) {
972 opal_output(0, "pml:ob1: ft_event(Restart): Failed in add_procs (%d)", ret);
973 return ret;
974 }
975
976
977 if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
978 opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
979 return ret;
980 }
981
982 if( NULL != procs ) {
983 for(p = 0; p < (int)num_procs; ++p) {
984 OBJ_RELEASE(procs[p]);
985 }
986 free(procs);
987 procs = NULL;
988 }
989 }
990 if( !first_continue_pass ) {
991 if( opal_cr_timing_barrier_enabled ) {
992 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
993 if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
994 opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
995 return ret;
996 }
997 }
998 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
999 }
1000 }
1001 else if(OPAL_CRS_RESTART_PRE == state ) {
1002
1003 }
1004 else if(OPAL_CRS_RESTART == state ) {
1005
1006
1007
1008
1009 if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
1010 opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
1011 return ret;
1012 }
1013
1014
1015
1016
1017
1018 if( OMPI_SUCCESS != (ret = mca_pml_ob1_add_procs(procs, num_procs) ) ) {
1019 opal_output(0, "pml:ob1: ft_event(Restart): Failed in add_procs (%d)", ret);
1020 return ret;
1021 }
1022
1023
1024 if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
1025 opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
1026 return ret;
1027 }
1028
1029 if( NULL != procs ) {
1030 for(p = 0; p < (int)num_procs; ++p) {
1031 OBJ_RELEASE(procs[p]);
1032 }
1033 free(procs);
1034 procs = NULL;
1035 }
1036 }
1037 else if(OPAL_CRS_TERM == state ) {
1038 ;
1039 }
1040 else {
1041 ;
1042 }
1043
1044 return OMPI_SUCCESS;
1045 }
1046 #endif
1047
1048 int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2)
1049 {
1050 const mca_pml_ob1_com_btl_t *b1 = (const mca_pml_ob1_com_btl_t *) v1;
1051 const mca_pml_ob1_com_btl_t *b2 = (const mca_pml_ob1_com_btl_t *) v2;
1052
1053 if(b1->bml_btl->btl_weight < b2->bml_btl->btl_weight)
1054 return 1;
1055 if(b1->bml_btl->btl_weight > b2->bml_btl->btl_weight)
1056 return -1;
1057
1058 return 0;
1059 }
1060