This source file includes the following definitions.
- lock_send_request
- unlock_send_request
- add_request_to_send_pending
- get_request_from_send_pending
- mca_pml_ob1_free_rdma_resources
- mca_pml_ob1_send_request_fini
- send_request_pml_complete
- send_request_pml_complete_check
- mca_pml_ob1_send_request_schedule_exclusive
- mca_pml_ob1_send_request_schedule
- mca_pml_ob1_send_request_start_btl
- mca_pml_ob1_send_request_start_seq
- mca_pml_ob1_send_request_start
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 #ifndef OMPI_PML_OB1_SEND_REQUEST_H
26 #define OMPI_PML_OB1_SEND_REQUEST_H
27
28 #include "opal/datatype/opal_convertor.h"
29 #include "opal/mca/mpool/base/base.h"
30 #include "ompi/mca/pml/base/pml_base_sendreq.h"
31 #include "pml_ob1_comm.h"
32 #include "pml_ob1_hdr.h"
33 #include "pml_ob1_rdma.h"
34 #include "pml_ob1_rdmafrag.h"
35 #include "ompi/mca/bml/bml.h"
36
37 BEGIN_C_DECLS
38
/**
 * Reason a send request is sitting on the mca_pml_ob1.send_pending list.
 *
 * The value is stored in the request's req_pending field when the request
 * is queued and is handed back to the caller (and reset to NONE) when the
 * request is taken off the list again.
 */
typedef enum {
    MCA_PML_OB1_SEND_PENDING_NONE     = 0,  /* not on the pending list */
    MCA_PML_OB1_SEND_PENDING_SCHEDULE = 1,
    MCA_PML_OB1_SEND_PENDING_START    = 2   /* used when the initial start found no usable BTL */
} mca_pml_ob1_send_pending_t;
44
45 struct mca_pml_ob1_send_request_t {
46 mca_pml_base_send_request_t req_send;
47 mca_bml_base_endpoint_t* req_endpoint;
48 opal_ptr_t req_recv;
49 opal_atomic_int32_t req_state;
50 opal_atomic_int32_t req_lock;
51 bool req_throttle_sends;
52 opal_atomic_int32_t req_pipeline_depth;
53 opal_atomic_size_t req_bytes_delivered;
54 uint32_t req_rdma_cnt;
55 mca_pml_ob1_send_pending_t req_pending;
56 opal_mutex_t req_send_range_lock;
57 opal_list_t req_send_ranges;
58 mca_pml_ob1_rdma_frag_t *rdma_frag;
59
60 mca_pml_ob1_com_btl_t req_rdma[];
61 };
62 typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
63
64 OBJ_CLASS_DECLARATION(mca_pml_ob1_send_request_t);
65
66 struct mca_pml_ob1_send_range_t {
67 opal_free_list_item_t base;
68 uint64_t range_send_offset;
69 uint64_t range_send_length;
70 int range_btl_idx;
71 int range_btl_cnt;
72
73 mca_pml_ob1_com_btl_t range_btls[];
74 };
75 typedef struct mca_pml_ob1_send_range_t mca_pml_ob1_send_range_t;
76 OBJ_CLASS_DECLARATION(mca_pml_ob1_send_range_t);
77
78 static inline bool lock_send_request(mca_pml_ob1_send_request_t *sendreq)
79 {
80 return OPAL_THREAD_ADD_FETCH32(&sendreq->req_lock, 1) == 1;
81 }
82
83 static inline bool unlock_send_request(mca_pml_ob1_send_request_t *sendreq)
84 {
85 return OPAL_THREAD_ADD_FETCH32(&sendreq->req_lock, -1) == 0;
86 }
87
88 static inline void
89 add_request_to_send_pending(mca_pml_ob1_send_request_t* sendreq,
90 const mca_pml_ob1_send_pending_t type,
91 const bool append)
92 {
93 opal_list_item_t *item = (opal_list_item_t*)sendreq;
94
95 OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
96 sendreq->req_pending = type;
97 if(append)
98 opal_list_append(&mca_pml_ob1.send_pending, item);
99 else
100 opal_list_prepend(&mca_pml_ob1.send_pending, item);
101
102 OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
103 mca_pml_ob1_enable_progress(1);
104 }
105
106 static inline mca_pml_ob1_send_request_t*
107 get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
108 {
109 mca_pml_ob1_send_request_t *sendreq;
110
111 OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
112 sendreq = (mca_pml_ob1_send_request_t*)
113 opal_list_remove_first(&mca_pml_ob1.send_pending);
114 if(sendreq) {
115 *type = sendreq->req_pending;
116 sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_NONE;
117 }
118 OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
119
120 return sendreq;
121 }
122
/**
 * Obtain a send request for a message to rank `dst` of communicator `comm`.
 *
 * The peer process is looked up with ompi_comm_peer_lookup().  When it is
 * found, a request is taken from mca_pml_base_send_requests with
 * opal_free_list_wait(), stored in `sendreq`, and the peer is recorded in
 * its req_send.req_base.req_proc field.  When the lookup yields NULL,
 * `sendreq` is set to NULL so callers can test the result without having
 * to pre-initialize the variable.
 *
 * @param comm     communicator used for the peer lookup
 * @param dst      destination rank
 * @param sendreq  [out] modifiable lvalue of type mca_pml_ob1_send_request_t*
 *
 * The macro-local variable has a deliberately unusual name so that it
 * cannot capture a caller argument (e.g. a caller variable called `proc`).
 * The expansion is kept as a plain brace block so existing call sites are
 * unaffected.
 */
#define MCA_PML_OB1_SEND_REQUEST_ALLOC( comm,                                   \
                                        dst,                                    \
                                        sendreq)                                \
    {                                                                           \
        ompi_proc_t *ob1_alloc_proc_ = ompi_comm_peer_lookup( (comm), (dst) );  \
                                                                                \
        if( OPAL_LIKELY(NULL != ob1_alloc_proc_) ) {                            \
            (sendreq) = (mca_pml_ob1_send_request_t*)                           \
                opal_free_list_wait (&mca_pml_base_send_requests);              \
            (sendreq)->req_send.req_base.req_proc = ob1_alloc_proc_;            \
        } else {                                                                \
            (sendreq) = NULL;                                                   \
        }                                                                       \
    }
135
136
/**
 * Initialize an ob1 send request.
 *
 * All arguments are forwarded unchanged to MCA_PML_BASE_SEND_REQUEST_INIT,
 * which operates on the embedded base request (&sendreq->req_send); the
 * base macro's final argument is always passed as 0.  Afterwards the
 * request's req_recv.pval is cleared.
 *
 * @param sendreq     pointer to the ob1 send request
 * @param buf, count, datatype, dst, tag, comm, sendmode, persistent
 *                    forwarded verbatim to the base initializer
 */
#define MCA_PML_OB1_SEND_REQUEST_INIT( sendreq, buf, count, datatype,   \
                                       dst, tag, comm, sendmode,        \
                                       persistent )                     \
    {                                                                   \
        MCA_PML_BASE_SEND_REQUEST_INIT( &(sendreq)->req_send,           \
                                        buf, count, datatype,           \
                                        dst, tag, comm,                 \
                                        sendmode, persistent,           \
                                        0 );                            \
        (sendreq)->req_recv.pval = NULL;                                \
    }
159
/**
 * Reset an ob1 send request by applying MCA_PML_BASE_SEND_REQUEST_RESET to
 * the base request embedded in it.
 */
#define MCA_PML_OB1_SEND_REQUEST_RESET(sendreq) \
    MCA_PML_BASE_SEND_REQUEST_RESET( &(sendreq)->req_send )
162
163 static inline void mca_pml_ob1_free_rdma_resources (mca_pml_ob1_send_request_t* sendreq)
164 {
165 size_t r;
166
167
168 for(r = 0; r < sendreq->req_rdma_cnt; r++) {
169 struct mca_btl_base_registration_handle_t *handle = sendreq->req_rdma[r].btl_reg;
170 mca_bml_base_btl_t *bml_btl = sendreq->req_rdma[r].bml_btl;
171
172 if (NULL != handle) {
173 mca_bml_base_deregister_mem (bml_btl, handle);
174 sendreq->req_rdma[r].btl_reg = NULL;
175 }
176 }
177 sendreq->req_rdma_cnt = 0;
178 }
179
180
181
182
183
184
/**
 * Start a send request and store the result of
 * mca_pml_ob1_send_request_start() in `rc`.
 *
 * @param sendreq  request to start
 * @param rc       [out] lvalue receiving the return code
 */
#define MCA_PML_OB1_SEND_REQUEST_START(sendreq, rc)             \
    do {                                                        \
        (rc) = mca_pml_ob1_send_request_start(sendreq);         \
    } while (0)
189
/**
 * Start a send request on a known endpoint with a caller-supplied sequence
 * number, storing the result of mca_pml_ob1_send_request_start_seq() in
 * `rc`.
 *
 * @param sendreq   request to start
 * @param endpoint  BML endpoint to send on
 * @param seq       sequence number to stamp on the request
 * @param rc        [out] lvalue receiving the return code
 */
#define MCA_PML_OB1_SEND_REQUEST_START_W_SEQ(sendreq, endpoint, seq, rc)          \
    do {                                                                          \
        (rc) = mca_pml_ob1_send_request_start_seq (sendreq, endpoint, seq);       \
    } while (0)
194
195
196
197
198
199
/**
 * Mark a send request as complete at the MPI level.
 *
 * Fills in the request status (MPI_SOURCE = the communicator's own rank,
 * MPI_TAG = the request tag, MPI_ERROR = OMPI_SUCCESS, _ucount = number of
 * packed bytes), emits the PERUSE_COMM_REQ_COMPLETE trace event and then
 * calls ompi_request_complete().
 *
 * @param sendreq      pointer expression designating the ob1 send request;
 *                     it is evaluated several times, so it must be free of
 *                     side effects
 * @param with_signal  forwarded to ompi_request_complete()
 *
 * Every use of `sendreq` is parenthesized, including the one handed to
 * PERUSE_TRACE_COMM_EVENT, so any pointer-valued expression (for example
 * `reqs + 1`) expands correctly.
 */
#define MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, with_signal)                  \
    do {                                                                             \
        (sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE =                \
            (sendreq)->req_send.req_base.req_comm->c_my_rank;                        \
        (sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG =                   \
            (sendreq)->req_send.req_base.req_tag;                                    \
        (sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS;   \
        (sendreq)->req_send.req_base.req_ompi.req_status._ucount =                   \
            (sendreq)->req_send.req_bytes_packed;                                    \
        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE,                           \
                                 &((sendreq)->req_send.req_base), PERUSE_SEND);      \
                                                                                     \
        ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), (with_signal) ); \
    } while(0)
214
215 static inline void mca_pml_ob1_send_request_fini (mca_pml_ob1_send_request_t *sendreq)
216 {
217
218 MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send));
219 assert( NULL == sendreq->rdma_frag );
220 }
221
222
223
224
225
/**
 * Give a send request back to the mca_pml_base_send_requests free list.
 *
 * The request is finalized with mca_pml_ob1_send_request_fini(), returned
 * to the free list, and the caller's pointer is then set to NULL.
 *
 * @param sendreq  modifiable lvalue holding the request pointer; it is
 *                 evaluated more than once and is NULL afterwards
 */
#define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq)                            \
    do {                                                                    \
        mca_pml_ob1_send_request_fini ((sendreq));                          \
        opal_free_list_return ( &mca_pml_base_send_requests,                \
                                (opal_free_list_item_t*)(sendreq) );        \
        (sendreq) = NULL;                                                   \
    } while(0)
233
234
235
236
237
238
239
240
241
242
243 static inline void
244 send_request_pml_complete(mca_pml_ob1_send_request_t *sendreq)
245 {
246 if(false == sendreq->req_send.req_base.req_pml_complete) {
247 if(sendreq->req_send.req_bytes_packed > 0) {
248 PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
249 &(sendreq->req_send.req_base), PERUSE_SEND);
250 }
251
252
253 mca_pml_ob1_free_rdma_resources(sendreq);
254
255 if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED &&
256 sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) {
257 mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq);
258 }
259
260 if (!sendreq->req_send.req_base.req_free_called) {
261 sendreq->req_send.req_base.req_pml_complete = true;
262
263 if( !REQUEST_COMPLETE( &((sendreq->req_send).req_base.req_ompi)) ) {
264
265 MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
266 } else {
267 if( MPI_SUCCESS != sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR ) {
268 ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_REQUEST);
269 }
270 }
271 } else {
272 MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq);
273 }
274 }
275 }
276
277
278 static inline bool
279 send_request_pml_complete_check(mca_pml_ob1_send_request_t *sendreq)
280 {
281 opal_atomic_rmb();
282
283
284
285
286
287
288 if(sendreq->req_state == 0 &&
289 sendreq->req_bytes_delivered >= sendreq->req_send.req_bytes_packed
290 && lock_send_request(sendreq)) {
291 send_request_pml_complete(sendreq);
292 return true;
293 }
294
295 return false;
296 }
297
298
299
300
301 int
302 mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t*);
303
304 static inline int
305 mca_pml_ob1_send_request_schedule_exclusive(mca_pml_ob1_send_request_t* sendreq)
306 {
307 int rc;
308 do {
309 rc = mca_pml_ob1_send_request_schedule_once(sendreq);
310 if(rc == OMPI_ERR_OUT_OF_RESOURCE)
311 break;
312 } while(!unlock_send_request(sendreq));
313
314 if(OMPI_SUCCESS == rc)
315 send_request_pml_complete_check(sendreq);
316
317 return rc;
318 }
319
320 static inline void
321 mca_pml_ob1_send_request_schedule(mca_pml_ob1_send_request_t* sendreq)
322 {
323
324
325
326
327
328
329
330 if(!lock_send_request(sendreq))
331 return;
332
333 mca_pml_ob1_send_request_schedule_exclusive(sendreq);
334 }
335
#if OPAL_CUDA_SUPPORT
/**
 * CUDA-specific start routine, only declared when OPAL_CUDA_SUPPORT is
 * enabled.  mca_pml_ob1_send_request_start_btl() calls it for messages
 * above the eager limit whose convertor needs buffers and carries the
 * CONVERTOR_CUDA flag.
 */
int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t *sendreq,
                                        mca_bml_base_btl_t *bml_btl,
                                        size_t size);
#endif
342
343
344
345
346
347 int mca_pml_ob1_send_request_start_buffered(
348 mca_pml_ob1_send_request_t* sendreq,
349 mca_bml_base_btl_t* bml_btl,
350 size_t size);
351
352 int mca_pml_ob1_send_request_start_copy(
353 mca_pml_ob1_send_request_t* sendreq,
354 mca_bml_base_btl_t* bml_btl,
355 size_t size);
356
357 int mca_pml_ob1_send_request_start_prepare(
358 mca_pml_ob1_send_request_t* sendreq,
359 mca_bml_base_btl_t* bml_btl,
360 size_t size);
361
362 int mca_pml_ob1_send_request_start_rdma(
363 mca_pml_ob1_send_request_t* sendreq,
364 mca_bml_base_btl_t* bml_btl,
365 size_t size);
366
367 int mca_pml_ob1_send_request_start_rndv(
368 mca_pml_ob1_send_request_t* sendreq,
369 mca_bml_base_btl_t* bml_btl,
370 size_t size,
371 int flags);
372
373 static inline int
374 mca_pml_ob1_send_request_start_btl( mca_pml_ob1_send_request_t* sendreq,
375 mca_bml_base_btl_t* bml_btl )
376 {
377 size_t size = sendreq->req_send.req_bytes_packed;
378 mca_btl_base_module_t* btl = bml_btl->btl;
379 size_t eager_limit = btl->btl_eager_limit - sizeof(mca_pml_ob1_hdr_t);
380 int rc;
381
382 #if OPAL_CUDA_GDR_SUPPORT
383 if (btl->btl_cuda_eager_limit && (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA)) {
384 eager_limit = btl->btl_cuda_eager_limit - sizeof(mca_pml_ob1_hdr_t);
385 }
386 #endif
387
388 if( OPAL_LIKELY(size <= eager_limit) ) {
389 switch(sendreq->req_send.req_send_mode) {
390 case MCA_PML_BASE_SEND_SYNCHRONOUS:
391 rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0);
392 break;
393 case MCA_PML_BASE_SEND_BUFFERED:
394 rc = mca_pml_ob1_send_request_start_copy(sendreq, bml_btl, size);
395 break;
396 case MCA_PML_BASE_SEND_COMPLETE:
397 rc = mca_pml_ob1_send_request_start_prepare(sendreq, bml_btl, size);
398 break;
399 default:
400 if (size != 0 && bml_btl->btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) {
401 rc = mca_pml_ob1_send_request_start_prepare(sendreq, bml_btl, size);
402 } else {
403 rc = mca_pml_ob1_send_request_start_copy(sendreq, bml_btl, size);
404 }
405 break;
406 }
407 } else {
408 size = eager_limit;
409 if(OPAL_UNLIKELY(btl->btl_rndv_eager_limit < eager_limit))
410 size = btl->btl_rndv_eager_limit;
411 if(sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) {
412 rc = mca_pml_ob1_send_request_start_buffered(sendreq, bml_btl, size);
413 } else if
414 (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
415 unsigned char *base;
416 opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
417
418 if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_btls(
419 sendreq->req_endpoint,
420 base,
421 sendreq->req_send.req_bytes_packed,
422 sendreq->req_rdma))) {
423 rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
424 sendreq->req_send.req_bytes_packed);
425 if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
426 mca_pml_ob1_free_rdma_resources(sendreq);
427 }
428 } else {
429 rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size,
430 MCA_PML_OB1_HDR_FLAGS_CONTIG);
431 }
432 } else {
433 #if OPAL_CUDA_SUPPORT
434 if (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) {
435 return mca_pml_ob1_send_request_start_cuda(sendreq, bml_btl, size);
436 }
437 #endif
438 rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0);
439 }
440 }
441
442 return rc;
443 }
444
445 static inline int
446 mca_pml_ob1_send_request_start_seq (mca_pml_ob1_send_request_t* sendreq, mca_bml_base_endpoint_t* endpoint, int32_t seqn)
447 {
448 sendreq->req_endpoint = endpoint;
449 sendreq->req_state = 0;
450 sendreq->req_lock = 0;
451 sendreq->req_pipeline_depth = 0;
452 sendreq->req_bytes_delivered = 0;
453 sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_NONE;
454 sendreq->req_send.req_base.req_sequence = seqn;
455
456 MCA_PML_BASE_SEND_START( &sendreq->req_send );
457
458 for(size_t i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
459 mca_bml_base_btl_t* bml_btl;
460 int rc;
461
462
463 bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
464 rc = mca_pml_ob1_send_request_start_btl(sendreq, bml_btl);
465 if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) )
466 return rc;
467 }
468 if(MCA_PML_BASE_SEND_BUFFERED == sendreq->req_send.req_send_mode &&
469 sendreq->req_send.req_addr == sendreq->req_send.req_base.req_addr) {
470
471
472 int rc;
473 rc = mca_pml_base_bsend_request_start((ompi_request_t*)sendreq);
474 if(OMPI_SUCCESS != rc){
475 return rc;
476 }
477 }
478 add_request_to_send_pending(sendreq, MCA_PML_OB1_SEND_PENDING_START, true);
479
480 return OMPI_SUCCESS;
481 }
482
483 static inline int
484 mca_pml_ob1_send_request_start( mca_pml_ob1_send_request_t* sendreq )
485 {
486 mca_bml_base_endpoint_t *endpoint = mca_bml_base_get_endpoint (sendreq->req_send.req_base.req_proc);
487 ompi_communicator_t *comm = sendreq->req_send.req_base.req_comm;
488 mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, sendreq->req_send.req_base.req_peer);
489 int32_t seqn;
490
491 if (OPAL_UNLIKELY(NULL == endpoint)) {
492 return OMPI_ERR_UNREACH;
493 }
494
495 seqn = OPAL_THREAD_ADD_FETCH32(&ob1_proc->send_sequence, 1);
496
497 return mca_pml_ob1_send_request_start_seq (sendreq, endpoint, seqn);
498 }
499
500
501
502
503
504 void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
505 mca_btl_base_module_t* btl,
506 mca_pml_ob1_rdma_hdr_t* hdr );
507
508 int mca_pml_ob1_send_request_put_frag(mca_pml_ob1_rdma_frag_t* frag);
509
510
511
512
513
514
515
516
517
518 void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl);
519
520 void mca_pml_ob1_send_request_copy_in_out(mca_pml_ob1_send_request_t *sendreq,
521 uint64_t send_offset, uint64_t send_length);
522
523 END_C_DECLS
524
525 #endif