This source file includes the following definitions.
- ompi_osc_rdma_cleanup_rdma
- ompi_osc_get_data_complete
- ompi_osc_get_data_blocking
- ompi_osc_rdma_master_noncontig
- ompi_osc_rdma_master
- ompi_osc_rdma_copy_local
- ompi_osc_rdma_put_complete
- ompi_osc_rdma_put_complete_flush
- ompi_osc_rdma_put_real
- ompi_osc_rdma_put_contig
- ompi_osc_rdma_get_complete
- ompi_osc_rdma_get_partial
- ompi_osc_rdma_get_contig
- ompi_osc_rdma_put_w_req
- ompi_osc_rdma_get_w_req
- ompi_osc_rdma_put
- ompi_osc_rdma_rput
- ompi_osc_rdma_get
- ompi_osc_rdma_rget
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 #include "osc_rdma_comm.h"
17 #include "osc_rdma_sync.h"
18 #include "osc_rdma_request.h"
19 #include "osc_rdma_dynamic.h"
20
21 #include "ompi/mca/osc/base/osc_base_obj_convert.h"
22 #include "opal/align.h"
23
24
25 static inline void ompi_osc_rdma_cleanup_rdma (ompi_osc_rdma_sync_t *sync, bool dec_always, ompi_osc_rdma_frag_t *frag,
26 mca_btl_base_registration_handle_t *handle, ompi_osc_rdma_request_t *request)
27 {
28 if (frag) {
29 ompi_osc_rdma_frag_complete (frag);
30 } else {
31 ompi_osc_rdma_deregister (sync->module, handle);
32 }
33
34 if (request) {
35 (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1);
36 }
37
38 if (dec_always) {
39 ompi_osc_rdma_sync_rdma_dec_always (sync);
40 } else {
41 ompi_osc_rdma_sync_rdma_dec (sync);
42 }
43 }
44
/* Forward declaration: ompi_osc_rdma_get_partial() calls this function before
 * its definition appears further down in this file. */
static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
                                     mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
                                     ompi_osc_rdma_request_t *request);
48
49 static void ompi_osc_get_data_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
50 void *local_address, mca_btl_base_registration_handle_t *local_handle,
51 void *context, void *data, int status)
52 {
53 assert (OPAL_SUCCESS == status);
54 ((bool *) context)[0] = true;
55 }
56
57 int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
58 uint64_t source_address, mca_btl_base_registration_handle_t *source_handle,
59 void *data, size_t len)
60 {
61 const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
62 mca_btl_base_registration_handle_t *local_handle = NULL;
63 ompi_osc_rdma_frag_t *frag = NULL;
64 volatile bool read_complete = false;
65 size_t aligned_len, offset;
66 uint64_t aligned_addr = source_address & ~btl_alignment_mask;
67 char *ptr = data;
68 int ret;
69
70 offset = source_address & btl_alignment_mask;
71 aligned_len = (len + offset + btl_alignment_mask) & ~btl_alignment_mask;
72
73 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "reading data from endpoint %p. source: 0x%" PRIx64 " (aligned: 0x%" PRIx64
74 "), len: %lu (aligned: %lu)", (void *) endpoint, source_address, aligned_addr, (unsigned long) len,
75 (unsigned long) aligned_len);
76
77 if (module->selected_btl->btl_register_mem && len >= module->selected_btl->btl_get_local_registration_threshold) {
78 do {
79 ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
80 if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret)) {
81 ompi_osc_rdma_progress (module);
82 }
83 } while (OMPI_ERR_OUT_OF_RESOURCE == ret);
84
85 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
86 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "error allocating temporary buffer");
87 return ret;
88 }
89
90 local_handle = frag->handle;
91 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocated temporary buffer %p in fragment %p", (void*)ptr,
92 (void *) frag);
93 }
94
95 assert (!(source_address & ALIGNMENT_MASK(module->selected_btl->btl_get_alignment)));
96
97 do {
98 ret = module->selected_btl->btl_get (module->selected_btl, endpoint, ptr, aligned_addr,
99 local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER,
100 ompi_osc_get_data_complete, (void *) &read_complete, NULL);
101 if (!ompi_osc_rdma_oor (ret)) {
102 break;
103 }
104
105 ompi_osc_rdma_progress (module);
106 } while (1);
107
108 if (OPAL_UNLIKELY(OMPI_SUCCESS > ret)) {
109 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "btl get failed with opal error code %d", ret);
110
111 if (frag) {
112 ompi_osc_rdma_frag_complete (frag);
113 }
114
115 return ret;
116 }
117
118
119 while (!read_complete) {
120 ompi_osc_rdma_progress (module);
121 }
122
123 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "finished reading state data from endpoint %p", (void *) endpoint);
124
125 opal_memchecker_base_mem_defined (ptr, len);
126
127 if (frag) {
128 memcpy (data, ptr + offset, len);
129
130
131 ompi_osc_rdma_frag_complete (frag);
132 }
133
134 return OMPI_SUCCESS;
135 }
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/**
 * Signature of a routine that transfers one contiguous region.
 *
 * ompi_osc_rdma_master() and ompi_osc_rdma_master_noncontig() receive a
 * function of this type (in this file: ompi_osc_rdma_put_contig or
 * ompi_osc_rdma_get_contig) and call it once per contiguous piece.
 */
typedef int (*ompi_osc_rdma_fn_t) (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t remote_address,
                                   mca_btl_base_registration_handle_t *remote_handle, void *local_address, size_t size,
                                   ompi_osc_rdma_request_t *request);
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178 static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *local_address, int local_count, ompi_datatype_t *local_datatype,
179 ompi_osc_rdma_peer_t *peer, uint64_t remote_address,
180 mca_btl_base_registration_handle_t *remote_handle, int remote_count,
181 ompi_datatype_t *remote_datatype, ompi_osc_rdma_request_t *request, const size_t max_rdma_len,
182 const ompi_osc_rdma_fn_t rdma_fn, const bool alloc_reqs)
183 {
184 ompi_osc_rdma_module_t *module = sync->module;
185 struct iovec local_iovec[OMPI_OSC_RDMA_DECODE_MAX], remote_iovec[OMPI_OSC_RDMA_DECODE_MAX];
186 opal_convertor_t local_convertor, remote_convertor;
187 uint32_t local_iov_count, remote_iov_count;
188 uint32_t local_iov_index, remote_iov_index;
189
190 size_t local_size, remote_size, rdma_len;
191 ompi_osc_rdma_request_t *subreq;
192 int ret;
193 bool done;
194
195 subreq = NULL;
196
197 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "scheduling rdma on non-contiguous datatype(s) or large region");
198
199
200
201 OBJ_CONSTRUCT(&remote_convertor, opal_convertor_t);
202 ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &remote_datatype->super, remote_count,
203 (void *) (intptr_t) remote_address, 0, &remote_convertor);
204 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
205 return ret;
206 }
207
208 OBJ_CONSTRUCT(&local_convertor, opal_convertor_t);
209 ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &local_datatype->super, local_count,
210 local_address, 0, &local_convertor);
211 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
212 return ret;
213 }
214
215 if (request) {
216
217 request->outstanding_requests = 1;
218 }
219
220 local_iov_index = 0;
221 local_iov_count = 0;
222
223 do {
224
225 remote_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
226 remote_iov_index = 0;
227
228
229 done = opal_convertor_raw (&remote_convertor, remote_iovec, &remote_iov_count, &remote_size);
230
231
232 while (remote_iov_index != remote_iov_count) {
233 if (local_iov_index == local_iov_count) {
234
235 local_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
236 local_iov_index = 0;
237 (void) opal_convertor_raw (&local_convertor, local_iovec, &local_iov_count, &local_size);
238 }
239
240
241 assert (0 != local_iov_count);
242
243
244 rdma_len = min(min(local_iovec[local_iov_index].iov_len, remote_iovec[remote_iov_index].iov_len), max_rdma_len);
245
246
247 if (!subreq && alloc_reqs) {
248 OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
249 subreq->internal = true;
250 subreq->type = OMPI_OSC_RDMA_TYPE_RDMA;
251 subreq->parent_request = request;
252
253 if (request) {
254 (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, 1);
255 }
256 } else if (!alloc_reqs) {
257 subreq = request;
258 }
259
260 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing rdma on contiguous region. local: %p, remote: %p, len: %lu",
261 local_iovec[local_iov_index].iov_base, remote_iovec[remote_iov_index].iov_base,
262 (unsigned long) remote_iovec[remote_iov_index].iov_len);
263
264 ret = rdma_fn (sync, peer, (uint64_t) (intptr_t) remote_iovec[remote_iov_index].iov_base, remote_handle,
265 local_iovec[local_iov_index].iov_base, rdma_len, subreq);
266 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
267 if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) {
268 if (request) {
269 ompi_osc_rdma_request_deref (request);
270 }
271
272 if (alloc_reqs) {
273 OMPI_OSC_RDMA_REQUEST_RETURN(subreq);
274 }
275
276
277 return ret;
278 }
279
280
281 ompi_osc_rdma_progress (module);
282 continue;
283 }
284 subreq = NULL;
285
286
287 local_iovec[local_iov_index].iov_len -= rdma_len;
288 remote_iovec[remote_iov_index].iov_len -= rdma_len;
289 local_iovec[local_iov_index].iov_base = (void *)((intptr_t) local_iovec[local_iov_index].iov_base + rdma_len);
290 remote_iovec[remote_iov_index].iov_base = (void *)((intptr_t) remote_iovec[remote_iov_index].iov_base + rdma_len);
291
292 local_iov_index += (0 == local_iovec[local_iov_index].iov_len);
293 remote_iov_index += (0 == remote_iovec[remote_iov_index].iov_len);
294 }
295 } while (!done);
296
297 if (request) {
298
299 ompi_osc_rdma_request_deref (request);
300 }
301
302 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "finished scheduling rdma on non-contiguous datatype(s)");
303
304
305 opal_convertor_cleanup (&local_convertor);
306 OBJ_DESTRUCT(&local_convertor);
307 opal_convertor_cleanup (&remote_convertor);
308 OBJ_DESTRUCT(&remote_convertor);
309
310 return OMPI_SUCCESS;
311 }
312
313 static inline int ompi_osc_rdma_master (ompi_osc_rdma_sync_t *sync, void *local_address, int local_count,
314 ompi_datatype_t *local_datatype, ompi_osc_rdma_peer_t *peer,
315 uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
316 int remote_count, ompi_datatype_t *remote_datatype,
317 ompi_osc_rdma_request_t *request, const size_t max_rdma_len,
318 const ompi_osc_rdma_fn_t rdma_fn, const bool alloc_reqs)
319 {
320 size_t rdma_len;
321 ptrdiff_t lb, extent;
322 int ret;
323
324 rdma_len = local_datatype->super.size * local_count;
325
326
327 if (OPAL_LIKELY(ompi_datatype_is_contiguous_memory_layout (local_datatype, local_count) &&
328 ompi_datatype_is_contiguous_memory_layout (remote_datatype, remote_count) &&
329 rdma_len <= max_rdma_len)) {
330 if (NULL == request && alloc_reqs) {
331 ompi_osc_rdma_module_t *module = sync->module;
332 OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request);
333 request->internal = true;
334 request->type = OMPI_OSC_RDMA_TYPE_RDMA;
335 }
336
337
338 (void) ompi_datatype_get_true_extent (local_datatype, &lb, &extent);
339 local_address = (void *)((intptr_t) local_address + lb);
340
341 (void) ompi_datatype_get_true_extent (remote_datatype, &lb, &extent);
342 remote_address += lb;
343
344 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing rdma on contiguous region. local: %p, "
345 "remote: 0x%lx, length: %lu", local_address, (unsigned long) remote_address,
346 rdma_len);
347
348 do {
349 ret = rdma_fn (sync, peer, remote_address, remote_handle, local_address, rdma_len, request);
350 if (OPAL_LIKELY(OPAL_SUCCESS == ret)) {
351 return OMPI_SUCCESS;
352 }
353
354 ompi_osc_rdma_progress (sync->module);
355 } while (1);
356 }
357
358 return ompi_osc_rdma_master_noncontig (sync, local_address, local_count, local_datatype, peer, remote_address,
359 remote_handle, remote_count, remote_datatype, request,
360 max_rdma_len, rdma_fn, alloc_reqs);
361 }
362
363 static int ompi_osc_rdma_copy_local (const void *source, int source_count, ompi_datatype_t *source_datatype,
364 void *target, int target_count, ompi_datatype_t *target_datatype,
365 ompi_osc_rdma_request_t *request)
366 {
367 int ret;
368
369 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing local copy from %p -> %p", source, target);
370
371 opal_atomic_mb ();
372 ret = ompi_datatype_sndrcv (source, source_count, source_datatype, target, target_count, target_datatype);
373
374 if (request) {
375 ompi_osc_rdma_request_complete (request, ret);
376 }
377
378 return ret;
379 }
380
381 static void ompi_osc_rdma_put_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
382 void *local_address, mca_btl_base_registration_handle_t *local_handle,
383 void *context, void *data, int status)
384 {
385 ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context;
386
387 assert (OPAL_SUCCESS == status);
388
389
390 if ((intptr_t) context & 0x1) {
391 ompi_osc_rdma_request_t *request = request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1);
392 sync = request->sync;
393
394
395 ompi_osc_rdma_request_complete (request, status);
396 }
397
398 OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "btl put complete on sync %p. local "
399 "address %p. opal status %d", (void *) sync, local_address, status);
400
401 if (data) {
402 ompi_osc_rdma_frag_complete ((ompi_osc_rdma_frag_t *) data);
403 } else if (local_handle) {
404 ompi_osc_rdma_deregister (sync->module, local_handle);
405 }
406
407 ompi_osc_rdma_sync_rdma_dec (sync);
408 }
409
410 static void ompi_osc_rdma_put_complete_flush (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
411 void *local_address, mca_btl_base_registration_handle_t *local_handle,
412 void *context, void *data, int status)
413 {
414 ompi_osc_rdma_module_t *module = (ompi_osc_rdma_module_t *) context;
415
416 assert (OPAL_SUCCESS == status);
417
418
419 if ((intptr_t) context & 0x1) {
420 ompi_osc_rdma_request_t *request = request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1);
421 module = request->module;
422
423
424 ompi_osc_rdma_request_complete (request, status);
425 }
426
427 OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "btl put complete on module %p. local "
428 "address %p. opal status %d", (void *) module, local_address, status);
429
430 if (data) {
431 ompi_osc_rdma_frag_complete ((ompi_osc_rdma_frag_t *) data);
432 } else if (local_handle) {
433 ompi_osc_rdma_deregister (module, local_handle);
434 }
435 }
436
437 static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
438 mca_btl_base_registration_handle_t *target_handle, void *ptr,
439 mca_btl_base_registration_handle_t *local_handle, size_t size,
440 mca_btl_base_rdma_completion_fn_t cb, void *context, void *cbdata) {
441 ompi_osc_rdma_module_t *module = sync->module;
442 int ret;
443
444 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating btl put of %lu bytes to remote address %" PRIx64 ", sync "
445 "object %p...", (unsigned long) size, target_address, (void *) sync);
446
447
448 ompi_osc_rdma_sync_rdma_inc (sync);
449
450 do {
451 ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, ptr, target_address,
452 local_handle, target_handle, size, 0, MCA_BTL_NO_ORDER,
453 cb, context, cbdata);
454 if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
455 return OMPI_SUCCESS;
456 }
457
458 ++module->put_retry_count;
459
460 if (!ompi_osc_rdma_oor (ret)) {
461 break;
462 }
463
464
465 ompi_osc_rdma_progress (module);
466 } while (1);
467
468 OSC_RDMA_VERBOSE(10, "btl put failed with opal error code %d", ret);
469
470 return ret;
471 }
472
473 int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
474 mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size,
475 ompi_osc_rdma_request_t *request)
476 {
477 ompi_osc_rdma_module_t *module = sync->module;
478 mca_btl_base_registration_handle_t *local_handle = NULL;
479 mca_btl_base_rdma_completion_fn_t cbfunc = NULL;
480 ompi_osc_rdma_frag_t *frag = NULL;
481 char *ptr = source_buffer;
482 void *cbcontext;
483 int ret;
484
485 if (module->selected_btl->btl_register_mem && size > module->selected_btl->btl_put_local_registration_threshold) {
486 ret = ompi_osc_rdma_frag_alloc (module, size, &frag, &ptr);
487 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
488 ret = ompi_osc_rdma_register (module, peer->data_endpoint, source_buffer, size, 0, &local_handle);
489 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
490 return ret;
491 }
492 } else {
493 memcpy (ptr, source_buffer, size);
494 local_handle = frag->handle;
495 }
496 }
497
498 if (ompi_osc_rdma_use_btl_flush (module)) {
499
500
501
502
503
504
505 cbcontext = (void *) module;
506 if (request || local_handle || frag) {
507 cbfunc = ompi_osc_rdma_put_complete_flush;
508 }
509
510 } else {
511 cbcontext = (void *) sync;
512 cbfunc = ompi_osc_rdma_put_complete;
513 }
514
515
516 if (request) {
517 (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, 1);
518 cbcontext = (void *) ((intptr_t) request | 1);
519 request->sync = sync;
520 }
521
522 ret = ompi_osc_rdma_put_real (sync, peer, target_address, target_handle, ptr, local_handle, size, cbfunc,
523 cbcontext, frag);
524 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
525 ompi_osc_rdma_cleanup_rdma (sync, false, frag, local_handle, request);
526 }
527
528 return ret;
529 }
530
531 static void ompi_osc_rdma_get_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
532 void *local_address, mca_btl_base_registration_handle_t *local_handle,
533 void *context, void *data, int status)
534 {
535 ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context;
536 intptr_t source = (intptr_t) local_address + request->offset;
537 ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data;
538 ompi_osc_rdma_sync_t *sync = request->sync;
539 void *origin_addr = request->origin_addr;
540
541 OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "btl get complete on sync %p. local "
542 "address %p. origin %p. opal status %d", (void *) sync, local_address, origin_addr, status);
543
544 assert (OPAL_SUCCESS == status);
545
546 if (request->buffer || frag) {
547 if (OPAL_LIKELY(OMPI_SUCCESS == status)) {
548 memcpy (origin_addr, (void *) source, request->len);
549 }
550 }
551
552 if (NULL == request->buffer) {
553
554 ompi_osc_rdma_sync_rdma_dec (sync);
555 } else {
556
557 ompi_osc_rdma_sync_rdma_dec_always (sync);
558 }
559
560 if (NULL != frag) {
561 ompi_osc_rdma_frag_complete (frag);
562 } else {
563 ompi_osc_rdma_deregister (sync->module, local_handle);
564 }
565
566 ompi_osc_rdma_request_complete (request, status);
567 }
568
569 static int ompi_osc_rdma_get_partial (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
570 mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
571 ompi_osc_rdma_request_t *request) {
572 ompi_osc_rdma_module_t *module = sync->module;
573 ompi_osc_rdma_request_t *subreq;
574 int ret;
575
576 OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
577 subreq->internal = true;
578 subreq->type = OMPI_OSC_RDMA_TYPE_RDMA;
579 subreq->parent_request = request;
580 (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, 1);
581
582 ret = ompi_osc_rdma_get_contig (sync, peer, source_address, source_handle, target_buffer, size, subreq);
583 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
584 OMPI_OSC_RDMA_REQUEST_RETURN(subreq);
585 ompi_osc_rdma_request_deref (request);
586 }
587
588 return ret;
589 }
590
591 static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
592 mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
593 ompi_osc_rdma_request_t *request)
594 {
595 ompi_osc_rdma_module_t *module = sync->module;
596 const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
597 mca_btl_base_registration_handle_t *local_handle = NULL;
598 ompi_osc_rdma_frag_t *frag = NULL;
599 osc_rdma_size_t aligned_len;
600 osc_rdma_base_t aligned_source_base, aligned_source_bound;
601 char *ptr = target_buffer;
602 bool counter_needs_inc = false;
603 int ret;
604
605 aligned_source_base = source_address & ~btl_alignment_mask;
606 aligned_source_bound = (source_address + size + btl_alignment_mask) & ~btl_alignment_mask;
607 aligned_len = aligned_source_bound - aligned_source_base;
608
609 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating get of %lu bytes from remote ptr %" PRIx64 " to local ptr %p",
610 size, source_address, target_buffer);
611
612 if ((module->selected_btl->btl_register_mem && size > module->selected_btl->btl_get_local_registration_threshold) ||
613 (((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
614
615 ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
616 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
617 if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) {
618
619 size_t subsize;
620
621 if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) {
622
623
624 aligned_source_base = OPAL_ALIGN(source_address, module->selected_btl->btl_get_alignment, osc_rdma_base_t);
625 subsize = (size_t) (aligned_source_base - source_address);
626
627 ret = ompi_osc_rdma_get_partial (sync, peer, source_address, source_handle, target_buffer, subsize, request);
628 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
629 return ret;
630 }
631
632 source_address += subsize;
633 target_buffer = (void *) ((intptr_t) target_buffer + subsize);
634 size -= subsize;
635
636 aligned_len = aligned_source_bound - aligned_source_base;
637 }
638
639 if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) &&
640 (size & btl_alignment_mask)) {
641
642
643 aligned_len = size & ~btl_alignment_mask;
644 subsize = size - aligned_len;
645 size = aligned_len;
646 ret = ompi_osc_rdma_get_partial (sync, peer, source_address + aligned_len, source_handle,
647 (void *) ((intptr_t) target_buffer + aligned_len), subsize, request);
648 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
649 return ret;
650 }
651 }
652
653 }
654
655 if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
656
657 request->buffer = ptr = malloc (aligned_len);
658 } else {
659 ptr = target_buffer;
660 }
661
662 if (NULL != ptr) {
663 (void) ompi_osc_rdma_register (module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE,
664 &local_handle);
665 }
666
667 if (OPAL_UNLIKELY(NULL == local_handle)) {
668 free (request->buffer);
669 request->buffer = NULL;
670 return ret;
671 }
672 } else {
673 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx",
674 (void*)ptr, (void *) frag, (unsigned long) aligned_len, (unsigned long) aligned_source_base);
675 local_handle = frag->handle;
676 }
677 }
678
679 request->offset = source_address - aligned_source_base;
680 request->len = size;
681 request->origin_addr = target_buffer;
682 request->sync = sync;
683
684 if (request->buffer) {
685
686
687 counter_needs_inc = true;
688 ompi_osc_rdma_sync_rdma_inc_always (sync);
689 } else {
690
691
692
693
694 ompi_osc_rdma_sync_rdma_inc (sync);
695 }
696
697 do {
698 ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr,
699 aligned_source_base, local_handle, source_handle,
700 aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete,
701 request, frag);
702 if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
703 return OMPI_SUCCESS;
704 }
705
706 ++module->get_retry_count;
707
708 if (!ompi_osc_rdma_oor (ret)) {
709 break;
710 }
711
712
713 for (int i = 0 ; i < 10 ; ++i) {
714 ompi_osc_rdma_progress (module);
715 }
716 } while (1);
717
718 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "btl get failed with opal error code %d", ret);
719
720 ompi_osc_rdma_cleanup_rdma (sync, counter_needs_inc, frag, local_handle, request);
721
722 return ret;
723 }
724
725 static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count,
726 ompi_datatype_t *origin_datatype, ompi_osc_rdma_peer_t *peer,
727 ptrdiff_t target_disp, int target_count,
728 ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request)
729 {
730 ompi_osc_rdma_module_t *module = sync->module;
731 mca_btl_base_registration_handle_t *target_handle;
732 uint64_t target_address;
733 int ret;
734
735
736 if (0 == origin_count || 0 == target_count) {
737 if (request) {
738 ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
739 }
740
741 return OMPI_SUCCESS;
742 }
743
744 ptrdiff_t len, offset;
745
746
747 len = opal_datatype_span(&target_datatype->super, target_count, &offset);
748
749
750
751 ret = osc_rdma_get_remote_segment (module, peer, target_disp, offset+len,
752 &target_address, &target_handle);
753 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
754 return ret;
755 }
756
757
758 if (ompi_osc_rdma_peer_local_base (peer)) {
759 return ompi_osc_rdma_copy_local (origin_addr, origin_count, origin_datatype, (void *) (intptr_t) target_address,
760 target_count, target_datatype, request);
761 }
762
763 return ompi_osc_rdma_master (sync, (void *) origin_addr, origin_count, origin_datatype, peer, target_address, target_handle,
764 target_count, target_datatype, request, module->selected_btl->btl_put_limit,
765 ompi_osc_rdma_put_contig, false);
766 }
767
768 static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype,
769 ompi_osc_rdma_peer_t *peer, ptrdiff_t source_disp, int source_count,
770 ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request)
771 {
772 ompi_osc_rdma_module_t *module = sync->module;
773 mca_btl_base_registration_handle_t *source_handle;
774 uint64_t source_address;
775 ptrdiff_t source_span, source_lb;
776 int ret;
777
778
779 if (0 == origin_count || 0 == source_count) {
780 if (request) {
781 ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
782 }
783
784 return OMPI_SUCCESS;
785 }
786
787
788
789 source_span = opal_datatype_span(&source_datatype->super, source_count, &source_lb);
790
791 ret = osc_rdma_get_remote_segment (module, peer, source_disp, source_span+source_lb,
792 &source_address, &source_handle);
793 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
794 return ret;
795 }
796
797
798 if (ompi_osc_rdma_peer_local_base (peer)) {
799 return ompi_osc_rdma_copy_local ((void *) (intptr_t) source_address, source_count, source_datatype,
800 origin_addr, origin_count, origin_datatype, request);
801 }
802
803 return ompi_osc_rdma_master (sync, origin_addr, origin_count, origin_datatype, peer, source_address,
804 source_handle, source_count, source_datatype, request,
805 module->selected_btl->btl_get_limit, ompi_osc_rdma_get_contig, true);
806 }
807 int ompi_osc_rdma_put (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype,
808 int target_rank, ptrdiff_t target_disp, int target_count,
809 ompi_datatype_t *target_datatype, ompi_win_t *win)
810 {
811 ompi_osc_rdma_module_t *module = GET_MODULE(win);
812 ompi_osc_rdma_peer_t *peer;
813 ompi_osc_rdma_sync_t *sync;
814
815 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "put: 0x%lx, %d, %s, %d, %d, %d, %s, %s", (unsigned long) origin_addr,
816 origin_count, origin_datatype->name, target_rank, (int) target_disp, target_count,
817 target_datatype->name, win->w_name);
818
819 sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
820 if (OPAL_UNLIKELY(NULL == sync)) {
821 return OMPI_ERR_RMA_SYNC;
822 }
823
824 return ompi_osc_rdma_put_w_req (sync, origin_addr, origin_count, origin_datatype, peer, target_disp,
825 target_count, target_datatype, NULL);
826 }
827
828 int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype,
829 int target_rank, ptrdiff_t target_disp, int target_count,
830 ompi_datatype_t *target_datatype, ompi_win_t *win,
831 ompi_request_t **request)
832 {
833 ompi_osc_rdma_module_t *module = GET_MODULE(win);
834 ompi_osc_rdma_peer_t *peer;
835 ompi_osc_rdma_request_t *rdma_request;
836 ompi_osc_rdma_sync_t *sync;
837 int ret;
838
839 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "rput: 0x%lx, %d, %s, %d, %d, %d, %s, %s", (unsigned long) origin_addr, origin_count,
840 origin_datatype->name, target_rank, (int) target_disp, target_count, target_datatype->name, win->w_name);
841
842 sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
843 if (OPAL_UNLIKELY(NULL == sync)) {
844 return OMPI_ERR_RMA_SYNC;
845 }
846
847 OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, rdma_request);
848
849 rdma_request->type = OMPI_OSC_RDMA_TYPE_PUT;
850
851 ret = ompi_osc_rdma_put_w_req (sync, origin_addr, origin_count, origin_datatype, peer, target_disp,
852 target_count, target_datatype, rdma_request);
853 if (OMPI_SUCCESS != ret) {
854 OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request);
855 return ret;
856 }
857
858 *request = (ompi_request_t *) rdma_request;
859
860 return OMPI_SUCCESS;
861 }
862
863 int ompi_osc_rdma_get (void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype,
864 int source_rank, ptrdiff_t source_disp, int source_count,
865 ompi_datatype_t *source_datatype, ompi_win_t *win)
866 {
867 ompi_osc_rdma_module_t *module = GET_MODULE(win);
868 ompi_osc_rdma_peer_t *peer;
869 ompi_osc_rdma_sync_t *sync;
870
871 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "get: 0x%lx, %d, %s, %d, %d, %d, %s, %s", (unsigned long) origin_addr,
872 origin_count, origin_datatype->name, source_rank, (int) source_disp, source_count,
873 source_datatype->name, win->w_name);
874
875 sync = ompi_osc_rdma_module_sync_lookup (module, source_rank, &peer);
876 if (OPAL_UNLIKELY(NULL == sync)) {
877 return OMPI_ERR_RMA_SYNC;
878 }
879
880 return ompi_osc_rdma_get_w_req (sync, origin_addr, origin_count, origin_datatype, peer,
881 source_disp, source_count, source_datatype, NULL);
882 }
883
884 int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype,
885 int source_rank, ptrdiff_t source_disp, int source_count,
886 ompi_datatype_t *source_datatype, ompi_win_t *win,
887 ompi_request_t **request)
888 {
889 ompi_osc_rdma_module_t *module = GET_MODULE(win);
890 ompi_osc_rdma_peer_t *peer;
891 ompi_osc_rdma_request_t *rdma_request;
892 ompi_osc_rdma_sync_t *sync;
893 int ret;
894
895 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "rget: 0x%lx, %d, %s, %d, %d, %d, %s, %s", (unsigned long) origin_addr,
896 origin_count, origin_datatype->name, source_rank, (int) source_disp, source_count,
897 source_datatype->name, win->w_name);
898
899 sync = ompi_osc_rdma_module_sync_lookup (module, source_rank, &peer);
900 if (OPAL_UNLIKELY(NULL == sync)) {
901 return OMPI_ERR_RMA_SYNC;
902 }
903
904 OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, rdma_request);
905
906 rdma_request->type = OMPI_OSC_RDMA_TYPE_GET;
907 ret = ompi_osc_rdma_get_w_req (sync, origin_addr, origin_count, origin_datatype, peer,
908 source_disp, source_count, source_datatype, rdma_request);
909 if (OMPI_SUCCESS != ret) {
910 OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request);
911 return ret;
912 }
913
914 *request = (ompi_request_t *) rdma_request;
915
916 return OMPI_SUCCESS;
917 }