This source file includes the following definitions.
- get_ptl_id
- to_mxm_mkey
- mca_spml_irkit_req_wait
- free_put_req
- alloc_put_req
- free_get_req
- alloc_get_req
- mca_spml_ikrit_cache_mkeys
- mca_spml_ikrit_get_mkey_slow
- mca_spml_ikrit_enable
- mxm_peer_construct
- mxm_peer_destruct
- mca_spml_ikrit_del_procs
- mca_spml_ikrit_add_procs
- mca_spml_ikrit_register
- mca_spml_ikrit_deregister
- mca_spml_ikrit_oob_get_mkeys
- mca_spml_ikrit_ctx_create
- mca_spml_ikrit_ctx_destroy
- mca_spml_ikrit_get_helper
- mca_spml_ikrit_get_shm
- mca_spml_ikrit_get_nb
- mca_spml_ikrit_get
- get_completion_cb
- mca_spml_ikrit_get_async
- fence_completion_cb
- mca_spml_ikrit_mxm_fence
- put_completion_cb
- mca_spml_ikrit_put_internal
- mca_spml_ikrit_put_simple
- mca_spml_ikrit_put_nb
- mca_spml_ikrit_put
- mca_spml_ikrit_fence
- mca_spml_ikrit_recv
- mca_spml_ikrit_send
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 #define _GNU_SOURCE
17 #include <stdio.h>
18
19 #include <sys/types.h>
20 #include <unistd.h>
21 #include <stdint.h>
22
23 #include "oshmem_config.h"
24 #include "opal/datatype/opal_convertor.h"
25 #include "opal/mca/memchecker/base/base.h"
26 #include "opal/util/show_help.h"
27 #include "oshmem/mca/spml/ikrit/spml_ikrit.h"
28 #include "oshmem/include/shmem.h"
29 #include "oshmem/mca/memheap/memheap.h"
30 #include "oshmem/mca/memheap/base/base.h"
31 #include "oshmem/proc/proc.h"
32 #include "oshmem/mca/spml/base/base.h"
33 #include "oshmem/mca/spml/base/spml_base_putreq.h"
34 #include "oshmem/runtime/runtime.h"
35 #include "oshmem/mca/sshmem/sshmem.h"
36
37 #include "oshmem/mca/spml/ikrit/spml_ikrit_component.h"
38
39
/* Extra put-completion diagnostics are compiled in only when this is 1. */
#ifndef SPML_IKRIT_PUT_DEBUG
#define SPML_IKRIT_PUT_DEBUG 0
#endif

/*
 * Post the MXM send request `sreq` (an lvalue that has an `opcode` member and
 * whose address is accepted by mxm_req_send()).
 *
 * On failure the error is logged, the job is aborted and the *enclosing*
 * function returns OSHMEM_ERROR, so the macro may only be used inside
 * functions that return int.
 *
 * The argument is parenthesized at each use so that any lvalue expression
 * (for example `*ptr`) expands with the intended precedence.
 */
#define SPML_IKRIT_MXM_POST_SEND(sreq)                                   \
    do {                                                                 \
        mxm_error_t err = mxm_req_send(&(sreq));                         \
        if (MXM_OK != err) {                                             \
            SPML_ERROR("mxm_req_send (op=%d) failed: %s - aborting",     \
                       (sreq).opcode,                                    \
                       mxm_error_string(err));                           \
            oshmem_shmem_abort(-1);                                      \
            return OSHMEM_ERROR;                                         \
        }                                                                \
    } while (0)
56
57 static int mca_spml_ikrit_get_async(void *src_addr,
58 size_t size,
59 void *dst_addr,
60 int src);
61
62 mca_spml_ikrit_ctx_t mca_spml_ikrit_ctx_default = { 0 };
63
64 struct mca_spml_ikrit_put_request {
65 opal_free_list_item_t link;
66 mxm_send_req_t mxm_req;
67 int pe;
68 };
69
70 typedef struct mca_spml_ikrit_put_request mca_spml_ikrit_put_request_t;
71
72
73 static inline int get_ptl_id(int dst)
74 {
75 return mca_spml_ikrit.mxm_peers[dst].ptl_id;
76 }
77
78 static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) {
79
80 if (0 == mkey->len) {
81 return &mxm_empty_mem_key;
82 }
83 return (mxm_mem_key_t *)mkey->u.data;
84 }
85
86 static inline void mca_spml_irkit_req_wait(mxm_req_base_t *req)
87 {
88 do {
89
90
91
92
93 opal_progress();
94 } while (!mxm_req_test(req));
95 }
96
97 static inline void free_put_req(mca_spml_ikrit_put_request_t *put_req)
98 {
99 opal_free_list_return (&mca_spml_base_put_requests,
100 (opal_free_list_item_t*)put_req);
101 opal_memchecker_base_mem_noaccess(put_req, sizeof(*put_req));
102 }
103
104 static inline mca_spml_ikrit_put_request_t *alloc_put_req(void)
105 {
106 mca_spml_ikrit_put_request_t *req;
107 opal_free_list_item_t* item;
108
109 item = opal_free_list_wait (&mca_spml_base_put_requests);
110 assert(item != NULL);
111
112 req = (mca_spml_ikrit_put_request_t *) item;
113 opal_memchecker_base_mem_undefined(req, sizeof(*req));
114
115 return req;
116 }
117
118
119 struct mca_spml_ikrit_get_request {
120 opal_free_list_item_t link;
121 mxm_send_req_t mxm_req;
122 };
123
124 typedef struct mca_spml_ikrit_get_request mca_spml_ikrit_get_request_t;
125
126 static inline void free_get_req(mca_spml_ikrit_get_request_t *get_req)
127 {
128 opal_free_list_return (&mca_spml_base_get_requests,
129 (opal_free_list_item_t*)get_req);
130 opal_memchecker_base_mem_noaccess(get_req, sizeof(*get_req));
131 }
132
133 static inline mca_spml_ikrit_get_request_t *alloc_get_req(void)
134 {
135 mca_spml_ikrit_get_request_t *req;
136 opal_free_list_item_t* item;
137
138 item = opal_free_list_wait (&mca_spml_base_get_requests);
139 assert(item != NULL);
140
141 req = (mca_spml_ikrit_get_request_t *) item;
142 opal_memchecker_base_mem_undefined(req, sizeof(*req));
143 return req;
144 }
145
146
147 int mca_spml_ikrit_put_simple(void* dst_addr,
148 size_t size,
149 void* src_addr,
150 int dst);
151
152 static void mca_spml_ikrit_cache_mkeys(sshmem_mkey_t *, uint32_t seg, int remote_pe, int tr_id);
153
154 static mxm_mem_key_t *mca_spml_ikrit_get_mkey_slow(int pe, void *va, int ptl_id, void **rva);
155
156 mca_spml_ikrit_t mca_spml_ikrit = {
157 {
158
159 mca_spml_ikrit_add_procs,
160 mca_spml_ikrit_del_procs,
161 mca_spml_ikrit_enable,
162 mca_spml_ikrit_register,
163 mca_spml_ikrit_deregister,
164 mca_spml_ikrit_oob_get_mkeys,
165 mca_spml_ikrit_ctx_create,
166 mca_spml_ikrit_ctx_destroy,
167 mca_spml_ikrit_put,
168 mca_spml_ikrit_put_nb,
169 mca_spml_ikrit_get,
170 mca_spml_ikrit_get_nb,
171 mca_spml_ikrit_recv,
172 mca_spml_ikrit_send,
173 mca_spml_base_wait,
174 mca_spml_base_wait_nb,
175 mca_spml_base_test,
176 mca_spml_ikrit_fence,
177 mca_spml_ikrit_fence,
178 mca_spml_ikrit_cache_mkeys,
179 mca_spml_base_rmkey_free,
180 mca_spml_base_rmkey_ptr,
181 mca_spml_base_memuse_hook,
182 mca_spml_base_put_all_nb,
183
184 (void*)&mca_spml_ikrit
185 },
186 mca_spml_ikrit_get_mkey_slow
187 };
188
189 static void mca_spml_ikrit_cache_mkeys(sshmem_mkey_t *mkey, uint32_t seg, int dst_pe, int tr_id)
190 {
191 mxm_peer_t *peer;
192
193 if (MXM_PTL_RDMA != tr_id) {
194 return;
195 }
196
197 peer = &mca_spml_ikrit.mxm_peers[dst_pe];
198 mkey_segment_init(&peer->mkeys[seg].super, mkey, seg);
199
200 if (0 != mkey->len) {
201 memcpy(&peer->mkeys[seg].key, mkey->u.data, mkey->len);
202 } else {
203 memcpy(&peer->mkeys[seg].key, &mxm_empty_mem_key, sizeof(mxm_empty_mem_key));
204 }
205 }
206
207 static
208 mxm_mem_key_t *mca_spml_ikrit_get_mkey_slow(int pe, void *va, int ptl_id, void **rva)
209 {
210 sshmem_mkey_t *mkey;
211
212 retry:
213 mkey = mca_memheap_base_get_cached_mkey(pe, va, ptl_id, rva);
214 if (NULL == mkey) {
215 SPML_ERROR("pe=%d: %p is not address of shared variable", pe, va);
216 oshmem_shmem_abort(-1);
217 return NULL;
218 }
219
220 if (MXM_PTL_SHM == ptl_id) {
221 if (mca_memheap_base_can_local_copy(mkey, va)) {
222 return NULL;
223 }
224
225
226
227
228 if (memheap_is_va_in_segment(va, HEAP_SEG_INDEX)) {
229 mca_spml_ikrit.mxm_peers[pe].ptl_id = MXM_PTL_RDMA;
230 }
231
232 ptl_id = MXM_PTL_RDMA;
233 goto retry;
234 }
235
236 return to_mxm_mkey(mkey);
237 }
238
239 int mca_spml_ikrit_enable(bool enable)
240 {
241 SPML_VERBOSE(50, "*** ikrit ENABLED ****");
242 if (false == enable) {
243 return OSHMEM_SUCCESS;
244 }
245
246 opal_free_list_init (&mca_spml_base_put_requests,
247 sizeof(mca_spml_ikrit_put_request_t),
248 opal_cache_line_size,
249 OBJ_CLASS(opal_free_list_item_t),
250 0,
251 opal_cache_line_size,
252 mca_spml_ikrit.free_list_num,
253 mca_spml_ikrit.free_list_max,
254 mca_spml_ikrit.free_list_inc,
255 NULL, 0, NULL, NULL, NULL);
256
257 opal_free_list_init (&mca_spml_base_get_requests,
258 sizeof(mca_spml_ikrit_get_request_t),
259 opal_cache_line_size,
260 OBJ_CLASS(opal_free_list_item_t),
261 0,
262 opal_cache_line_size,
263 mca_spml_ikrit.free_list_num,
264 mca_spml_ikrit.free_list_max,
265 mca_spml_ikrit.free_list_inc,
266 NULL, 0, NULL, NULL, NULL);
267
268 mca_spml_ikrit.enabled = true;
269
270 return OSHMEM_SUCCESS;
271 }
272
273 static void mxm_peer_construct(mxm_peer_t *p)
274 {
275 p->n_active_puts = 0;
276 p->need_fence = 0;
277 p->ptl_id = MXM_PTL_RDMA;
278 OBJ_CONSTRUCT(&p->link, opal_list_item_t);
279 }
280
281 static void mxm_peer_destruct(mxm_peer_t *p)
282 {
283 OBJ_DESTRUCT(&p->link);
284 }
285
286 int mca_spml_ikrit_del_procs(ompi_proc_t** procs, size_t nprocs)
287 {
288 size_t i, n;
289 int my_rank = oshmem_my_proc_id();
290
291 oshmem_shmem_barrier();
292 if (mca_spml_ikrit.bulk_disconnect) {
293 mxm_ep_powerdown(mca_spml_ikrit.mxm_ep);
294 }
295
296 while (NULL != opal_list_remove_first(&mca_spml_ikrit.active_peers)) {
297 };
298 OBJ_DESTRUCT(&mca_spml_ikrit.active_peers);
299
300 for (n = 0; n < nprocs; n++) {
301 i = (my_rank + n) % nprocs;
302 mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i].mxm_conn);
303 if (mca_spml_ikrit.hw_rdma_channel) {
304 assert(mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn != mca_spml_ikrit.mxm_peers[i].mxm_conn);
305 mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn);
306 }
307 mxm_peer_destruct(&mca_spml_ikrit.mxm_peers[i]);
308 }
309 free(mca_spml_ikrit.mxm_peers);
310
311 return OSHMEM_SUCCESS;
312 }
313
314 int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs)
315 {
316 spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL;
317 spml_ikrit_mxm_ep_conn_info_t *ep_hw_rdma_info = NULL;
318 spml_ikrit_mxm_ep_conn_info_t my_ep_info;
319 size_t mxm_addr_len = MXM_MAX_ADDR_LEN;
320 mxm_error_t err;
321 size_t i, n;
322 int rc = OSHMEM_ERROR;
323 ompi_proc_t *proc_self;
324 int my_rank = oshmem_my_proc_id();
325
326 OBJ_CONSTRUCT(&mca_spml_ikrit.active_peers, opal_list_t);
327
328 ep_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
329 if (NULL == ep_info) {
330 rc = OSHMEM_ERR_OUT_OF_RESOURCE;
331 goto bail;
332 }
333
334 if (mca_spml_ikrit.hw_rdma_channel) {
335 ep_hw_rdma_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
336 if (NULL == ep_hw_rdma_info) {
337 rc = OSHMEM_ERR_OUT_OF_RESOURCE;
338 goto bail;
339 }
340 }
341
342 mca_spml_ikrit.mxm_peers = (mxm_peer_t *) calloc(nprocs , sizeof(mxm_peer_t));
343 if (NULL == mca_spml_ikrit.mxm_peers) {
344 rc = OSHMEM_ERR_OUT_OF_RESOURCE;
345 goto bail;
346 }
347
348 memset(&my_ep_info, 0, sizeof(my_ep_info));
349
350 if (mca_spml_ikrit.hw_rdma_channel) {
351 err = mxm_ep_get_address(mca_spml_ikrit.mxm_hw_rdma_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len);
352 if (MXM_OK != err) {
353 opal_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true,
354 mxm_error_string(err));
355 rc = OSHMEM_ERROR;
356 goto bail;
357 }
358 oshmem_shmem_allgather(&my_ep_info, ep_hw_rdma_info,
359 sizeof(spml_ikrit_mxm_ep_conn_info_t));
360 }
361 err = mxm_ep_get_address(mca_spml_ikrit.mxm_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len);
362 if (MXM_OK != err) {
363 opal_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true,
364 mxm_error_string(err));
365 rc = OSHMEM_ERROR;
366 goto bail;
367 }
368
369 oshmem_shmem_allgather(&my_ep_info, ep_info,
370 sizeof(spml_ikrit_mxm_ep_conn_info_t));
371
372 opal_progress_register(spml_ikrit_progress);
373
374
375 for (n = 0; n < nprocs; ++n) {
376
377
378
379 i = (my_rank + n) % nprocs;
380 mxm_peer_construct(&mca_spml_ikrit.mxm_peers[i]);
381
382 err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_conn);
383 if (MXM_OK != err) {
384 SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
385 goto bail;
386 }
387 mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i].mxm_conn, &mca_spml_ikrit.mxm_peers[i]);
388 if (mca_spml_ikrit.hw_rdma_channel) {
389 err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep, ep_hw_rdma_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn);
390 if (MXM_OK != err) {
391 SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
392 goto bail;
393 }
394 } else {
395 mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn = mca_spml_ikrit.mxm_peers[i].mxm_conn;
396 }
397 }
398
399 if (ep_info)
400 free(ep_info);
401 if (ep_hw_rdma_info)
402 free(ep_hw_rdma_info);
403
404 if (mca_spml_ikrit.bulk_connect) {
405
406 oshmem_shmem_barrier();
407 mxm_ep_wireup(mca_spml_ikrit.mxm_ep);
408 }
409
410 proc_self = oshmem_proc_group_find(oshmem_group_all, my_rank);
411
412 for (i = 0; i < nprocs; i++) {
413 if (procs[i]->super.proc_name.jobid != proc_self->super.proc_name.jobid ||
414 !OPAL_PROC_ON_LOCAL_NODE(procs[i]->super.proc_flags)) {
415 continue;
416 }
417 if (procs[i] == proc_self)
418 continue;
419
420
421 mca_spml_ikrit.mxm_peers[i].ptl_id = MXM_PTL_SHM;
422 }
423
424 SPML_VERBOSE(50, "*** ADDED PROCS ***");
425 return OSHMEM_SUCCESS;
426
427 bail:
428 if (ep_info)
429 free(ep_info);
430 if (ep_hw_rdma_info)
431 free(ep_hw_rdma_info);
432 SPML_ERROR("add procs FAILED rc=%d", rc);
433
434 return rc;
435
436 }
437
438 sshmem_mkey_t *mca_spml_ikrit_register(void* addr,
439 size_t size,
440 uint64_t shmid,
441 int *count)
442 {
443 int i;
444 sshmem_mkey_t *mkeys;
445 mxm_error_t err;
446 mxm_mem_key_t *m_key;
447 int my_rank = oshmem_my_proc_id();
448
449 *count = 0;
450 mkeys = (sshmem_mkey_t *) calloc(1, MXM_PTL_LAST * sizeof(*mkeys));
451 if (!mkeys) {
452 return NULL ;
453 }
454
455 for (i = 0; i < MXM_PTL_LAST; i++) {
456 mkeys[i].u.key = MAP_SEGMENT_SHM_INVALID;
457 switch (i) {
458 case MXM_PTL_SHM:
459 if ((int)shmid != MAP_SEGMENT_SHM_INVALID) {
460 mkeys[i].u.key = shmid;
461 mkeys[i].va_base = 0;
462 } else {
463 mkeys[i].len = 0;
464 mkeys[i].va_base = addr;
465 }
466 mkeys[i].spml_context = 0;
467 break;
468 case MXM_PTL_RDMA:
469 mkeys[i].va_base = addr;
470 mkeys[i].spml_context = 0;
471
472 if (mca_spml_ikrit.ud_only) {
473 mkeys[i].len = 0;
474 break;
475 }
476
477 err = mxm_mem_map(mca_spml_ikrit.mxm_context, &addr, &size, 0, 0, 0);
478 if (MXM_OK != err) {
479 SPML_ERROR("Failed to register memory: %s", mxm_error_string(err));
480 goto error_out;
481 }
482 mkeys[i].spml_context = (void *)(unsigned long)size;
483
484 m_key = malloc(sizeof(*m_key));
485 if (NULL == m_key) {
486 SPML_ERROR("Failed to allocate m_key memory");
487 goto error_out;
488 }
489 mkeys[i].len = sizeof(*m_key);
490 mkeys[i].u.data = m_key;
491
492 err = mxm_mem_get_key(mca_spml_ikrit.mxm_context, addr, m_key);
493 if (MXM_OK != err) {
494 SPML_ERROR("Failed to get memory key: %s", mxm_error_string(err));
495 goto error_out;
496 }
497 break;
498
499 default:
500 SPML_ERROR("unsupported PTL: %d", i);
501 goto error_out;
502 }
503 SPML_VERBOSE(5,
504 "rank %d ptl %d addr %p size %llu %s",
505 my_rank, i, addr, (unsigned long long)size,
506 mca_spml_base_mkey2str(&mkeys[i]));
507
508 mca_spml_ikrit_cache_mkeys(&mkeys[i], memheap_find_segnum(addr), my_rank, i);
509 }
510 *count = MXM_PTL_LAST;
511
512 return mkeys;
513
514 error_out:
515 mca_spml_ikrit_deregister(mkeys);
516
517 return NULL;
518 }
519
520 int mca_spml_ikrit_deregister(sshmem_mkey_t *mkeys)
521 {
522 int i;
523
524 MCA_SPML_CALL(fence(oshmem_ctx_default));
525 if (!mkeys)
526 return OSHMEM_SUCCESS;
527
528 for (i = 0; i < MXM_PTL_LAST; i++) {
529 switch (i) {
530 case MXM_PTL_SHM:
531 break;
532 case MXM_PTL_RDMA:
533
534 if (!mkeys[i].spml_context)
535 break;
536 mxm_mem_unmap(mca_spml_ikrit.mxm_context,
537 (void *)mkeys[i].va_base,
538 (unsigned long)mkeys[i].spml_context,
539 0);
540 if (0 < mkeys[i].len) {
541 free(mkeys[i].u.data);
542 }
543 break;
544 }
545 }
546 free(mkeys);
547
548 return OSHMEM_SUCCESS;
549
550 }
551
552 int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys)
553 {
554 int ptl;
555
556 ptl = get_ptl_id(pe);
557 if (ptl < 0)
558 return OSHMEM_ERROR;
559
560 if (ptl != MXM_PTL_RDMA)
561 return OSHMEM_ERROR;
562
563
564
565
566 if (mca_spml_ikrit.ud_only) {
567
568 mkeys[ptl].len = 0;
569 mkeys[ptl].va_base = mca_memheap_seg2base_va(seg);
570 mkeys[ptl].u.key = MAP_SEGMENT_SHM_INVALID;
571 mca_spml_ikrit_cache_mkeys(&mkeys[ptl], seg, pe, ptl);
572 return OSHMEM_SUCCESS;
573 }
574
575 return OSHMEM_ERROR;
576 }
577
578 int mca_spml_ikrit_ctx_create(long options, shmem_ctx_t *ctx)
579 {
580 int rc = OSHMEM_SUCCESS;
581 mca_spml_ikrit_ctx_t *ctxp = malloc(sizeof(mca_spml_ikrit_ctx_t));
582 *ctx = (shmem_ctx_t)ctxp;
583 return rc;
584 }
585
586 void mca_spml_ikrit_ctx_destroy(shmem_ctx_t ctx)
587 {
588 free(ctx);
589 }
590
591 static inline int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq,
592 void *src_addr,
593 size_t size,
594 void *dst_addr,
595 int src)
596 {
597
598
599 void *rva;
600 mxm_mem_key_t *mkey;
601
602 mkey = mca_spml_ikrit_get_mkey(src, src_addr, MXM_PTL_RDMA, &rva, &mca_spml_ikrit);
603
604 SPML_VERBOSE_FASTPATH(100,
605 "get: pe:%d ptl=%d src=%p -> dst: %p sz=%d. src_rva=%p",
606 src, MXM_PTL_RDMA, src_addr, dst_addr, (int)size, (void *)rva);
607
608
609 sreq->base.mq = mca_spml_ikrit.mxm_mq;
610 sreq->base.conn = mca_spml_ikrit.mxm_peers[src].mxm_conn;
611 sreq->base.data_type = MXM_REQ_DATA_BUFFER;
612 sreq->base.data.buffer.ptr = dst_addr;
613 sreq->base.data.buffer.length = size;
614 sreq->op.mem.remote_mkey = mkey;
615 sreq->opcode = MXM_REQ_OP_GET;
616 sreq->op.mem.remote_vaddr = (intptr_t) rva;
617 sreq->base.state = MXM_REQ_NEW;
618
619 return OSHMEM_SUCCESS;
620 }
621
622 static inline int mca_spml_ikrit_get_shm(void *src_addr,
623 size_t size,
624 void *dst_addr,
625 int src)
626 {
627 int ptl_id;
628 void *rva;
629
630 ptl_id = get_ptl_id(src);
631
632
633
634 if (ptl_id != MXM_PTL_SHM)
635 return OSHMEM_ERROR;
636
637 if (NULL != mca_spml_ikrit_get_mkey(src, src_addr, MXM_PTL_SHM, &rva, &mca_spml_ikrit))
638 return OSHMEM_ERROR;
639
640 SPML_VERBOSE_FASTPATH(100,
641 "shm get: pe:%d src=%p -> dst: %p sz=%d. src_rva=%p",
642 src, src_addr, dst_addr, (int)size, (void *)rva);
643
644 memcpy(dst_addr, (void *) (unsigned long) rva, size);
645 opal_progress();
646 return OSHMEM_SUCCESS;
647 }
648
649 int mca_spml_ikrit_get_nb(shmem_ctx_t ctx,
650 void* src_addr,
651 size_t size,
652 void* dst_addr,
653 int src,
654 void **handle)
655 {
656 return mca_spml_ikrit_get_async(src_addr, size, dst_addr, src);
657 }
658
659 int mca_spml_ikrit_get(shmem_ctx_t ctx, void *src_addr, size_t size, void *dst_addr, int src)
660 {
661 mxm_send_req_t sreq;
662
663 if (0 >= size) {
664 return OSHMEM_SUCCESS;
665 }
666
667 if (OSHMEM_SUCCESS == mca_spml_ikrit_get_shm(src_addr, size, dst_addr, src))
668 return OSHMEM_SUCCESS;
669
670 if (OSHMEM_SUCCESS
671 != mca_spml_ikrit_get_helper(&sreq,
672 src_addr,
673 size,
674 dst_addr,
675 src)) {
676 oshmem_shmem_abort(-1);
677 return OSHMEM_ERROR;
678 }
679
680 sreq.base.completed_cb = NULL;
681 sreq.flags = 0;
682
683 SPML_IKRIT_MXM_POST_SEND(sreq);
684
685 mca_spml_irkit_req_wait(&sreq.base);
686 if (MXM_OK != sreq.base.error) {
687 SPML_ERROR("get request failed: %s - aborting",
688 mxm_error_string(sreq.base.error));
689 oshmem_shmem_abort(-1);
690 return OSHMEM_ERROR;
691 }
692 return OSHMEM_SUCCESS;
693 }
694
695 static inline void get_completion_cb(void *ctx)
696 {
697 mca_spml_ikrit_get_request_t *get_req = (mca_spml_ikrit_get_request_t *) ctx;
698
699 OPAL_THREAD_ADD_FETCH32(&mca_spml_ikrit.n_active_gets, -1);
700 free_get_req(get_req);
701 }
702
703 static inline int mca_spml_ikrit_get_async(void *src_addr,
704 size_t size,
705 void *dst_addr,
706 int src)
707 {
708 mca_spml_ikrit_get_request_t *get_req;
709
710 if (OSHMEM_SUCCESS == mca_spml_ikrit_get_shm(src_addr, size, dst_addr, src))
711 return OSHMEM_SUCCESS;
712
713 get_req = alloc_get_req();
714
715 if (OSHMEM_SUCCESS != mca_spml_ikrit_get_helper(&get_req->mxm_req,
716 src_addr,
717 size,
718 dst_addr,
719 src)) {
720 oshmem_shmem_abort(-1);
721 return OSHMEM_ERROR;
722 }
723
724 get_req->mxm_req.flags = 0;
725 get_req->mxm_req.base.completed_cb = get_completion_cb;
726 get_req->mxm_req.base.context = get_req;
727 OPAL_THREAD_ADD_FETCH32(&mca_spml_ikrit.n_active_gets, 1);
728
729 SPML_IKRIT_MXM_POST_SEND(get_req->mxm_req);
730
731 return OSHMEM_SUCCESS;
732 }
733
734 static inline void fence_completion_cb(void *ctx)
735 {
736 mca_spml_ikrit_get_request_t *fence_req =
737 (mca_spml_ikrit_get_request_t *) ctx;
738
739 OPAL_THREAD_ADD_FETCH32(&mca_spml_ikrit.n_mxm_fences, -1);
740 free_get_req(fence_req);
741 }
742
743 static int mca_spml_ikrit_mxm_fence(int dst)
744 {
745 mca_spml_ikrit_get_request_t *fence_req;
746
747 fence_req = alloc_get_req();
748
749 fence_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq;
750 fence_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn;
751 fence_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC;
752 fence_req->mxm_req.flags = MXM_REQ_SEND_FLAG_FENCE;
753 fence_req->mxm_req.op.mem.remote_vaddr = 0;
754 fence_req->mxm_req.op.mem.remote_mkey = &mxm_empty_mem_key;
755 fence_req->mxm_req.base.data_type = MXM_REQ_DATA_BUFFER;
756 fence_req->mxm_req.base.data.buffer.ptr = 0;
757 fence_req->mxm_req.base.data.buffer.length = 0;
758 fence_req->mxm_req.base.state = MXM_REQ_NEW;
759 fence_req->mxm_req.base.completed_cb = fence_completion_cb;
760 fence_req->mxm_req.base.context = fence_req;
761 OPAL_THREAD_ADD_FETCH32(&mca_spml_ikrit.n_mxm_fences, 1);
762
763 SPML_IKRIT_MXM_POST_SEND(fence_req->mxm_req);
764 return OSHMEM_SUCCESS;
765 }
766
767 static inline void put_completion_cb(void *ctx)
768 {
769 mca_spml_ikrit_put_request_t *put_req = (mca_spml_ikrit_put_request_t *) ctx;
770 mxm_peer_t *peer;
771
772 OPAL_THREAD_ADD_FETCH32(&mca_spml_ikrit.n_active_puts, -1);
773
774 peer = &mca_spml_ikrit.mxm_peers[put_req->pe];
775
776
777 #if SPML_IKRIT_PUT_DEBUG == 1
778 if (peer) {
779 if (peer->n_active_puts <= 0) {
780
781 SPML_VERBOSE(1, "pe %d n_active_puts %d", put_req->pe, peer->n_active_puts);
782 }
783 }
784
785 if (put_req->mxm_req.base.state != MXM_REQ_COMPLETED)
786 SPML_ERROR("oops: pe %d uncompleted request state %d", put_req->pe, put_req->mxm_req.base.state);
787 #endif
788
789 if (0 < peer->n_active_puts) {
790 peer->n_active_puts--;
791 if (0 == peer->n_active_puts &&
792 (put_req->mxm_req.opcode == MXM_REQ_OP_PUT_SYNC)) {
793 opal_list_remove_item(&mca_spml_ikrit.active_peers, &peer->link);
794 peer->need_fence = 0;
795 }
796 }
797
798 free_put_req(put_req);
799 }
800
801
802
803
804 static inline int mca_spml_ikrit_put_internal(void* dst_addr,
805 size_t size,
806 void* src_addr,
807 int dst,
808 void **handle,
809 int zcopy)
810 {
811 void *rva;
812 mca_spml_ikrit_put_request_t *put_req;
813 int ptl_id;
814 static int count;
815 int need_progress = 0;
816 mxm_mem_key_t *mkey;
817
818 if (OPAL_UNLIKELY(0 >= size)) {
819 return OSHMEM_SUCCESS;
820 }
821
822 ptl_id = get_ptl_id(dst);
823 mkey = mca_spml_ikrit_get_mkey(dst, dst_addr, ptl_id, &rva, &mca_spml_ikrit);
824
825 if (OPAL_UNLIKELY(NULL == mkey)) {
826 memcpy((void *) (unsigned long) rva, src_addr, size);
827
828 if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0)
829 mxm_progress(mca_spml_ikrit.mxm_context);
830 return OSHMEM_SUCCESS;
831 }
832
833 SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s",
834 dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva);
835
836 put_req = alloc_put_req();
837
838 if (handle)
839 *handle = put_req;
840
841
842 put_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq;
843
844
845 put_req->mxm_req.flags = 0;
846 if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER ||
847 (int)opal_list_get_size(&mca_spml_ikrit.active_peers) > mca_spml_ikrit.unsync_conn_max ||
848 (mca_spml_ikrit.mxm_peers[dst].n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) {
849 need_progress = 1;
850 put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC;
851 } else {
852 put_req->mxm_req.opcode = MXM_REQ_OP_PUT;
853 }
854 if (!zcopy) {
855 if (size < mca_spml_ikrit.put_zcopy_threshold) {
856 put_req->mxm_req.flags |= MXM_REQ_SEND_FLAG_BLOCKING;
857 } else {
858 put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC;
859 }
860 }
861
862 put_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn;
863 put_req->mxm_req.base.data_type = MXM_REQ_DATA_BUFFER;
864 put_req->mxm_req.base.data.buffer.ptr = src_addr;
865 put_req->mxm_req.base.data.buffer.length = size;
866 put_req->mxm_req.base.completed_cb = put_completion_cb;
867 put_req->mxm_req.base.context = put_req;
868 put_req->mxm_req.op.mem.remote_vaddr = (intptr_t) rva;
869 put_req->mxm_req.base.state = MXM_REQ_NEW;
870 put_req->pe = dst;
871
872 put_req->mxm_req.op.mem.remote_mkey = mkey;
873
874 OPAL_THREAD_ADD_FETCH32(&mca_spml_ikrit.n_active_puts, 1);
875 if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) {
876 opal_list_append(&mca_spml_ikrit.active_peers,
877 &mca_spml_ikrit.mxm_peers[dst].link);
878 mca_spml_ikrit.mxm_peers[dst].need_fence = 1;
879 }
880
881 mca_spml_ikrit.mxm_peers[dst].n_active_puts++;
882
883 SPML_IKRIT_MXM_POST_SEND(put_req->mxm_req);
884
885 if (need_progress)
886 mxm_progress(mca_spml_ikrit.mxm_context);
887
888 return OSHMEM_SUCCESS;
889 }
890
891
892
893
894
895
896
897 int mca_spml_ikrit_put_simple(void* dst_addr,
898 size_t size,
899 void* src_addr,
900 int dst)
901 {
902 void *rva;
903 mxm_send_req_t mxm_req;
904 mxm_wait_t wait;
905 int ptl_id;
906 mxm_mem_key_t *mkey;
907 static int count;
908
909 ptl_id = get_ptl_id(dst);
910 mkey = mca_spml_ikrit_get_mkey(dst, dst_addr, ptl_id, &rva, &mca_spml_ikrit);
911
912 SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s",
913 dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva);
914
915 if (NULL == mkey) {
916 memcpy((void *) (unsigned long) rva, src_addr, size);
917
918 if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0)
919 mxm_progress(mca_spml_ikrit.mxm_context);
920 return OSHMEM_SUCCESS;
921 }
922
923 SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s",
924 dst, MXM_PTL_RDMA, dst_addr, src_addr, (int)size, (void *)rva);
925
926
927 mxm_req.base.mq = mca_spml_ikrit.mxm_mq;
928 mxm_req.flags = MXM_REQ_SEND_FLAG_BLOCKING;
929 mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn;
930 mxm_req.base.data_type = MXM_REQ_DATA_BUFFER;
931 mxm_req.base.data.buffer.ptr = src_addr;
932 mxm_req.base.data.buffer.length = size;
933 mxm_req.base.completed_cb = 0;
934 mxm_req.base.context = 0;
935 mxm_req.opcode = MXM_REQ_OP_PUT;
936 mxm_req.op.mem.remote_vaddr = (intptr_t) rva;
937 mxm_req.base.state = MXM_REQ_NEW;
938 mxm_req.base.error = MXM_OK;
939
940 mxm_req.op.mem.remote_mkey = mkey;
941
942 if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) {
943 opal_list_append(&mca_spml_ikrit.active_peers,
944 &mca_spml_ikrit.mxm_peers[dst].link);
945 mca_spml_ikrit.mxm_peers[dst].need_fence = 1;
946 }
947
948 SPML_IKRIT_MXM_POST_SEND(mxm_req);
949
950 wait.req = &mxm_req.base;
951 wait.state = (mxm_req_state_t)(MXM_REQ_SENT | MXM_REQ_COMPLETED);
952 wait.progress_cb = NULL;
953 wait.progress_arg = NULL;
954 mxm_wait(&wait);
955
956 return OSHMEM_SUCCESS;
957 }
958
959 int mca_spml_ikrit_put_nb(shmem_ctx_t ctx,
960 void* dst_addr,
961 size_t size,
962 void* src_addr,
963 int dst,
964 void **handle)
965 {
966 int err;
967 err = mca_spml_ikrit_put_internal(dst_addr, size, src_addr, dst, handle, 1);
968 if (OSHMEM_SUCCESS != err) {
969 SPML_ERROR("put failed - aborting");
970 oshmem_shmem_abort(-1);
971 return OSHMEM_ERROR;
972 }
973 return OSHMEM_SUCCESS;
974 }
975
976 int mca_spml_ikrit_put(shmem_ctx_t ctx, void* dst_addr, size_t size, void* src_addr, int dst)
977 {
978 int err;
979 mca_spml_ikrit_put_request_t *put_req;
980 mxm_wait_t wait;
981
982 put_req = 0;
983 err = mca_spml_ikrit_put_internal(dst_addr,
984 size,
985 src_addr,
986 dst,
987 (void **) &put_req,
988 0);
989 if (OSHMEM_SUCCESS != err) {
990 SPML_ERROR("put failed - aborting");
991 oshmem_shmem_abort(-1);
992 return OSHMEM_ERROR;
993 }
994 if (!put_req)
995 return OSHMEM_SUCCESS;
996
997 wait.req = &put_req->mxm_req.base;
998 wait.state = (mxm_req_state_t)(MXM_REQ_SENT | MXM_REQ_COMPLETED);
999 wait.progress_cb = NULL;
1000 wait.progress_arg = NULL;
1001 mxm_wait(&wait);
1002
1003 return OSHMEM_SUCCESS;
1004 }
1005
1006
1007 int mca_spml_ikrit_fence(shmem_ctx_t ctx)
1008 {
1009 mxm_peer_t *peer;
1010 opal_list_item_t *item;
1011
1012 SPML_VERBOSE(20,
1013 "Into fence with %d active puts on %d pes",
1014 mca_spml_ikrit.n_active_puts, (int)opal_list_get_size(&mca_spml_ikrit.active_peers));
1015
1016
1017
1018 while (NULL != (item = opal_list_remove_first(&mca_spml_ikrit.active_peers))) {
1019 peer = spml_ikrit_container_of(item, mxm_peer_t, link);
1020 peer->n_active_puts = 0;
1021 peer->need_fence = 0;
1022 mca_spml_ikrit_mxm_fence(peer - mca_spml_ikrit.mxm_peers);
1023 }
1024
1025 while (0 < mca_spml_ikrit.n_mxm_fences || 0 < mca_spml_ikrit.n_active_gets) {
1026 opal_progress();
1027 }
1028
1029 SPML_VERBOSE(20, "fence completed");
1030 return OSHMEM_SUCCESS;
1031 }
1032
1033
1034 int mca_spml_ikrit_recv(void* buf, size_t size, int src)
1035 {
1036 mxm_error_t ret = MXM_OK;
1037 mxm_recv_req_t req;
1038 char dummy_buf[1];
1039
1040
1041 SPML_VERBOSE(100,
1042 "want to recv from src %d, size %d buf %p",
1043 src, (int)size, buf);
1044 req.tag = src == SHMEM_ANY_SOURCE ? 0 : src;
1045 req.tag_mask = src == SHMEM_ANY_SOURCE ? 0 : 0xFFFFFFFF;
1046
1047 req.base.state = MXM_REQ_NEW;
1048 req.base.mq = mca_spml_ikrit.mxm_mq;
1049 req.base.conn = NULL;
1050 req.base.completed_cb = NULL;
1051
1052 req.base.data_type = MXM_REQ_DATA_BUFFER;
1053 req.base.data.buffer.ptr = buf == NULL ? dummy_buf : buf;
1054 req.base.data.buffer.length = size == 0 ? sizeof(dummy_buf) : size;
1055 req.base.data.buffer.memh = NULL;
1056
1057 ret = mxm_req_recv(&req);
1058 if (MXM_OK != ret) {
1059 return OSHMEM_ERROR;
1060 }
1061 mca_spml_irkit_req_wait(&req.base);
1062 if (MXM_OK != req.base.error) {
1063 return OSHMEM_ERROR;
1064 }
1065 SPML_VERBOSE(100,
1066 "recvd from tag %d len %d",
1067 req.completion.sender_tag, (int)req.completion.actual_len);
1068
1069 return OSHMEM_SUCCESS;
1070 }
1071
1072
1073 int mca_spml_ikrit_send(void* buf,
1074 size_t size,
1075 int dst,
1076 mca_spml_base_put_mode_t mode)
1077 {
1078 mxm_send_req_t req;
1079 char dummy_buf[1];
1080
1081 SPML_VERBOSE(100,
1082 "sending %p size %d to %d, mode %d",
1083 buf, (int)size, dst, (int)mode);
1084 req.opcode = MXM_REQ_OP_SEND;
1085
1086 req.op.send.tag = oshmem_my_proc_id();
1087
1088 req.base.state = MXM_REQ_NEW;
1089 req.base.mq = mca_spml_ikrit.mxm_mq;
1090 req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn;
1091 req.flags = MXM_REQ_SEND_FLAG_BLOCKING;
1092 req.base.completed_cb = NULL;
1093
1094 req.base.data_type = MXM_REQ_DATA_BUFFER;
1095 req.base.data.buffer.ptr = buf == NULL ? dummy_buf : buf;
1096 req.base.data.buffer.length = size == 0 ? sizeof(dummy_buf) : size;
1097 req.base.data.buffer.memh = NULL;
1098
1099 SPML_IKRIT_MXM_POST_SEND(req);
1100
1101 mca_spml_irkit_req_wait(&req.base);
1102 if (req.base.error != MXM_OK) {
1103 return OSHMEM_ERROR;
1104 }
1105
1106 return OSHMEM_SUCCESS;
1107 }