/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2007 Voltaire. All rights reserved.
 * Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2010-2015 Los Alamos National Security, LLC.
 *                         All rights reserved.
 * Copyright (c) 2012-2013 NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 */
#ifndef MCA_BTL_SMCUDA_H
#define MCA_BTL_SMCUDA_H

#include "opal_config.h"
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#ifdef HAVE_SCHED_H
#include <sched.h>
#endif /* HAVE_SCHED_H */

#include "opal/util/bit_ops.h"
#include "opal/class/opal_free_list.h"
#include "opal/mca/btl/btl.h"
#include "opal/mca/common/sm/common_sm.h"

BEGIN_C_DECLS

/*
 * Shared Memory FIFOs
 *
 * The FIFO is implemented as a circular queue with head and tail pointers
 * (integer indices).  For efficient wraparound indexing, the size of the
 * queue is constrained to be a power of two and we "&" indices with a "mask".
 *
 * More than one process can write to the FIFO head.  Therefore, there is a
 * head lock.  One cannot write until the head slot is empty, indicated by
 * the special queue entry SM_FIFO_FREE.
 *
 * Only the receiver can read the FIFO tail.  Therefore, the tail lock is
 * required only in multithreaded applications.  If a tail read returns the
 * SM_FIFO_FREE value, that means the FIFO is empty.  Once a non-FREE value
 * has been read, the queue slot is *not* automatically reset to SM_FIFO_FREE.
 * Rather, read tail slots are reset "lazily" (see "lazy_free" and
 * "num_to_clear") to reduce the number of memory barriers and improve
 * performance.
 *
 * Since the FIFO lives in shared memory that is mapped differently into
 * each address space, the "queue" pointer is relative (each process must
 * add its own offset) and the queue_recv pointer is meaningful only in the
 * receiver's address space.
 *
 * Since multiple processes access different parts of the FIFO structure in
 * different ways, we introduce padding to keep different parts on different
 * cachelines.
 */
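
/*
 * A worked illustration of the power-of-two masking (a sketch only, not
 * code the BTL runs): with a queue size of 8 the mask is 7 (0b111), so
 * advancing an index wraps around with a single AND instead of a modulo:
 *
 *     next = (index + 1) & mask;    // 7 -> 0, 3 -> 4, ...
 */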

#define SM_FIFO_FREE (void *) (-2)
/* We can't use opal_cache_line_size here because we need a
   compile-time constant for padding the struct.  We can't really have
   a compile-time constant that is portable, either (e.g., compile on
   one machine and run on another).  So just use a big enough cache
   line that should hopefully be good in most places. */
#define SM_CACHE_LINE_PAD 128

struct sm_fifo_t {
    /* This queue pointer is used only by the heads. */
    volatile void **queue;
    char pad0[SM_CACHE_LINE_PAD - sizeof(void **)];
    /* This lock is used by the heads. */
    opal_atomic_lock_t head_lock;
    char pad1[SM_CACHE_LINE_PAD - sizeof(opal_atomic_lock_t)];
    /* This index is used by the head holding the head lock. */
    volatile int head;
    char pad2[SM_CACHE_LINE_PAD - sizeof(int)];
    /* This mask is used "read only" by all processes. */
    unsigned int mask;
    char pad3[SM_CACHE_LINE_PAD - sizeof(int)];
    /* The following are used only by the tail. */
    volatile void **queue_recv;
    opal_atomic_lock_t tail_lock;
    volatile int tail;
    int num_to_clear;
    int lazy_free;
    char pad4[SM_CACHE_LINE_PAD - sizeof(void **) -
              sizeof(opal_atomic_lock_t) -
              sizeof(int) * 3];
};
typedef struct sm_fifo_t sm_fifo_t;

/*
 * Shared memory resource management
 */

#if OPAL_ENABLE_PROGRESS_THREADS == 1
#define DATA (char)0
#define DONE (char)1
#endif

typedef struct mca_btl_smcuda_mem_node_t {
    mca_mpool_base_module_t* sm_mpool; /**< shared memory pool */
} mca_btl_smcuda_mem_node_t;

/**
 * Shared Memory (SM) BTL component.
 */
struct mca_btl_smcuda_component_t {
    mca_btl_base_component_2_0_0_t super; /**< base BTL component */
    int sm_free_list_num;  /**< initial size of free lists */
    int sm_free_list_max;  /**< maximum size of free lists */
    int sm_free_list_inc;  /**< number of elements to alloc when growing free lists */
    int sm_max_procs;      /**< upper limit on the number of processes using the shared memory pool */
    int sm_extra_procs;    /**< number of extra procs to allow */
    char *sm_mpool_name;   /**< name of shared memory pool module */
    mca_mpool_base_module_t **sm_mpools; /**< shared memory pools (one for each memory node) */
    mca_mpool_base_module_t *sm_mpool;   /**< mpool on local node */
    void *sm_mpool_base;   /**< base address of shared memory pool */
    size_t eager_limit;    /**< first fragment size */
    size_t max_frag_size;  /**< maximum (second and beyond) fragment size */
    opal_mutex_t sm_lock;
    mca_common_sm_module_t *sm_seg; /**< description of shared memory segment */
    volatile sm_fifo_t **shm_fifo;  /**< pointer to fifo 2D array in shared memory */
    char **shm_bases;      /**< pointer to base pointers in shared memory */
    uint16_t *shm_mem_nodes; /**< pointer to mem nodes in shared memory */
    sm_fifo_t **fifo;      /**< cached copy of the pointer to the 2D fifo
                                array.  The address in the shared memory
                                segment sm_ctl_header is relative, but this
                                one, in process-private memory, is a real
                                virtual address */
    uint16_t *mem_nodes;   /**< cached copy of mem nodes of each local rank */
    unsigned int fifo_size;      /**< number of FIFO queue entries */
    unsigned int fifo_lazy_free; /**< number of reads before lazy fifo free is triggered */
    int nfifos;            /**< number of FIFOs per receiver */
    int32_t num_smp_procs; /**< current number of smp procs on this host */
    int32_t my_smp_rank;   /**< My SMP process rank.  Used for accessing
                            *   SMP-specific data structures. */
    opal_free_list_t sm_frags_eager; /**< free list of sm first (eager) fragments */
    opal_free_list_t sm_frags_max;   /**< free list of sm second (max-size) fragments */
    opal_free_list_t sm_frags_user;  /**< free list of sm user fragments */
    opal_free_list_t sm_first_frags_to_progress; /**< list of first
                                                      fragments that are
                                                      awaiting resources */
    struct mca_btl_base_endpoint_t **sm_peers; /**< endpoints of local peers, indexed by SMP rank */

    opal_free_list_t pending_send_fl;          /**< free list of pending-send items */
    opal_atomic_int32_t num_outstanding_frags; /**< number of fragments sent but not yet returned to free list */
    opal_atomic_int32_t num_pending_sends;     /**< total number on all of my pending-send queues */
    int mem_node;          /**< memory node of this process */
    int num_mem_nodes;     /**< number of memory nodes on this host */

#if OPAL_ENABLE_PROGRESS_THREADS == 1
    char sm_fifo_path[PATH_MAX]; /**< path to fifo used to signal this process */
    int sm_fifo_fd;              /**< file descriptor corresponding to opened fifo */
    opal_thread_t sm_fifo_thread;
#endif
    struct mca_btl_smcuda_t **sm_btls;     /**< array of sm btl modules */
    struct mca_btl_smcuda_frag_t **table;
    size_t sm_num_btls;    /**< number of active sm btl modules */
    size_t sm_max_btls;    /**< maximum number of sm btl modules */


    /** MCA: should we be using knem or not?  neg = try but continue if
        not available, 0 = don't try, 1 = try and fail if not available */
    int use_knem;

    /** MCA: minimal message size (bytes) to offload on DMA engine
        when using knem */
    unsigned int knem_dma_min;

    /** MCA: how many simultaneous ongoing knem operations to
        support */
    int knem_max_simultaneous;

    /** If we want DMA and DMA is supported, this will be loaded with
        KNEM_FLAG_DMA.  Otherwise, it'll be 0. */
    int knem_dma_flag;

    /** MCA: should we be using CMA or not?
        0 = no, 1 = yes */
    int use_cma;

    /* /// well-known file names for sm and sm mpool init /// */
    char *sm_mpool_ctl_file_name;
    char *sm_mpool_rndv_file_name;
    char *sm_ctl_file_name;
    char *sm_rndv_file_name;
#if OPAL_CUDA_SUPPORT
    int cuda_ipc_verbose;
    int cuda_ipc_output;
    int use_cuda_ipc;
    int use_cuda_ipc_same_gpu;
#endif /* OPAL_CUDA_SUPPORT */
    unsigned long mpool_min_size;
    char *allocator;
};
typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t;
OPAL_MODULE_DECLSPEC extern mca_btl_smcuda_component_t mca_btl_smcuda_component;

/**
 * SM BTL Interface
 */
struct mca_btl_smcuda_t {
    mca_btl_base_module_t super; /**< base BTL interface */
    bool btl_inited;             /**< flag indicating if btl has been inited */
    mca_btl_base_module_error_cb_fn_t error_cb;
    mca_rcache_base_module_t *rcache;
};
typedef struct mca_btl_smcuda_t mca_btl_smcuda_t;
OPAL_MODULE_DECLSPEC extern mca_btl_smcuda_t mca_btl_smcuda;

struct btl_smcuda_pending_send_item_t
{
    opal_free_list_item_t super;
    void *data;
};
typedef struct btl_smcuda_pending_send_item_t btl_smcuda_pending_send_item_t;

/***
 * FIFO support for sm BTL.
 */

/***
 * Some of the FIFO fields are pointers that must be accessed by
 * multiple processes.  Since the shared region may be mmapped
 * differently into each process's address space, these pointers are
 * stored relative to some base address.  Here, we define macros to
 * translate between relative addresses and virtual addresses.
 */
#define VIRTUAL2RELATIVE(VADDR ) ((long)(VADDR)  - (long)mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank])
#define RELATIVE2VIRTUAL(OFFSET) ((long)(OFFSET) + (long)mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank])
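
/*
 * Round-trip sketch (illustrative only; not used by the BTL): a pointer
 * valid in this process is converted to a segment-relative offset before
 * being stored in shared memory, and converted back after being read.
 * Within a single process the two macros are exact inverses.
 */
static inline void *sm_example_addr_roundtrip(void *vaddr)
{
    long rel = VIRTUAL2RELATIVE(vaddr);     /* what gets stored in shm */
    return (void *) RELATIVE2VIRTUAL(rel);  /* == vaddr in this process */
}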

static inline int sm_fifo_init(int fifo_size, mca_mpool_base_module_t *mpool,
                               sm_fifo_t *fifo, int lazy_free)
{
    int i, qsize;

    /* figure out the queue size (a power of two that is at least 1) */
    qsize = opal_next_poweroftwo_inclusive (fifo_size);

    /* allocate the queue in the receiver's address space */
    fifo->queue_recv = (volatile void **)mpool->mpool_alloc(
        mpool, sizeof(void *) * qsize, opal_cache_line_size, 0);
    if(NULL == fifo->queue_recv) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* initialize the queue */
    for ( i = 0; i < qsize; i++ )
        fifo->queue_recv[i] = SM_FIFO_FREE;

    /* shift queue address to be relative */
    fifo->queue = (volatile void **) VIRTUAL2RELATIVE(fifo->queue_recv);

    /* initialize the locks */
    opal_atomic_lock_init(&(fifo->head_lock), OPAL_ATOMIC_LOCK_UNLOCKED);
    opal_atomic_lock_init(&(fifo->tail_lock), OPAL_ATOMIC_LOCK_UNLOCKED);
    opal_atomic_unlock(&(fifo->head_lock)); /* should be unnecessary */
    opal_atomic_unlock(&(fifo->tail_lock)); /* should be unnecessary */

    /* other initializations */
    fifo->head = 0;
    fifo->mask = qsize - 1;
    fifo->tail = 0;
    fifo->num_to_clear = 0;
    fifo->lazy_free = lazy_free;

    return OPAL_SUCCESS;
}
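
/*
 * Worked example of the setup above (illustrative numbers, not component
 * defaults): fifo_size = 1000 rounds up to qsize = 1024, so mask = 1023
 * and every index update reduces to a single AND.  With lazy_free = 128,
 * the receiver rewrites SM_FIFO_FREE into consumed slots only once every
 * 128 reads, amortizing the cost of the write memory barrier.
 */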


static inline int sm_fifo_write(void *value, sm_fifo_t *fifo)
{
    volatile void **q = (volatile void **) RELATIVE2VIRTUAL(fifo->queue);

    /* if there is no free slot to write, report exhausted resource */
    opal_atomic_rmb();
    if ( SM_FIFO_FREE != q[fifo->head] )
        return OPAL_ERR_OUT_OF_RESOURCE;

    /* otherwise, write to the slot and advance the head index */
    q[fifo->head] = value;
    opal_atomic_wmb();
    fifo->head = (fifo->head + 1) & fifo->mask;
    return OPAL_SUCCESS;
}
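
/*
 * Sender-side usage sketch (illustrative only; real callers serialize on
 * fifo->head_lock and queue the fragment on a pending-send list instead
 * of retrying inline).  It shows the contract of sm_fifo_write:
 * OPAL_ERR_OUT_OF_RESOURCE means the head slot has not yet been lazily
 * freed by the receiver, so the write may be retried later.
 */
static inline int sm_fifo_write_example(void *value, sm_fifo_t *fifo,
                                        int max_tries)
{
    int rc;
    do {
        rc = sm_fifo_write(value, fifo);
    } while (OPAL_ERR_OUT_OF_RESOURCE == rc && --max_tries > 0);
    return rc; /* OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE if still full */
}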


static inline void *sm_fifo_read(sm_fifo_t *fifo)
{
    void *value;

    /* read the next queue entry */
    value = (void *) fifo->queue_recv[fifo->tail];

    opal_atomic_rmb();

    /* if we read a non-empty slot, advance the tail index */
    if ( SM_FIFO_FREE != value ) {

        fifo->tail = ( fifo->tail + 1 ) & fifo->mask;
        fifo->num_to_clear += 1;

        /* check if it's time to free slots, which we do lazily */
        if ( fifo->num_to_clear >= fifo->lazy_free ) {
            int i = (fifo->tail - fifo->num_to_clear ) & fifo->mask;

            while ( fifo->num_to_clear > 0 ) {
                fifo->queue_recv[i] = SM_FIFO_FREE;
                i = (i+1) & fifo->mask;
                fifo->num_to_clear -= 1;
            }
            opal_atomic_wmb();
        }
    }

    return value;
}
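
/*
 * Receiver-side usage sketch (illustrative only; not used by the BTL):
 * drain everything currently queued on a FIFO, handing each entry to a
 * caller-supplied function.  Assumes the caller holds fifo->tail_lock if
 * the application is multithreaded, per the notes at the top of this file.
 */
static inline int sm_fifo_drain_example(sm_fifo_t *fifo,
                                        void (*handler)(void *item))
{
    int n = 0;
    void *item;
    while (SM_FIFO_FREE != (item = sm_fifo_read(fifo))) {
        handler(item);
        n++;
    }
    return n; /* number of entries consumed */
}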

/**
 * Shared memory component progress.
 */
extern int mca_btl_smcuda_component_progress(void);


/**
 * Register a callback function that is called on error.
 *
 * @param btl (IN) BTL module
 * @return Status indicating if registration was successful
 */

int mca_btl_smcuda_register_error_cb(
    struct mca_btl_base_module_t* btl,
    mca_btl_base_module_error_cb_fn_t cbfunc
);

/**
 * Cleanup any resources held by the BTL.
 *
 * @param btl BTL instance.
 * @return OPAL_SUCCESS or error status on failure.
 */

extern int mca_btl_smcuda_finalize(
    struct mca_btl_base_module_t* btl
);


/**
 * PML->BTL notification of change in the process list.
 * Note that the shared memory segment may be mapped at a different
 * virtual address in the sending process than in the receiver, so
 * peer addressing information is established here.
 *
 * @param btl (IN)
 * @param proc (IN)
 * @param peer (OUT)
 * @return OPAL_SUCCESS or error status on failure.
 *
 */

extern int mca_btl_smcuda_add_procs(
    struct mca_btl_base_module_t* btl,
    size_t nprocs,
    struct opal_proc_t **procs,
    struct mca_btl_base_endpoint_t** peers,
    struct opal_bitmap_t* reachability
);


/**
 * PML->BTL notification of change in the process list.
 *
 * @param btl (IN)  BTL instance
 * @param proc (IN) Peer process
 * @param peer (IN) Peer addressing information.
 * @return Status indicating if removal was successful
 *
 */
extern int mca_btl_smcuda_del_procs(
    struct mca_btl_base_module_t* btl,
    size_t nprocs,
    struct opal_proc_t **procs,
    struct mca_btl_base_endpoint_t **peers
);


/**
 * Allocate a segment.
 *
 * @param btl (IN)  BTL module
 * @param size (IN) Requested segment size.
 */
extern mca_btl_base_descriptor_t* mca_btl_smcuda_alloc(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    uint8_t order,
    size_t size,
    uint32_t flags
);

/**
 * Return a segment allocated by this BTL.
 *
 * @param btl (IN)     BTL module
 * @param segment (IN) Allocated segment.
 */
extern int mca_btl_smcuda_free(
    struct mca_btl_base_module_t* btl,
    mca_btl_base_descriptor_t* segment
);


/**
 * Pack data.
 *
 * @param btl (IN)  BTL module
 * @param peer (IN) BTL peer addressing
 */
struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    struct opal_convertor_t* convertor,
    uint8_t order,
    size_t reserve,
    size_t* size,
    uint32_t flags
);


/**
 * Initiate an inlined send to the peer or return a descriptor.
 *
 * @param btl (IN)  BTL module
 * @param peer (IN) BTL peer addressing
 */
extern int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl,
                                 struct mca_btl_base_endpoint_t* endpoint,
                                 struct opal_convertor_t* convertor,
                                 void* header,
                                 size_t header_size,
                                 size_t payload_size,
                                 uint8_t order,
                                 uint32_t flags,
                                 mca_btl_base_tag_t tag,
                                 mca_btl_base_descriptor_t** descriptor );

/**
 * Initiate a send to the peer.
 *
 * @param btl (IN)  BTL module
 * @param peer (IN) BTL peer addressing
 */
extern int mca_btl_smcuda_send(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    struct mca_btl_base_descriptor_t* descriptor,
    mca_btl_base_tag_t tag
);

#if OPAL_CUDA_SUPPORT
/**
 * Remote get using device memory.
 */
int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl,
    struct mca_btl_base_endpoint_t *ep, void *local_address,
    uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
    struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
    int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);

/* CUDA IPC control message tags */
enum ipcCtrlMsg {
    IPC_REQ = 10,
    IPC_ACK,
    IPC_NOTREADY,
};

/* CUDA IPC control message */
typedef struct ctrlhdr_st {
    enum ipcCtrlMsg ctag;
    int cudev;
} ctrlhdr_t;

/* State of setting up CUDA IPC on an endpoint */
enum ipcState {
    IPC_INIT = 1,
    IPC_SENT,
    IPC_ACKING,
    IPC_ACKED,
    IPC_OK,
    IPC_BAD
};
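
/*
 * Rough shape of the CUDA IPC setup handshake, inferred from the tags
 * and states above (a sketch only; the authoritative sequence lives in
 * the smcuda component and endpoint sources):
 *
 *   requester                           peer
 *   ---------                           ----
 *   IPC_INIT: send IPC_REQ with cudev
 *   -> IPC_SENT                         receives IPC_REQ, checks devices,
 *                                       replies IPC_ACK (or IPC_NOTREADY);
 *                                       IPC_ACKING/IPC_ACKED track its
 *                                       side of the exchange
 *   receives reply
 *   -> IPC_OK on ACK, IPC_BAD otherwise
 */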

#endif /* OPAL_CUDA_SUPPORT */


extern void mca_btl_smcuda_dump(struct mca_btl_base_module_t* btl,
                                struct mca_btl_base_endpoint_t* endpoint,
                                int verbose);

/**
 * Fault Tolerance Event Notification Function
 * @param state Checkpoint State
 * @return OPAL_SUCCESS or failure status
 */
int mca_btl_smcuda_ft_event(int state);

#if OPAL_ENABLE_PROGRESS_THREADS == 1
void mca_btl_smcuda_component_event_thread(opal_object_t*);
#endif

#if OPAL_ENABLE_PROGRESS_THREADS == 1
#define MCA_BTL_SMCUDA_SIGNAL_PEER(peer) \
{ \
    unsigned char cmd = DATA; \
    if(write(peer->fifo_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) { \
        opal_output(0, "mca_btl_smcuda_send: write fifo failed: errno=%d\n", errno); \
    } \
}
#else
#define MCA_BTL_SMCUDA_SIGNAL_PEER(peer)
#endif

END_C_DECLS

#endif /* MCA_BTL_SMCUDA_H */