1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2011 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2005 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2006 Sandia National Laboratories. All rights
14 * reserved.
15 * Copyright (c) 2011-2019 Cisco Systems, Inc. All rights reserved
16 * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
17 * reserved.
18 * $COPYRIGHT$
19 *
20 * Additional copyrights may follow
21 *
22 * $HEADER$
23 */
24 /**
25 * @file
26 */
27 #ifndef OPAL_BTL_USNIC_H
28 #define OPAL_BTL_USNIC_H
29
30 #include "opal_config.h"
31 #include <sys/types.h>
32
33 #include "opal_stdint.h"
34 #include "opal/util/alfg.h"
35 #include "opal/class/opal_hash_table.h"
36 #include "opal/class/opal_hash_table.h"
37 #include "opal/mca/event/event.h"
38
39 #include "opal/mca/btl/btl.h"
40 #include "opal/mca/btl/base/btl_base_error.h"
41 #include "opal/mca/btl/base/base.h"
42 #include "opal/mca/rcache/rcache.h"
43
44 #include "btl_usnic_compat.h"
45
46 BEGIN_C_DECLS
47
/*
 * We're simulating a clock as best we can without resorting to the
 * system.  The clock is used to defer ACKs, and ticks will be incremented
 * when progression gets called.  It could be incremented by different amounts
 * at other times as needed or as tuning dictates.
 *
 * This is only the declaration of the tick counter; the object itself
 * is defined elsewhere.
 */
extern uint64_t opal_btl_usnic_ticks;

/* Lock for MPI_THREAD_MULTIPLE support (declaration only; the mutex
   object is defined elsewhere) */
extern opal_recursive_mutex_t btl_usnic_lock;
58
59 static inline uint64_t
60 get_nsec(void)
61 {
62 return opal_btl_usnic_ticks;
63 }
64
/* RNG buffer declaration.  This is the state passed to opal_rand() by
   the FAKE_* fault-injection macros in this header; the object itself
   is defined elsewhere. */
extern opal_rng_buff_t opal_btl_usnic_rand_buff;
67
#ifndef container_of
/*
 * Given a pointer to the field `member` that is embedded inside an
 * object of type `type`, recover a pointer to the enclosing object by
 * stepping back offsetof(type, member) bytes.
 *
 * Only defined here when nothing else has already provided a
 * container_of macro.
 */
#define container_of(ptr, type, member) \
    ((type *) ((char *) (ptr) - offsetof(type, member)))
#endif
72
#ifndef max
/*
 * Larger of the two arguments (yields b when a is not greater than b).
 *
 * This is a function-like macro: the selected argument is evaluated a
 * second time, so do not pass expressions with side effects.  Only
 * defined here when nothing else has already provided max.
 */
#define max(a, b) ((a) > (b) ? (a) : (b))
#endif
76
/*
 * Compile-time debug output switches.
 *
 * MSGDEBUG1 prints more info about arguments and internal functions.
 * MSGDEBUG2 prints 1 line at each BTL entry point; it is switched on
 * whenever MSGDEBUG1 is, or on its own by changing the trailing 0 in
 * its definition to 1.
 */
#define MSGDEBUG1 0
#define MSGDEBUG2 (MSGDEBUG1 || 0)

/*
 * Output macros to declutter source: each one forwards its arguments
 * to opal_output(0, ...) when the matching switch is non-zero, and
 * otherwise expands to an empty statement (the arguments are then not
 * evaluated at all).
 */
#if !MSGDEBUG1
#    define MSGDEBUG1_OUT(...) do {} while (0)
#else
#    define MSGDEBUG1_OUT(...) opal_output(0, __VA_ARGS__)
#endif

#if !MSGDEBUG2
#    define MSGDEBUG2_OUT(...) do {} while (0)
#else
#    define MSGDEBUG2_OUT(...) opal_output(0, __VA_ARGS__)
#endif
93
/*
 * Compile-time fault-injection knobs.  Each WANT_* value is 0
 * (disabled) here, in which case the matching FAKE_* macro is the
 * constant 0.  When a WANT_* value is >0, the matching FAKE_* macro
 * compares a fresh opal_rand() draw from opal_btl_usnic_rand_buff
 * against that value.
 */

/* Set to >0 to randomly drop received frags.  The higher the number,
   the more frequent the drops. */
#define WANT_RECV_DROPS 0
#if WANT_RECV_DROPS <= 0
#    define FAKE_RECV_DROP 0
#else
#    define FAKE_RECV_DROP (opal_rand(&opal_btl_usnic_rand_buff) < WANT_RECV_DROPS)
#endif

/* Set to >0 to randomly fail to send an ACK, mimicking a lost ACK.
   The higher the number, the more frequent the failed-to-send-ACK. */
#define WANT_FAIL_TO_SEND_ACK 0
#if WANT_FAIL_TO_SEND_ACK <= 0
#    define FAKE_FAIL_TO_SEND_ACK 0
#else
#    define FAKE_FAIL_TO_SEND_ACK (opal_rand(&opal_btl_usnic_rand_buff) < WANT_FAIL_TO_SEND_ACK)
#endif

/* Set to >0 to randomly fail to resend a frag (causing it to be
   requeued to be sent later).  The higher the number, the more frequent
   the failed-to-resend-frag. */
#define WANT_FAIL_TO_RESEND_FRAG 0
#if WANT_FAIL_TO_RESEND_FRAG <= 0
#    define FAKE_FAIL_TO_RESEND_FRAG 0
#else
#    define FAKE_FAIL_TO_RESEND_FRAG (opal_rand(&opal_btl_usnic_rand_buff) < WANT_FAIL_TO_RESEND_FRAG)
#endif
122
123
/**
 * usnic BTL component
 *
 * Component-wide state and configuration values for the usnic BTL.
 * The fields are deliberately split into "fastpath" fields (placed
 * right after the base component for alignment reasons, see below) and
 * everything else; keep new fields on the correct side of the marker.
 */
typedef struct opal_btl_usnic_component_t {
    /** base BTL component */
    mca_btl_base_component_2_0_0_t super;

    /* in the v1.6 series, sizeof(super) is 256, leading to good alignment for
     * subsequent fastpath fields */

    /** Maximum number of BTL modules */
    int max_modules;
    /** Number of available/initialized BTL modules */
    int num_modules;

    /* Cached hashed version of my RTE proc name (to stuff in
       protocol headers) */
    uint64_t my_hashed_rte_name;

    /** array of possible BTLs (>= num_modules elements) */
    struct opal_btl_usnic_module_t* usnic_all_modules;
    /** array of pointers to active BTLs (num_modules elements) */
    struct opal_btl_usnic_module_t** usnic_active_modules;

    /** convertor packing threshold */
    int pack_lazy_threshold;

    /* vvvvvvvvvv non-fastpath fields go below vvvvvvvvvv */

    /** list of usnic proc structures */
    opal_list_t usnic_procs;

    /** memory pool hints */
    char* usnic_mpool_hints;

    /** registration cache name */
    char *usnic_rcache_name;

    /* NOTE(review): presumably the interface include / exclude filter
       strings; their format is not visible in this header -- confirm
       against the code that registers and parses them */
    char *if_include;
    char *if_exclude;

    /** Want stats? */
    bool stats_enabled;
    /* NOTE(review): presumably selects relative rather than absolute
       stats output -- confirm */
    bool stats_relative;
    /* NOTE(review): presumably how often stats are emitted; units are
       not visible in this header -- confirm */
    int stats_frequency;

    /** Whether we want to use NUMA distances to choose which usNIC
        devices to use for short messages */
    bool want_numa_device_assignment;

    /** max send descriptors to post per module */
    int32_t sd_num;

    /** max receive descriptors per module */
    int32_t rd_num;

    /** max send/receive descriptors for priority channel */
    int32_t prio_sd_num;
    int32_t prio_rd_num;

    /** max completion queue entries per module */
    int32_t cq_num;

    /** max number of entries in AV EQ */
    int32_t av_eq_num;

    /** retrans characteristics (units of the timeout are not visible
        in this header) */
    int retrans_timeout;

    /** transport header length for all usNIC devices on this server
        (it is guaranteed that all usNIC devices on a single server
        will have the same underlying transport, and therefore the
        same transport header length) */
    int transport_header_len;
    /* NOTE(review): presumably an identifier for that same underlying
       transport; the value space is not visible in this header --
       confirm */
    uint32_t transport_protocol;

    /* what UDP port do we want to use?  If 0, the system will pick.
       If nonzero, it is used as the base -- the final number will be
       (base+my_local_rank). */
    int udp_port_base;

    /** whether to show the "cannot find route" warnings (they can be
        turned off for network setups where this is known/acceptable).
        NOTE(review): polarity taken from the field name -- confirm */
    bool show_route_failures;

    /** connectivity verification: whether it is enabled, the ACK
        timeout, and the number of retries before issuing an
        error / aborting the job */
    bool connectivity_enabled;
    int connectivity_ack_timeout;
    int connectivity_num_retries;

    /** how many short packets have to be received before outputting
        the "received short packets" warning? */
    uint32_t max_short_packets;

    /* Prefix for the connectivity map filename (map will be output if
       the prefix is non-NULL) */
    char *connectivity_map_prefix;

    /** Offset into the send buffer where the payload will go.  For
        libfabric v1.0.0 / API v1.0, this is 0.  For libfabric >=v1.1
        / API >=v1.1, this is the endpoint.msg_prefix_size (i.e.,
        component.transport_header_len). */
    uint32_t prefix_send_offset;

    /* OPAL async progress event base */
    opal_event_base_t *opal_evbase;
} opal_btl_usnic_component_t;
232
/** Global usnic BTL component object (declaration only; the object
    itself is defined elsewhere) */
OPAL_MODULE_DECLSPEC extern opal_btl_usnic_component_t mca_btl_usnic_component;

/** usnic-local name for the base BTL receive registration type */
typedef mca_btl_base_recv_reg_t opal_btl_usnic_recv_reg_t;
236
/**
 * Size for sequence numbers (just to ensure we use the same size
 * everywhere).  UDSEQ is the matching printf conversion specifier,
 * for use as "%" UDSEQ.
 */
typedef uint16_t opal_btl_usnic_seq_t;
#define UDSEQ PRIu16

/*
 * Sequence number comparison macros that allow for rollover.
 *
 * SEQ_DIFF narrows (A - B) to a signed 16-bit value, so the result is
 * only meaningful while the two sequence numbers are relatively close
 * together as compared to (1<<15).  The int16_t used here must be kept
 * the same width as opal_btl_usnic_seq_t.
 *
 * The relational macros compare that signed difference against zero,
 * e.g. SEQ_LT(A, B) is true when A comes before B.
 */
#define SEQ_DIFF(A, B) ((int16_t) ((A) - (B)))

#define SEQ_LT(A, B) (SEQ_DIFF(A, B) < 0)
#define SEQ_LE(A, B) (SEQ_DIFF(A, B) <= 0)
#define SEQ_GT(A, B) (SEQ_DIFF(A, B) > 0)
#define SEQ_GE(A, B) (SEQ_DIFF(A, B) >= 0)
253
/**
 * Register the usnic BTL MCA params
 *
 * @return an int status code (the specific values are not visible in
 * this header)
 */
int opal_btl_usnic_component_register(void);

/**
 * Routine which can be called from a debugger to print module, endpoint,
 * fragment, and segment state to standard output.
 *
 * Takes no arguments and returns nothing.
 */
void opal_btl_usnic_component_debug(void);

/**
 * Called to output the connectivity map
 *
 * Takes no arguments and returns nothing.
 */
void opal_btl_usnic_connectivity_map(void);
268
269 END_C_DECLS
270 #endif