This source file includes the following definitions.
- mca_btl_ugni_get_stat
- mca_btl_ugni_notify_stat
- btl_ugni_component_register
- btl_ugni_component_open
- btl_ugni_component_close
- mca_btl_ugni_component_init
- mca_btl_ugni_progress_datagram
- mca_btl_ugni_handle_rdma_completions
- mca_btl_ugni_progress_rdma
- mca_btl_ugni_progress_wait_list
- mca_btl_ugni_component_progress
- mca_btl_ugni_flush
- btl_ugni_dump_post_desc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 #include "btl_ugni.h"
16 #include "btl_ugni_frag.h"
17 #include "btl_ugni_rdma.h"
18 #include "btl_ugni_smsg.h"
19
20 #include "opal/util/sys_limits.h"
21 #include "opal/util/printf.h"
22
23 #include <stdlib.h>
24 #include <fcntl.h>
25 #include <ctype.h>
26
27 #include "opal/memoryhooks/memory.h"
28 #include "opal/runtime/opal_params.h"
29
30 #include "opal/mca/base/mca_base_pvar.h"
31
32 static int btl_ugni_component_register(void);
33 static int btl_ugni_component_open(void);
34 static int btl_ugni_component_close(void);
35 static mca_btl_base_module_t **mca_btl_ugni_component_init(int *, bool, bool);
36 static int mca_btl_ugni_component_progress(void);
37 static unsigned long mca_btl_ugni_ugni_page_size = 0;
38
39 mca_btl_ugni_component_t mca_btl_ugni_component = {
40 .super = {
41
42
43 .btl_version = {
44 MCA_BTL_DEFAULT_VERSION("ugni"),
45 .mca_open_component = btl_ugni_component_open,
46 .mca_close_component = btl_ugni_component_close,
47 .mca_register_component_params = btl_ugni_component_register,
48 },
49 .btl_data = {
50 .param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
51 },
52 .btl_init = mca_btl_ugni_component_init,
53 .btl_progress = mca_btl_ugni_component_progress,
54 }
55 };
56
57 mca_base_var_enum_value_t rcache_values[] = {
58 {MCA_BTL_UGNI_RCACHE_UDREG, "udreg"},
59 {MCA_BTL_UGNI_RCACHE_GRDMA, "grdma"},
60 {-1, NULL}
61 };
62
63 mca_base_var_enum_value_flag_t cdm_flags[] = {
64 {.flag = GNI_CDM_MODE_FORK_NOCOPY, .string = "fork-no-copy", .conflicting_flag = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_FORK_PARTCOPY},
65 {.flag = GNI_CDM_MODE_FORK_FULLCOPY, .string = "fork-full-copy", .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_PARTCOPY},
66 {.flag = GNI_CDM_MODE_FORK_PARTCOPY, .string = "fork-part-copy", .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_FULLCOPY},
67 {.flag = GNI_CDM_MODE_ERR_NO_KILL, .string = "err-no-kill", .conflicting_flag = GNI_CDM_MODE_ERR_ALL_KILL},
68 {.flag = GNI_CDM_MODE_ERR_ALL_KILL, .string = "err-all-kill", .conflicting_flag = GNI_CDM_MODE_ERR_NO_KILL},
69 {.flag = GNI_CDM_MODE_FAST_DATAGRAM_POLL, .string = "fast-datagram-poll", .conflicting_flag = 0},
70 {.flag = GNI_CDM_MODE_BTE_SINGLE_CHANNEL, .string = "bte-single-channel", .conflicting_flag = 0},
71 {.flag = GNI_CDM_MODE_USE_PCI_IOMMU, .string = "use-pci-iommu", .conflicting_flag = 0},
72 {.flag = GNI_CDM_MODE_MDD_DEDICATED, .string = "mdd-dedicated", .conflicting_flag = GNI_CDM_MODE_MDD_SHARED},
73 {.flag = GNI_CDM_MODE_MDD_SHARED, .string = "mdd-shared", .conflicting_flag = GNI_CDM_MODE_MDD_DEDICATED},
74 {.flag = GNI_CDM_MODE_FMA_DEDICATED, .string = "fma-dedicated", .conflicting_flag = GNI_CDM_MODE_FMA_SHARED},
75 {.flag = GNI_CDM_MODE_FMA_SHARED, .string = "fma-shared", .conflicting_flag = GNI_CDM_MODE_FMA_DEDICATED},
76 {.flag = GNI_CDM_MODE_CACHED_AMO_ENABLED, .string = "cached-amo-enabled", .conflicting_flag = 0},
77 {.flag = GNI_CDM_MODE_CQ_NIC_LOCAL_PLACEMENT, .string = "cq-nic-placement", .conflicting_flag = 0},
78 {.flag = GNI_CDM_MODE_FMA_SMALL_WINDOW, .string = "fma-small-window", .conflicting_flag = 0},
79 {.string = NULL}
80 };
81
82 static inline int mca_btl_ugni_get_stat (const mca_base_pvar_t *pvar, void *value, void *obj)
83 {
84 gni_statistic_t statistic = (gni_statistic_t) (intptr_t) pvar->ctx;
85 gni_return_t rc = GNI_RC_SUCCESS;
86
87 for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
88 rc = GNI_GetNicStat (mca_btl_ugni_component.modules[0].devices[i].dev_handle, statistic,
89 ((unsigned int *) value) + i);
90 }
91
92 return mca_btl_rc_ugni_to_opal (rc);
93 }
94
95 static inline int mca_btl_ugni_notify_stat (mca_base_pvar_t *pvar, mca_base_pvar_event_t event, void *obj, int *count)
96 {
97 if (MCA_BASE_PVAR_HANDLE_BIND == event) {
98
99 *count = mca_btl_ugni_component.virtual_device_count;
100 }
101
102 return OPAL_SUCCESS;
103 }
104
105 static int btl_ugni_component_register(void)
106 {
107 mca_base_var_enum_t *new_enum;
108 gni_nic_device_t device_type;
109 char *mpool_hints_tmp = NULL;
110 int rc;
111
112 (void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version,
113 "uGNI byte transport layer");
114
115 mca_btl_ugni_component.ugni_free_list_num = 8;
116 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
117 "free_list_num", NULL, MCA_BASE_VAR_TYPE_INT,
118 NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
119 OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
120 &mca_btl_ugni_component.ugni_free_list_num);
121 mca_btl_ugni_component.ugni_free_list_max = 4096;
122 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
123 "free_list_max", NULL, MCA_BASE_VAR_TYPE_INT,
124 NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
125 OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
126 &mca_btl_ugni_component.ugni_free_list_max);
127 mca_btl_ugni_component.ugni_free_list_inc = 64;
128 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
129 "free_list_inc", NULL, MCA_BASE_VAR_TYPE_INT,
130 NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
131 OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
132 &mca_btl_ugni_component.ugni_free_list_inc);
133
134 mca_btl_ugni_component.ugni_eager_num = 16;
135 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
136 "eager_num", NULL, MCA_BASE_VAR_TYPE_INT,
137 NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
138 OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
139 &mca_btl_ugni_component.ugni_eager_num);
140 mca_btl_ugni_component.ugni_eager_max = 128;
141 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
142 "eager_max", NULL, MCA_BASE_VAR_TYPE_INT,
143 NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
144 OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
145 &mca_btl_ugni_component.ugni_eager_max);
146 mca_btl_ugni_component.ugni_eager_inc = 16;
147 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
148 "eager_inc", NULL, MCA_BASE_VAR_TYPE_INT,
149 NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
150 OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
151 &mca_btl_ugni_component.ugni_eager_inc);
152
153 mca_btl_ugni_component.remote_cq_size = 40000;
154 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
155 "remote_cq_size", "Remote SMSG completion queue "
156 "size (default 40000)", MCA_BASE_VAR_TYPE_INT,
157 NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
158 OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
159 &mca_btl_ugni_component.remote_cq_size);
160
161 mca_btl_ugni_component.local_cq_size = 8192;
162 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
163 "local_cq_size", "Local SMSG completion queue size "
164 "(default 8k)", MCA_BASE_VAR_TYPE_INT,
165 NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
166 OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
167 &mca_btl_ugni_component.local_cq_size);
168
169 mca_btl_ugni_component.local_rdma_cq_size = 1024;
170 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
171 "local_rdma_cq_size", "Local FMA/RDMA completion queue size "
172 "(default: 1024)",MCA_BASE_VAR_TYPE_INT, NULL, 0,
173 MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
174 MCA_BASE_VAR_SCOPE_LOCAL,
175 &mca_btl_ugni_component.local_rdma_cq_size);
176
177 mca_btl_ugni_component.ugni_smsg_limit = 0;
178 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
179 "smsg_limit", "Maximum size message that "
180 "will be sent using the SMSG/MSGQ protocol "
181 "(0 - autoselect(default), 16k max)",
182 MCA_BASE_VAR_TYPE_INT, NULL, 0,
183 MCA_BASE_VAR_FLAG_SETTABLE,
184 OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
185 &mca_btl_ugni_component.ugni_smsg_limit);
186
187 mca_btl_ugni_component.smsg_max_credits = 32;
188 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
189 "smsg_max_credits", "Maximum number of "
190 "outstanding SMSG/MSGQ message (default 32)",
191 MCA_BASE_VAR_TYPE_INT, NULL, 0,
192 MCA_BASE_VAR_FLAG_SETTABLE,
193 OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
194 &mca_btl_ugni_component.smsg_max_credits);
195
196 #if OPAL_C_HAVE__THREAD_LOCAL
197 mca_btl_ugni_component.bind_threads_to_devices = true;
198
199 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
200 "bind_devices", "Bind threads to virtual "
201 "devices. In general this should improve "
202 "RDMA performance (default: true)",
203 MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
204 MCA_BASE_VAR_FLAG_SETTABLE,
205 OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
206 &mca_btl_ugni_component.bind_threads_to_devices);
207 #endif
208
209 mca_btl_ugni_component.ugni_fma_limit = -1;
210 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
211 "fma_limit", "Default maximum size message that "
212 "will be sent using the FMA (Fast Memory "
213 "Access) protocol (default: -1 (don't use), 64k max)",
214 MCA_BASE_VAR_TYPE_LONG, NULL, 0,
215 MCA_BASE_VAR_FLAG_SETTABLE | MCA_BASE_VAR_FLAG_DEPRECATED,
216 OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
217 &mca_btl_ugni_component.ugni_fma_limit);
218
219 mca_btl_ugni_component.ugni_fma_get_limit = 2048;
220 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
221 "fma_get_limit", "Maximum size message that "
222 "will be sent using the FMA (Fast Memory "
223 "Access) protocol for get (default 2k, "
224 "64k max)",
225 MCA_BASE_VAR_TYPE_LONG, NULL, 0,
226 MCA_BASE_VAR_FLAG_SETTABLE,
227 OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
228 &mca_btl_ugni_component.ugni_fma_get_limit);
229
230 mca_btl_ugni_component.ugni_fma_put_limit = 4096;
231 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
232 "fma_put_limit", "Maximum size message that "
233 "will be sent using the FMA (Fast Memory "
234 "Access) protocol for put (default: 4k, "
235 "64k max)",
236 MCA_BASE_VAR_TYPE_LONG, NULL, 0,
237 MCA_BASE_VAR_FLAG_SETTABLE,
238 OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
239 &mca_btl_ugni_component.ugni_fma_put_limit);
240
241 mca_btl_ugni_component.rdma_max_retries = 16;
242 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
243 "rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
244 NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
245 OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
246 &mca_btl_ugni_component.rdma_max_retries);
247
248 mca_btl_ugni_component.smsg_max_retries = 16;
249 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
250 "smsg_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
251 NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
252 OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
253 &mca_btl_ugni_component.smsg_max_retries);
254
255 mca_btl_ugni_component.max_mem_reg = 0;
256 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
257 "max_mem_reg", "Maximum number of "
258 "memory registrations a process can "
259 "hold (0 - autoselect, -1 - unlimited)"
260 " (default 0)", MCA_BASE_VAR_TYPE_INT,
261 NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
262 OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
263 &mca_btl_ugni_component.max_mem_reg);
264
265 mca_btl_ugni_component.mbox_increment = 0;
266 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
267 "mbox_inc", "Number of SMSG mailboxes to "
268 "allocate in each block (0 - autoselect(default))",
269 MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
270 MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
271 MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mbox_increment);
272
273
274 rc = mca_base_var_enum_create_flag ("btl_ugni_cdm_flags", cdm_flags, (mca_base_var_enum_flag_t **) &new_enum);
275 if (OPAL_SUCCESS != rc) {
276 return rc;
277 }
278
279 mca_btl_ugni_component.cdm_flags = GNI_CDM_MODE_FORK_PARTCOPY | GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL |
280 GNI_CDM_MODE_MDD_SHARED | GNI_CDM_MODE_FMA_SHARED | GNI_CDM_MODE_FMA_SMALL_WINDOW;
281 mca_btl_ugni_component.cdm_flags_id = mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
282 "cdm_flags", "Flags to set when creating a communication domain "
283 " (default: fork-full-copy,cached-amo-enabled,err-no-kill,fast-datagram-poll,"
284 "fma-shared,fma-small-window)",
285 MCA_BASE_VAR_TYPE_UNSIGNED_INT, new_enum, 0,
286 MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
287 MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.cdm_flags);
288 OBJ_RELEASE(new_enum);
289
290 mca_btl_ugni_component.virtual_device_count = 0;
291 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
292 "virtual_device_count", "Number of virtual devices to create. Higher numbers may "
293 "result in better performance when using threads. (default: 0 (auto), max: 128)",
294 MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
295 MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
296 MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.virtual_device_count);
297
298
299 GNI_GetDeviceType (&device_type);
300
301
302 mca_btl_ugni_component.smsg_page_size = 2 << 20;
303 if (GNI_DEVICE_GEMINI == device_type) {
304 if (access ("/sys/class/gemini/ghal0/mrt", R_OK)) {
305 int fd = open ("/sys/class/gemini/ghal0/mrt", O_RDONLY);
306 char buffer[10];
307
308 if (0 <= fd) {
309 memset (buffer, 0, sizeof (buffer));
310 read (fd, buffer, sizeof (buffer) - 1);
311 close (fd);
312 mca_btl_ugni_ugni_page_size = strtol (buffer, NULL, 10) * 1024;
313 mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size;
314 }
315 }
316 }
317
318 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
319 "smsg_page_size", "Page size to use for SMSG mailbox allocation (default: detect)",
320 MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
321 MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.smsg_page_size);
322
323 mca_btl_ugni_component.progress_thread_requested = 0;
324 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
325 "request_progress_thread",
326 "Enable to request ugni btl progress thread - requires MPI_THREAD_MULTIPLE support",
327 MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
328 MCA_BASE_VAR_FLAG_SETTABLE,
329 OPAL_INFO_LVL_3,
330 MCA_BASE_VAR_SCOPE_LOCAL,
331 &mca_btl_ugni_component.progress_thread_requested);
332
333
334 mca_btl_ugni_progress_thread_wakeups = 0;
335 (void) mca_base_component_pvar_register(&mca_btl_ugni_component.super.btl_version,
336 "progress_thread_wakeups", "Number of times the progress thread "
337 "has been woken", OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_COUNTER,
338 MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
339 MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, NULL,
340 NULL, NULL, &mca_btl_ugni_progress_thread_wakeups);
341
342
343 for (int i = 0 ; i < GNI_NUM_STATS ; ++i) {
344 char name[128], desc[128];
345 size_t str_len = strlen (gni_statistic_str[i]);
346
347 assert (str_len < sizeof (name));
348
349
350
351 for (size_t j = 0 ; j < str_len ; ++j) {
352 name[j] = tolower (gni_statistic_str[i][j]);
353 desc[j] = ('_' == name[j]) ? ' ' : name[j];
354 }
355
356 name[str_len] = '\0';
357 desc[str_len] = '\0';
358
359 (void) mca_base_component_pvar_register (&mca_btl_ugni_component.super.btl_version, name, desc,
360 OPAL_INFO_LVL_4, MCA_BASE_PVAR_CLASS_COUNTER,
361 MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
362 MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
363 mca_btl_ugni_get_stat, NULL, mca_btl_ugni_notify_stat,
364 (void *) (intptr_t) i);
365 }
366
367
368
369 rc = mca_base_var_enum_create ("btl_ugni_rcache", rcache_values, &new_enum);
370 if (OPAL_SUCCESS != rc) {
371 return rc;
372 }
373
374
375 mca_btl_ugni_component.rcache_type = MCA_BTL_UGNI_RCACHE_GRDMA;
376 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
377 "rcache", "registration cache to use (default: grdma)", MCA_BASE_VAR_TYPE_INT, new_enum,
378 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
379 MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.rcache_type);
380 OBJ_RELEASE(new_enum);
381
382 if (mca_btl_ugni_ugni_page_size) {
383 rc = opal_asprintf (&mpool_hints_tmp, "page_size=%lu", mca_btl_ugni_ugni_page_size);
384 if (rc < 0) {
385 return OPAL_ERR_OUT_OF_RESOURCE;
386 }
387
388 mca_btl_ugni_component.mpool_hints = mpool_hints_tmp;
389 } else {
390 mca_btl_ugni_component.mpool_hints = "page_size=2M";
391 }
392
393 (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
394 "mpool_hints", "hints to use when selecting a memory pool (default: "
395 "\"page_size=2M\")", MCA_BASE_VAR_TYPE_STRING, NULL, 0,
396 MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
397 MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mpool_hints);
398 free (mpool_hints_tmp);
399
400
401 mca_btl_ugni_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 2;
402
403
404 mca_btl_ugni_module.super.btl_eager_limit = 8 * 1024;
405 mca_btl_ugni_module.super.btl_rndv_eager_limit = 8 * 1024;
406 mca_btl_ugni_module.super.btl_rdma_pipeline_frag_size = 4 * 1024 * 1024;
407 mca_btl_ugni_module.super.btl_max_send_size = 8 * 1024;
408 mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024;
409
410 mca_btl_ugni_module.super.btl_get_limit = 1 * 1024 * 1024;
411
412
413
414
415
416 mca_btl_ugni_module.super.btl_get_alignment = 4;
417
418
419 mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 8 * 1024;
420
421 mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
422 MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_ATOMIC_OPS |
423 MCA_BTL_FLAGS_ATOMIC_FOPS;
424 mca_btl_ugni_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
425 MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR |
426 MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
427
428 if (GNI_DEVICE_ARIES == device_type) {
429
430 mca_btl_ugni_module.super.btl_atomic_flags |= MCA_BTL_ATOMIC_SUPPORTS_MIN | MCA_BTL_ATOMIC_SUPPORTS_MAX |
431 MCA_BTL_ATOMIC_SUPPORTS_LAND | MCA_BTL_ATOMIC_SUPPORTS_LOR | MCA_BTL_ATOMIC_SUPPORTS_LXOR |
432 MCA_BTL_ATOMIC_SUPPORTS_32BIT | MCA_BTL_ATOMIC_SUPPORTS_FLOAT;
433 }
434
435 mca_btl_ugni_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
436
437 mca_btl_ugni_module.super.btl_bandwidth = 40000;
438 mca_btl_ugni_module.super.btl_latency = 2;
439
440 mca_btl_ugni_module.super.btl_get_local_registration_threshold = 0;
441 mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_put_limit;
442
443
444 mca_btl_base_param_register(&mca_btl_ugni_component.super.btl_version,
445 &mca_btl_ugni_module.super);
446
447 return OPAL_SUCCESS;
448 }
449
450 static int
451 btl_ugni_component_open(void)
452 {
453 mca_btl_ugni_component.ugni_num_btls = 0;
454 mca_btl_ugni_component.modules = NULL;
455
456 return OPAL_SUCCESS;
457 }
458
459
460
461
462 static int
463 btl_ugni_component_close(void)
464 {
465 mca_btl_ugni_fini ();
466
467 free (mca_btl_ugni_component.modules);
468 mca_btl_ugni_component.modules = NULL;
469
470 return OPAL_SUCCESS;
471 }
472
473 static mca_btl_base_module_t **
474 mca_btl_ugni_component_init (int *num_btl_modules,
475 bool enable_progress_threads,
476 bool enable_mpi_threads)
477 {
478 struct mca_btl_base_module_t **base_modules;
479 mca_btl_ugni_module_t *ugni_modules;
480 int rc;
481
482 if (16384 < mca_btl_ugni_component.ugni_smsg_limit) {
483 mca_btl_ugni_component.ugni_smsg_limit = 16384;
484 }
485
486 if (65536 < mca_btl_ugni_component.ugni_fma_limit) {
487 mca_btl_ugni_component.ugni_fma_limit = 65536;
488 }
489
490 if (-1 != mca_btl_ugni_component.ugni_fma_limit) {
491 mca_btl_ugni_component.ugni_fma_get_limit = mca_btl_ugni_component.ugni_fma_limit;
492 } else if (65536 < mca_btl_ugni_component.ugni_fma_get_limit) {
493 mca_btl_ugni_component.ugni_fma_get_limit = 65536;
494 }
495
496 if (-1 != mca_btl_ugni_component.ugni_fma_limit) {
497 mca_btl_ugni_component.ugni_fma_put_limit = mca_btl_ugni_component.ugni_fma_limit;
498 } else if (65536 < mca_btl_ugni_component.ugni_fma_put_limit) {
499 mca_btl_ugni_component.ugni_fma_put_limit = 65536;
500 }
501
502 mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_put_limit;
503
504
505 mca_btl_ugni_component.active_rdma_threshold = mca_btl_ugni_component.local_rdma_cq_size;
506
507 if (enable_mpi_threads && mca_btl_ugni_component.progress_thread_requested) {
508 mca_btl_ugni_component.progress_thread_enabled = 1;
509 }
510
511
512 rc = mca_btl_ugni_init();
513 if (OPAL_SUCCESS != rc) {
514 return NULL;
515 }
516
517
518 mca_btl_ugni_component.ugni_num_btls = 1;
519
520 BTL_VERBOSE(("btl/ugni initializing"));
521
522 ugni_modules = mca_btl_ugni_component.modules = (mca_btl_ugni_module_t *)
523 calloc (mca_btl_ugni_component.ugni_num_btls, sizeof (mca_btl_ugni_module_t));
524
525 if (OPAL_UNLIKELY(NULL == mca_btl_ugni_component.modules)) {
526 BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
527 return NULL;
528 }
529
530 base_modules = (struct mca_btl_base_module_t **)
531 calloc (mca_btl_ugni_component.ugni_num_btls,
532 sizeof (struct mca_btl_base_module_t *));
533 if (OPAL_UNLIKELY(NULL == base_modules)) {
534 BTL_ERROR(("Malloc failed : %s:%d", __FILE__, __LINE__));
535 return NULL;
536 }
537
538 if (mca_btl_ugni_component.smsg_page_size != (unsigned long) opal_getpagesize ()) {
539 if (mca_btl_ugni_ugni_page_size > mca_btl_ugni_component.smsg_page_size) {
540 mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size;
541 }
542 }
543
544 mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = mca_btl_ugni_module.super.btl_eager_limit;
545
546 rc = mca_btl_ugni_module_init (ugni_modules);
547 if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
548 BTL_ERROR(("Failed to initialize uGNI module @ %s:%d", __FILE__,
549 __LINE__));
550 return NULL;
551 }
552
553 *base_modules = (mca_btl_base_module_t *) ugni_modules;
554
555 *num_btl_modules = mca_btl_ugni_component.ugni_num_btls;
556
557 BTL_VERBOSE(("btl/ugni done initializing %d module(s)", *num_btl_modules));
558
559 return base_modules;
560 }
561
562 int mca_btl_ugni_progress_datagram (mca_btl_ugni_device_t *device)
563 {
564 mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
565 mca_btl_base_endpoint_t *ep = NULL;
566 gni_ep_handle_t handle;
567 int count = 0, rc;
568
569 rc = mca_btl_ugni_get_datagram (ugni_module, device, &handle, &ep);
570 if (1 != rc) {
571 return rc;
572 }
573
574 BTL_VERBOSE(("remote datagram completion on handle %p", (void*)handle));
575
576
577 if (handle == ugni_module->wildcard_ep) {
578 struct opal_proc_t *remote_proc = opal_proc_for_name (ugni_module->wc_remote_attr.proc_name);
579
580 BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc: %s",
581 OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name)));
582
583 ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc);
584 if (OPAL_UNLIKELY(NULL == ep)) {
585
586 BTL_ERROR(("could not find/allocate a btl endpoint for peer %s",
587 OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name)));
588 abort ();
589 return OPAL_ERR_NOT_FOUND;
590 }
591 }
592
593
594 assert (NULL != ep);
595
596 BTL_VERBOSE(("got a datagram completion: ep = %p. wc = %d", (void *) ep, handle == ugni_module->wildcard_ep));
597
598
599 opal_mutex_lock (&ep->lock);
600 if (handle != ugni_module->wildcard_ep) {
601
602 BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));
603
604 ep->dg_posted = false;
605 (void) opal_atomic_add_fetch_32 (&ugni_module->active_datagrams, -1);
606 }
607
608 (void) mca_btl_ugni_ep_connect_progress (ep);
609 opal_mutex_unlock (&ep->lock);
610
611 if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
612
613 count = mca_btl_ugni_smsg_process (ep);
614 }
615
616
617 if (handle == ugni_module->wildcard_ep) {
618 mca_btl_ugni_wildcard_ep_post (ugni_module);
619 }
620
621 return count;
622 }
623
624 void mca_btl_ugni_handle_rdma_completions (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
625 struct mca_btl_ugni_post_descriptor_t *post_desc, const int count)
626 {
627 int bte_complete = 0;
628
629 for (int i = 0 ; i < count ; ++i) {
630 BTL_VERBOSE(("post descriptor complete. status: %d", post_desc[i].rc));
631
632 if (OPAL_UNLIKELY(OPAL_SUCCESS != post_desc[i].rc)) {
633
634 btl_ugni_dump_post_desc (post_desc + i);
635 }
636
637 bte_complete += post_desc[i].use_bte == true;
638
639 mca_btl_ugni_post_desc_complete (ugni_module, post_desc + i, post_desc[i].rc);
640 }
641
642 if (bte_complete > 0) {
643 (void) OPAL_THREAD_FETCH_ADD32 (&ugni_module->active_rdma_count, -bte_complete);
644 }
645 }
646
647 static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
648 mca_btl_ugni_cq_t *cq)
649 {
650 mca_btl_ugni_post_descriptor_t post_desc[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
651 int rc;
652
653 rc = mca_btl_ugni_cq_get_completed_desc (device, cq, post_desc, MCA_BTL_UGNI_COMPLETIONS_PER_LOOP);
654 if (0 >= rc) {
655 return rc;
656 }
657
658 BTL_VERBOSE(("got %d completed rdma descriptors", rc));
659
660 mca_btl_ugni_handle_rdma_completions (ugni_module, device, post_desc, rc);
661
662 return rc;
663 }
664
665 static inline int
666 mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
667 {
668 int rc = OPAL_SUCCESS;
669 mca_btl_base_endpoint_t *endpoint = NULL;
670 int count;
671
672 if (0 == opal_list_get_size(&ugni_module->ep_wait_list)) {
673 return 0;
674 }
675
676
677 count = opal_list_get_size(&ugni_module->ep_wait_list);
678 if (0 == count) {
679 return 0;
680 }
681
682 OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
683 count = opal_list_get_size(&ugni_module->ep_wait_list);
684 do {
685 endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);
686 if (endpoint != NULL) {
687 rc = mca_btl_ugni_progress_send_wait_list (endpoint);
688
689 if (OPAL_SUCCESS != rc) {
690 opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
691 } else {
692 endpoint->wait_listed = false;
693 }
694 }
695 } while (endpoint != NULL && --count > 0) ;
696 OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
697
698 return rc;
699 }
700
701 static int mca_btl_ugni_component_progress (void)
702 {
703 mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
704 int count = 0;
705
706 count += mca_btl_ugni_progress_remote_smsg (ugni_module);
707
708 if (ugni_module->active_datagrams) {
709 count += mca_btl_ugni_progress_datagram (ugni_module->devices);
710 }
711
712 for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
713 mca_btl_ugni_device_t *device = ugni_module->devices + i;
714
715 if (device->smsg_connections) {
716 count += mca_btl_ugni_progress_local_smsg (ugni_module, device);
717 mca_btl_ugni_progress_wait_list (ugni_module);
718 }
719
720 if (device->dev_rdma_local_cq.active_operations) {
721 count += mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_cq);
722 }
723
724 if (mca_btl_ugni_component.progress_thread_enabled && device->dev_rdma_local_irq_cq.active_operations) {
725 count += mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_irq_cq);
726 }
727 }
728
729 return count;
730 }
731
732 int mca_btl_ugni_flush (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint)
733 {
734 mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
735
736 for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
737 mca_btl_ugni_device_t *device = ugni_module->devices + i;
738
739
740
741
742 while (device->dev_rdma_local_cq.active_operations) {
743 (void) mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_cq);
744 }
745
746
747 device->flushed = true;
748 }
749
750 return OPAL_SUCCESS;
751 }
752
753 void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc)
754 {
755
756 fprintf (stderr, "desc->gni_desc.post_id = %" PRIx64 "\n", desc->gni_desc.post_id);
757 fprintf (stderr, "desc->gni_desc.status = %" PRIx64 "\n", desc->gni_desc.status);
758 fprintf (stderr, "desc->gni_desc.cq_mode_complete = %hu\n", desc->gni_desc.cq_mode_complete);
759 fprintf (stderr, "desc->gni_desc.type = %d\n", desc->gni_desc.type);
760 fprintf (stderr, "desc->gni_desc.cq_mode = %hu\n", desc->gni_desc.cq_mode);
761 fprintf (stderr, "desc->gni_desc.dlvr_mode = %hu\n", desc->gni_desc.dlvr_mode);
762 fprintf (stderr, "desc->gni_desc.local_addr = %" PRIx64 "\n", desc->gni_desc.local_addr);
763 fprintf (stderr, "desc->gni_desc.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->gni_desc.local_mem_hndl.qword1,
764 desc->gni_desc.local_mem_hndl.qword2);
765 fprintf (stderr, "desc->gni_desc.remote_addr = %" PRIx64 "\n", desc->gni_desc.remote_addr);
766 fprintf (stderr, "desc->gni_desc.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->gni_desc.remote_mem_hndl.qword1,
767 desc->gni_desc.remote_mem_hndl.qword2);
768 fprintf (stderr, "desc->gni_desc.length = %" PRIu64 "\n", desc->gni_desc.length);
769 fprintf (stderr, "desc->gni_desc.rdma_mode = %hu\n", desc->gni_desc.rdma_mode);
770 fprintf (stderr, "desc->gni_desc.amo_cmd = %d\n", desc->gni_desc.amo_cmd);
771 }