This source file includes the following definitions:
- check_config_value_bool
- ompi_osc_rdma_pvar_read
- ompi_osc_rdma_component_register
- ompi_osc_rdma_component_init
- ompi_osc_rdma_component_finalize
- ompi_osc_rdma_component_query
- ompi_osc_rdma_initialize_region
- allocate_state_single
- allocate_state_shared
- ompi_osc_rdma_query_mtls
- ompi_osc_rdma_query_btls
- ompi_osc_rdma_share_data
- ompi_osc_rdma_create_groups
- ompi_osc_rdma_check_parameters
- ompi_osc_rdma_component_select
- ompi_osc_rdma_set_no_lock_info
31 #include "ompi_config.h"
32
33 #include <string.h>
34
35 #include "osc_rdma.h"
36 #include "osc_rdma_frag.h"
37 #include "osc_rdma_request.h"
38 #include "osc_rdma_active_target.h"
39 #include "osc_rdma_passive_target.h"
40 #include "osc_rdma_comm.h"
41 #include "osc_rdma_dynamic.h"
42 #include "osc_rdma_accumulate.h"
43
44 #include "opal/threads/mutex.h"
45 #include "opal/util/arch.h"
46 #include "opal/util/argv.h"
47 #include "opal/util/printf.h"
48 #include "opal/align.h"
49 #if OPAL_CUDA_SUPPORT
50 #include "opal/datatype/opal_datatype_cuda.h"
51 #endif
52 #include "opal/util/info_subscriber.h"
53
54 #include "ompi/info/info.h"
55 #include "ompi/communicator/communicator.h"
56 #include "ompi/mca/osc/osc.h"
57 #include "ompi/mca/osc/base/base.h"
58 #include "ompi/mca/osc/base/osc_base_obj_convert.h"
59 #include "ompi/mca/pml/pml.h"
60 #include "opal/mca/btl/base/base.h"
61 #include "opal/mca/base/mca_base_pvar.h"
62 #include "ompi/mca/bml/base/base.h"
63 #include "ompi/mca/mtl/base/base.h"
64
65 static int ompi_osc_rdma_component_register (void);
66 static int ompi_osc_rdma_component_init (bool enable_progress_threads, bool enable_mpi_threads);
67 static int ompi_osc_rdma_component_finalize (void);
68 static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, size_t size, int disp_unit,
69 struct ompi_communicator_t *comm, struct opal_info_t *info,
70 int flavor);
71 static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, size_t size, int disp_unit,
72 struct ompi_communicator_t *comm, struct opal_info_t *info,
73 int flavor, int *model);
74 #if 0
75 static int ompi_osc_rdma_set_info (struct ompi_win_t *win, struct opal_info_t *info);
76 static int ompi_osc_rdma_get_info (struct ompi_win_t *win, struct opal_info_t **info_used);
77 #endif
78 static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_base_module_t **btl);
79 static int ompi_osc_rdma_query_mtls (void);
80
81 static char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, char *key, char *value);
82
83 static char *ompi_osc_rdma_btl_names;
84 static char *ompi_osc_rdma_mtl_names;
85
86 static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = {
87 {.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"},
88 {.value = OMPI_OSC_RDMA_LOCKING_ON_DEMAND, .string = "on_demand"},
89 {.string = NULL},
90 };
91
92 ompi_osc_rdma_component_t mca_osc_rdma_component = {
93 .super = {
94 .osc_version = {
95 OMPI_OSC_BASE_VERSION_3_0_0,
96 .mca_component_name = "rdma",
97 MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
98 OMPI_RELEASE_VERSION),
99 .mca_register_component_params = ompi_osc_rdma_component_register
100 },
101 .osc_data = {
102
103 MCA_BASE_METADATA_PARAM_NONE
104 },
105 .osc_init = ompi_osc_rdma_component_init,
106 .osc_query = ompi_osc_rdma_component_query,
107 .osc_select = ompi_osc_rdma_component_select,
108 .osc_finalize = ompi_osc_rdma_component_finalize
109 }
110 };
111
112 ompi_osc_base_module_t ompi_osc_rdma_module_rdma_template = {
113 .osc_win_attach = ompi_osc_rdma_attach,
114 .osc_win_detach = ompi_osc_rdma_detach,
115 .osc_free = ompi_osc_rdma_free,
116
117 .osc_put = ompi_osc_rdma_put,
118 .osc_get = ompi_osc_rdma_get,
119 .osc_accumulate = ompi_osc_rdma_accumulate,
120 .osc_compare_and_swap = ompi_osc_rdma_compare_and_swap,
121 .osc_fetch_and_op = ompi_osc_rdma_fetch_and_op,
122 .osc_get_accumulate = ompi_osc_rdma_get_accumulate,
123
124 .osc_rput = ompi_osc_rdma_rput,
125 .osc_rget = ompi_osc_rdma_rget,
126 .osc_raccumulate = ompi_osc_rdma_raccumulate,
127 .osc_rget_accumulate = ompi_osc_rdma_rget_accumulate,
128
129 .osc_fence = ompi_osc_rdma_fence_atomic,
130
131 .osc_start = ompi_osc_rdma_start_atomic,
132 .osc_complete = ompi_osc_rdma_complete_atomic,
133 .osc_post = ompi_osc_rdma_post_atomic,
134 .osc_wait = ompi_osc_rdma_wait_atomic,
135 .osc_test = ompi_osc_rdma_test_atomic,
136
137 .osc_lock = ompi_osc_rdma_lock_atomic,
138 .osc_unlock = ompi_osc_rdma_unlock_atomic,
139 .osc_lock_all = ompi_osc_rdma_lock_all_atomic,
140 .osc_unlock_all = ompi_osc_rdma_unlock_all_atomic,
141
142 .osc_sync = ompi_osc_rdma_sync,
143 .osc_flush = ompi_osc_rdma_flush,
144 .osc_flush_all = ompi_osc_rdma_flush_all,
145 .osc_flush_local = ompi_osc_rdma_flush_local,
146 .osc_flush_local_all = ompi_osc_rdma_flush_local_all,
147 };
148
149
150
151
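/* Look up a boolean info key; if the key is not present in the info object,
 * fall back to the osc/rdma MCA variable of the same name. */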
152 static bool check_config_value_bool (char *key, opal_info_t *info)
153 {
154 int ret, flag, param;
155 bool result = false;
156 const bool *flag_value = &result;
157
158 ret = opal_info_get_bool (info, key, &result, &flag);
159 if (OMPI_SUCCESS == ret && flag) {
160 return result;
161 }
162
163 param = mca_base_var_find("ompi", "osc", "rdma", key);
164 if (0 <= param) {
165 (void) mca_base_var_get_value(param, &flag_value, NULL, NULL);
166 }
167
168 return flag_value[0];
169 }
170
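/* Performance-variable read callback. pvar->ctx carries the byte offset of the
 * counter inside ompi_osc_rdma_module_t (see the offsetof() uses in
 * ompi_osc_rdma_component_register below). */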
171 static int ompi_osc_rdma_pvar_read (const struct mca_base_pvar_t *pvar, void *value, void *obj)
172 {
173 ompi_win_t *win = (ompi_win_t *) obj;
174 ompi_osc_rdma_module_t *module = GET_MODULE(win);
175 int offset = (int) (intptr_t) pvar->ctx;
176
177 memcpy (value, (char *) module + offset, sizeof (unsigned long));
178
179 return OMPI_SUCCESS;
180 }
181
182 static int ompi_osc_rdma_component_register (void)
183 {
184 char *description_str;
185 mca_base_var_enum_t *new_enum;
186
187 mca_osc_rdma_component.no_locks = false;
188 opal_asprintf(&description_str, "Enable optimizations available only if MPI_LOCK is "
189 "not used. Info key of same name overrides this value (default: %s)",
190 mca_osc_rdma_component.no_locks ? "true" : "false");
191 (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "no_locks", description_str,
192 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
193 MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.no_locks);
194 free(description_str);
195
196 mca_osc_rdma_component.acc_single_intrinsic = false;
197 opal_asprintf(&description_str, "Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes "
198 "that will not use anything more than a single predefined datatype (default: %s)",
199 mca_osc_rdma_component.acc_single_intrinsic ? "true" : "false");
200 (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_single_intrinsic",
201 description_str, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
202 MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_single_intrinsic);
203 free(description_str);
204
205 mca_osc_rdma_component.acc_use_amo = true;
206 opal_asprintf(&description_str, "Enable the use of network atomic memory operations when using single "
207 "intrinsic optimizations. If not set, network compare-and-swap will be "
208 "used instead (default: %s)", mca_osc_rdma_component.acc_use_amo ? "true" : "false");
209 (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_use_amo", description_str,
210 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_GROUP,
211 &mca_osc_rdma_component.acc_use_amo);
212 free(description_str);
213
214 mca_osc_rdma_component.buffer_size = 32768;
215 opal_asprintf(&description_str, "Size of temporary buffers (default: %d)", mca_osc_rdma_component.buffer_size);
216 (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size", description_str,
217 MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
218 MCA_BASE_VAR_SCOPE_LOCAL, &mca_osc_rdma_component.buffer_size);
219 free(description_str);
220
221 mca_osc_rdma_component.max_attach = 32;
222 opal_asprintf(&description_str, "Maximum number of buffers that can be attached to a dynamic window. "
223 "Keep in mind that each attached buffer will use a potentially limited "
224 "resource (default: %d)", mca_osc_rdma_component.max_attach);
225 (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach", description_str,
226 MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
227 MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach);
228 free(description_str);
229
230 mca_osc_rdma_component.priority = 101;
231 opal_asprintf(&description_str, "Priority of the osc/rdma component (default: %d)",
232 mca_osc_rdma_component.priority);
233 (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "priority", description_str,
234 MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
235 MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.priority);
236 free(description_str);
237
238 (void) mca_base_var_enum_create ("osc_rdma_locking_mode", ompi_osc_rdma_locking_modes, &new_enum);
239
240 mca_osc_rdma_component.locking_mode = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL;
241 (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "locking_mode",
242 "Locking mode to use for passive-target synchronization (default: two_level)",
243 MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, OPAL_INFO_LVL_3,
244 MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.locking_mode);
245 OBJ_RELEASE(new_enum);
246
247 ompi_osc_rdma_btl_names = "openib,ugni,uct,ucp";
248 opal_asprintf(&description_str, "Comma-delimited list of BTL component names to allow without verifying "
249 "connectivity. Do not add a BTL to this list unless it can reach all "
250 "processes in any communicator used with an MPI window (default: %s)",
251 ompi_osc_rdma_btl_names);
252 (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "btls", description_str,
253 MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
254 MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_names);
255 free(description_str);
256
257 ompi_osc_rdma_mtl_names = "psm2";
258 opal_asprintf(&description_str, "Comma-delimited list of MTL component names for which the priority of the "
259 "rdma osc component is lowered in favor of the pt2pt osc component (default: %s)", ompi_osc_rdma_mtl_names);
260 (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "mtls", description_str,
261 MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
262 MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_mtl_names);
263 free(description_str);
264
265 if (0 == access ("/dev/shm", W_OK)) {
266 mca_osc_rdma_component.backing_directory = "/dev/shm";
267 } else {
268 mca_osc_rdma_component.backing_directory = ompi_process_info.proc_session_dir;
269 }
270
271 (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "backing_directory",
272 "Directory to place backing files for memory windows. "
273 "This directory should be on a local filesystem such as /tmp or "
274 "/dev/shm (default: (linux) /dev/shm, (others) session directory)",
275 MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
276 MCA_BASE_VAR_SCOPE_READONLY, &mca_osc_rdma_component.backing_directory);
277
278
279
280 (void) mca_base_component_pvar_register (&mca_osc_rdma_component.super.osc_version, "put_retry_count",
281 "Number of times a put transaction was retried due to resource limitations",
282 OPAL_INFO_LVL_4, MCA_BASE_PVAR_CLASS_COUNTER, MCA_BASE_VAR_TYPE_UNSIGNED_LONG,
283 NULL, MCA_BASE_VAR_BIND_MPI_WIN, MCA_BASE_PVAR_FLAG_CONTINUOUS,
284 ompi_osc_rdma_pvar_read, NULL, NULL,
285 (void *) (intptr_t) offsetof (ompi_osc_rdma_module_t, put_retry_count));
286
287 (void) mca_base_component_pvar_register (&mca_osc_rdma_component.super.osc_version, "get_retry_count",
288 "Number of times a get transaction was retried due to resource limitations",
289 OPAL_INFO_LVL_4, MCA_BASE_PVAR_CLASS_COUNTER, MCA_BASE_VAR_TYPE_UNSIGNED_LONG,
290 NULL, MCA_BASE_VAR_BIND_MPI_WIN, MCA_BASE_PVAR_FLAG_CONTINUOUS,
291 ompi_osc_rdma_pvar_read, NULL, NULL,
292 (void *) (intptr_t) offsetof (ompi_osc_rdma_module_t, get_retry_count));
293
294 return OMPI_SUCCESS;
295 }
296
297 static int ompi_osc_rdma_component_init (bool enable_progress_threads,
298 bool enable_mpi_threads)
299 {
300 int ret;
301
302 OBJ_CONSTRUCT(&mca_osc_rdma_component.lock, opal_mutex_t);
303 OBJ_CONSTRUCT(&mca_osc_rdma_component.request_gc, opal_list_t);
304 OBJ_CONSTRUCT(&mca_osc_rdma_component.buffer_gc, opal_list_t);
305 OBJ_CONSTRUCT(&mca_osc_rdma_component.modules, opal_hash_table_t);
306
307 opal_hash_table_init(&mca_osc_rdma_component.modules, 2);
308
309 OBJ_CONSTRUCT(&mca_osc_rdma_component.frags, opal_free_list_t);
310 ret = opal_free_list_init (&mca_osc_rdma_component.frags,
311 sizeof(ompi_osc_rdma_frag_t), 8,
312 OBJ_CLASS(ompi_osc_rdma_frag_t),
313 mca_osc_rdma_component.buffer_size, 8,
314 4, -1, 4, NULL, 0, NULL, NULL, NULL);
315 if (OPAL_SUCCESS != ret) {
316 opal_output_verbose(1, ompi_osc_base_framework.framework_output,
317 "%s:%d: opal_free_list_init failed: %d",
318 __FILE__, __LINE__, ret);
319 return ret;
320 }
321
322 OBJ_CONSTRUCT(&mca_osc_rdma_component.requests, opal_free_list_t);
323 ret = opal_free_list_init (&mca_osc_rdma_component.requests,
324 sizeof(ompi_osc_rdma_request_t), 8,
325 OBJ_CLASS(ompi_osc_rdma_request_t), 0, 0,
326 0, -1, 32, NULL, 0, NULL, NULL, NULL);
327 if (OPAL_SUCCESS != ret) {
328 opal_output_verbose(1, ompi_osc_base_framework.framework_output,
329 "%s:%d: opal_free_list_init failed: %d\n",
330 __FILE__, __LINE__, ret);
331 }
332
333 return ret;
334 }
335
336
337 int ompi_osc_rdma_component_finalize (void)
338 {
339 size_t num_modules;
340
341 if (0 != (num_modules = opal_hash_table_get_size(&mca_osc_rdma_component.modules))) {
342 opal_output(ompi_osc_base_framework.framework_output, "WARNING: There were %d Windows created but "
343 "not freed.", (int) num_modules);
344 }
345
346 OBJ_DESTRUCT(&mca_osc_rdma_component.frags);
347 OBJ_DESTRUCT(&mca_osc_rdma_component.modules);
348 OBJ_DESTRUCT(&mca_osc_rdma_component.lock);
349 OBJ_DESTRUCT(&mca_osc_rdma_component.requests);
350 OBJ_DESTRUCT(&mca_osc_rdma_component.request_gc);
351 OBJ_DESTRUCT(&mca_osc_rdma_component.buffer_gc);
352
353 return OMPI_SUCCESS;
354 }
355
356
357 static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, size_t size, int disp_unit,
358 struct ompi_communicator_t *comm, struct opal_info_t *info,
359 int flavor)
360 {
361
362 if (MPI_WIN_FLAVOR_SHARED == flavor) {
363 return -1;
364 }
365
366 #if OPAL_CUDA_SUPPORT
367
368 if (MPI_WIN_FLAVOR_CREATE == flavor) {
369 if (opal_cuda_check_bufs(*base, NULL)) {
370 return -1;
371 }
372 }
373 #endif
374
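/* If one of the MTLs named in the osc_rdma_mtls list is currently selected,
 * return a low priority (5) so another osc component (e.g. pt2pt) is favored. */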
375 if (OMPI_SUCCESS == ompi_osc_rdma_query_mtls ()) {
376 return 5;
377 }
378
379 if (OMPI_SUCCESS != ompi_osc_rdma_query_btls (comm, NULL)) {
380 return -1;
381 }
382
383
384 return mca_osc_rdma_component.priority;
385 }
386
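/* Fill in the first region of the window state with the local base address and
 * length and, when the BTL requires memory registration, the registration
 * handle data for that region. */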
387 static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void **base, size_t size) {
388 ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *) module->state->regions;
389 int ret;
390
391
392 module->state->disp_unit = module->disp_unit;
393
394
395 module->state->region_count = 1;
396 region->base = (osc_rdma_base_t) (intptr_t) *base;
397 region->len = size;
398
399 if (module->selected_btl->btl_register_mem && size) {
400 if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor || NULL == module->state_handle) {
401 ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, *base, size, MCA_BTL_REG_FLAG_ACCESS_ANY,
402 &module->base_handle);
403 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
404 return OMPI_ERR_OUT_OF_RESOURCE;
405 }
406
407 memcpy (region->btl_handle_data, module->base_handle, module->selected_btl->btl_registration_handle_size);
408 } else {
409 memcpy (region->btl_handle_data, module->state_handle, module->selected_btl->btl_registration_handle_size);
410 }
411 }
412
413 return OMPI_SUCCESS;
414 }
415
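/* Allocate the window state for a process that has no on-node peers: the rank
 * array, region, state, leader peer data, and (for MPI_WIN_FLAVOR_ALLOCATE)
 * the window buffer all live in one calloc'd, BTL-registered allocation. */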
416 static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size)
417 {
418 size_t total_size, local_rank_array_size, leader_peer_data_size;
419 ompi_osc_rdma_peer_t *my_peer;
420 int ret, my_rank;
421
422 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating private internal state");
423
424 my_rank = ompi_comm_rank (module->comm);
425
426 local_rank_array_size = sizeof (ompi_osc_rdma_rank_data_t) * RANK_ARRAY_COUNT(module);
427 leader_peer_data_size = module->region_size * module->node_count;
428
429
430
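/* Layout of the single allocation, in order: local rank array, this process's
 * region, window state, leader peer data, then the user buffer when the
 * flavor is MPI_WIN_FLAVOR_ALLOCATE. */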
431 total_size = local_rank_array_size + module->region_size +
432 module->state_size + leader_peer_data_size;
433
434 if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
435 total_size += size;
436 }
437
438
439
440
441
442 module->rank_array = calloc (total_size, 1);
443 if (OPAL_UNLIKELY(NULL == module->rank_array)) {
444 return OMPI_ERR_OUT_OF_RESOURCE;
445 }
446
447
448
449
450
451 module->state_offset = local_rank_array_size + module->region_size;
452
453 module->state = (ompi_osc_rdma_state_t *) ((intptr_t) module->rank_array + module->state_offset);
454 module->node_comm_info = (unsigned char *) ((intptr_t) module->state + module->state_size);
455
456 if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
457 *base = (void *) ((intptr_t) module->node_comm_info + leader_peer_data_size);
458 }
459
460
461 ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->rank_array, total_size,
462 MCA_BTL_REG_FLAG_ACCESS_ANY, &module->state_handle);
463 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
464 return ret;
465 }
466
467 if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
468 ret = ompi_osc_rdma_initialize_region (module, base, size);
469 if (OMPI_SUCCESS != ret) {
470 return ret;
471 }
472 }
473
474 ret = ompi_osc_rdma_new_peer (module, my_rank, &my_peer);
475 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
476 return ret;
477 }
478
479 ret = ompi_osc_module_add_peer (module, my_peer);
480 if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
481 OBJ_RELEASE(my_peer);
482 return ret;
483 }
484
485 module->my_peer = my_peer;
486 module->free_after = module->rank_array;
487 my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
488 my_peer->state = (uint64_t) (uintptr_t) module->state;
489
490 if (module->use_cpu_atomics) {
491
492 my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE;
493 } else {
494
495 my_peer->state_handle = module->state_handle;
496 my_peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, my_rank);
497 }
498
499 if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
500 ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) my_peer;
501
502 ex_peer->super.base = (intptr_t) *base;
503
504 if (!module->same_size) {
505 ex_peer->size = size;
506 }
507
508 if (!module->use_cpu_atomics) {
509 if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
510
511 ex_peer->super.base_handle = module->state_handle;
512 } else {
513 ex_peer->super.base_handle = module->base_handle;
514 }
515 }
516 }
517
518 return OMPI_SUCCESS;
519 }
520
521 struct _local_data {
522 int rank;
523 size_t size;
524 };
525
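/* Allocate the window state (and, for MPI_WIN_FLAVOR_ALLOCATE, the window
 * data) in a shared-memory segment mapped by every process on the node. Falls
 * back to allocate_state_single() when this process is alone on its node. */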
526 static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, size_t size)
527 {
528 ompi_communicator_t *shared_comm;
529 unsigned long offset, total_size;
530 unsigned long state_base, data_base;
531 int local_rank, local_size, ret;
532 size_t local_rank_array_size, leader_peer_data_size, my_base_offset = 0;
533 int my_rank = ompi_comm_rank (module->comm);
534 int global_size = ompi_comm_size (module->comm);
535 ompi_osc_rdma_region_t *state_region;
536 struct _local_data *temp;
537 char *data_file;
538
539 shared_comm = module->shared_comm;
540
541 local_rank = ompi_comm_rank (shared_comm);
542 local_size = ompi_comm_size (shared_comm);
543
544
545 module->use_cpu_atomics = local_size == global_size || (module->selected_btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB);
546
547 if (1 == local_size) {
548
549 return allocate_state_single (module, base, size);
550 }
551
552 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating shared internal state");
553
554 local_rank_array_size = sizeof (ompi_osc_rdma_rank_data_t) * RANK_ARRAY_COUNT (module);
555 leader_peer_data_size = module->region_size * module->node_count;
556
557
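/* Shared segment layout, in order: local rank array, the leader's region, one
 * state block per local process, the leader peer data (node_comm_info), then
 * the per-process window data for MPI_WIN_FLAVOR_ALLOCATE. */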
558 module->state_offset = state_base = local_rank_array_size + module->region_size;
559 data_base = state_base + leader_peer_data_size + module->state_size * local_size;
560
561 do {
562 temp = calloc (local_size, sizeof (temp[0]));
563 if (NULL == temp) {
564 ret = OMPI_ERR_OUT_OF_RESOURCE;
565 break;
566 }
567
568 temp[local_rank].rank = my_rank;
569 temp[local_rank].size = size;
570
571
572 ret = shared_comm->c_coll->coll_allgather (MPI_IN_PLACE, sizeof (*temp), MPI_BYTE, temp, sizeof (*temp),
573 MPI_BYTE, shared_comm, shared_comm->c_coll->coll_allgather_module);
574 if (OMPI_SUCCESS != ret) {
575 break;
576 }
577
578 total_size = data_base;
579
580 if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
581 for (int i = 0 ; i < local_size ; ++i) {
582 if (local_rank == i) {
583 my_base_offset = total_size;
584 }
585 total_size += temp[i].size;
586 }
587 }
588
589 if (0 == local_rank) {
590
591 ret = opal_asprintf (&data_file, "%s" OPAL_PATH_SEP "osc_rdma.%s.%x.%d",
592 mca_osc_rdma_component.backing_directory, ompi_process_info.nodename,
593 OMPI_PROC_MY_NAME->jobid, ompi_comm_get_cid(module->comm));
594 if (0 > ret) {
595 ret = OMPI_ERR_OUT_OF_RESOURCE;
596 break;
597 }
598
599
600 ret = opal_shmem_segment_create (&module->seg_ds, data_file, total_size);
601 free (data_file);
602 if (OPAL_SUCCESS != ret) {
603 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create shared memory segment");
604 break;
605 }
606 }
607
608 ret = shared_comm->c_coll->coll_bcast (&module->seg_ds, sizeof (module->seg_ds), MPI_BYTE, 0,
609 shared_comm, shared_comm->c_coll->coll_bcast_module);
610 if (OMPI_SUCCESS != ret) {
611 break;
612 }
613
614 module->segment_base = opal_shmem_segment_attach (&module->seg_ds);
615 if (NULL == module->segment_base) {
616 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to attach to the shared memory segment");
617 ret = OPAL_ERROR;
618 break;
619 }
620
621 if (size && MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
622 *base = (void *)((intptr_t) module->segment_base + my_base_offset);
623 memset (*base, 0, size);
624 }
625
626 module->rank_array = (ompi_osc_rdma_rank_data_t *) module->segment_base;
627
628 state_region = (ompi_osc_rdma_region_t *) ((uintptr_t) module->segment_base + local_rank_array_size);
629 module->state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * local_rank);
630
631
632 module->node_comm_info = (unsigned char *) ((uintptr_t) module->segment_base + state_base + module->state_size * local_size);
633
634
635 memset (module->state, 0, module->state_size);
636
637
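/* Wait until every local process has attached to and initialized its part of
 * the segment before the local leader unlinks the backing file and registers
 * the memory with the BTL. */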
638 shared_comm->c_coll->coll_barrier(shared_comm, shared_comm->c_coll->coll_barrier_module);
639
640 if (0 == local_rank) {
641
642 opal_shmem_unlink (&module->seg_ds);
643
644 ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, MCA_BTL_REG_FLAG_ACCESS_ANY,
645 &module->state_handle);
646 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
647 break;
648 }
649
650 state_region->base = (intptr_t) module->segment_base;
651 if (module->state_handle) {
652 memcpy (state_region->btl_handle_data, module->state_handle, module->selected_btl->btl_registration_handle_size);
653 }
654 }
655
656
657 shared_comm->c_coll->coll_barrier(shared_comm, shared_comm->c_coll->coll_barrier_module);
658
659 if (MPI_WIN_FLAVOR_CREATE == module->flavor) {
660 ret = ompi_osc_rdma_initialize_region (module, base, size);
661 if (OMPI_SUCCESS != ret) {
662 break;
663 }
664 }
665
666 if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
667 ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *) module->state->regions;
668 module->state->disp_unit = module->disp_unit;
669 module->state->region_count = 1;
670 region->base = state_region->base + my_base_offset;
671 region->len = size;
672 if (module->selected_btl->btl_register_mem) {
673 memcpy (region->btl_handle_data, state_region->btl_handle_data, module->selected_btl->btl_registration_handle_size);
674 }
675 }
676
677
678 shared_comm->c_coll->coll_barrier(shared_comm, shared_comm->c_coll->coll_barrier_module);
679
680 offset = data_base;
681 for (int i = 0 ; i < local_size ; ++i) {
682
683 ompi_osc_rdma_state_t *peer_state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * i);
684 ompi_osc_rdma_region_t *peer_region = (ompi_osc_rdma_region_t *) peer_state->regions;
685 ompi_osc_rdma_peer_extended_t *ex_peer;
686 ompi_osc_rdma_peer_t *peer;
687 int peer_rank = temp[i].rank;
688
689 ret = ompi_osc_rdma_new_peer (module, peer_rank, &peer);
690 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
691 break;
692 }
693
694 ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
695
696
697 if (module->use_cpu_atomics) {
698
699 peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE;
700 peer->state = (osc_rdma_counter_t) peer_state;
701 peer->state_endpoint = NULL;
702 } else {
703
704 if (module->selected_btl->btl_register_mem) {
705 peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
706 }
707 peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
708 peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, temp[0].rank);
709 }
710
711 if (my_rank == peer_rank) {
712 module->my_peer = peer;
713 }
714
715 if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor || MPI_WIN_FLAVOR_CREATE == module->flavor) {
716
717 peer->data_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_rank);
718 } else if (!module->use_cpu_atomics && temp[i].size) {
719
720 peer->data_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, temp[0].rank);
721 }
722
723 ompi_osc_module_add_peer (module, peer);
724
725 if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
726 if (module->use_cpu_atomics && peer_rank == my_rank) {
727 peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
728 }
729
730 continue;
731 } else if (0 == temp[i].size) {
732
733 continue;
734 }
735
736
737 if (!(module->same_disp_unit && module->same_size)) {
738 ex_peer->disp_unit = peer_state->disp_unit;
739 ex_peer->size = temp[i].size;
740 }
741
742 if (module->use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) {
743
744 if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
745 ex_peer->super.base = (uintptr_t) module->segment_base + offset;
746 } else {
747 ex_peer->super.base = (uintptr_t) *base;
748 }
749
750 peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
751 offset += temp[i].size;
752 } else {
753 ex_peer->super.base = peer_region->base;
754
755 if (module->selected_btl->btl_register_mem) {
756 ex_peer->super.base_handle = (mca_btl_base_registration_handle_t *) peer_region->btl_handle_data;
757 }
758 }
759 }
760 } while (0);
761
762 free (temp);
763
764 return ret;
765 }
766
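/* Return OMPI_SUCCESS if the currently selected MTL appears in the
 * osc_rdma_mtls list, -1 otherwise. Used to lower this component's priority. */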
767 static int ompi_osc_rdma_query_mtls (void)
768 {
769 char **mtls_to_use;
770
771 mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names, ',');
772 if (mtls_to_use && ompi_mtl_base_selected_component) {
773 for (int i = 0 ; mtls_to_use[i] ; ++i) {
774 if (0 == strcmp (mtls_to_use[i], ompi_mtl_base_selected_component->mtl_version.mca_component_name)) {
775 opal_argv_free(mtls_to_use);
776 return OMPI_SUCCESS;
777 }
778 }
779 }
780 opal_argv_free(mtls_to_use);
781 return -1;
782 }
783
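/* Select a BTL to use for RDMA and atomics. First honor the allow-list in
 * osc_rdma_btls; otherwise scan every peer's RDMA-capable BTLs and pick the
 * lowest-latency BTL that reaches all processes and supports the required
 * RDMA and atomic operations. */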
784 static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_base_module_t **btl)
785 {
786 struct mca_btl_base_module_t **possible_btls = NULL;
787 int comm_size = ompi_comm_size (comm);
788 int rc = OMPI_SUCCESS, max_btls = 0;
789 unsigned int selected_latency = INT_MAX;
790 struct mca_btl_base_module_t *selected_btl = NULL;
791 mca_btl_base_selected_module_t *item;
792 int *btl_counts = NULL;
793 char **btls_to_use;
794 void *tmp;
795
796 btls_to_use = opal_argv_split (ompi_osc_rdma_btl_names, ',');
797 if (btls_to_use) {
798
799 OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) {
800 for (int i = 0 ; btls_to_use[i] ; ++i) {
801 if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) {
802 continue;
803 }
804
805 if ((item->btl_module->btl_flags & (MCA_BTL_FLAGS_RDMA)) == MCA_BTL_FLAGS_RDMA &&
806 (item->btl_module->btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS))) {
807 if (!selected_btl || item->btl_module->btl_latency < selected_btl->btl_latency) {
808 selected_btl = item->btl_module;
809 }
810 }
811 }
812 }
813
814 opal_argv_free (btls_to_use);
815 }
816
817 if (btl) {
818 *btl = selected_btl;
819 }
820
821 if (NULL != selected_btl) {
822 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "selected btl: %s",
823 selected_btl->btl_component->btl_version.mca_component_name);
824 return OMPI_SUCCESS;
825 }
826
827 for (int i = 0 ; i < comm_size ; ++i) {
828 ompi_proc_t *proc = ompi_comm_peer_lookup (comm, i);
829 mca_bml_base_endpoint_t *endpoint;
830 int num_btls, prev_max;
831
832 endpoint = mca_bml_base_get_endpoint (proc);
833 if (NULL == endpoint) {
834
835 rc = OMPI_ERR_UNREACH;
836 break;
837 }
838
839 num_btls = mca_bml_base_btl_array_get_size (&endpoint->btl_rdma);
840 if (0 == num_btls) {
841 rc = OMPI_ERR_NOT_AVAILABLE;
842
843 break;
844 }
845
846 prev_max = max_btls;
847
848 max_btls = (max_btls > num_btls) ? max_btls : num_btls;
849
850 tmp = realloc (possible_btls, sizeof (void *) * max_btls);
851 if (NULL == tmp) {
852 rc = OMPI_ERR_OUT_OF_RESOURCE;
853 break;
854 }
855 possible_btls = tmp;
856
857 for (int j = prev_max ; j < max_btls ; ++j) {
858 possible_btls[j] = NULL;
859 }
860
861 tmp = realloc (btl_counts, sizeof (int) * max_btls);
862 if (NULL == tmp) {
863 rc = OMPI_ERR_OUT_OF_RESOURCE;
864 break;
865 }
866 btl_counts = tmp;
867
868 for (int i_btl = 0 ; i_btl < num_btls ; ++i_btl) {
869
870 if ((endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) ==
871 (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS) && (endpoint->btl_rdma.bml_btls[i_btl].btl->btl_atomic_flags &
872 MCA_BTL_ATOMIC_SUPPORTS_ADD)) {
873 for (int j = 0 ; j < max_btls ; ++j) {
874 if (endpoint->btl_rdma.bml_btls[i_btl].btl == possible_btls[j]) {
875 ++btl_counts[j];
876 break;
877 } else if (NULL == possible_btls[j]) {
878 possible_btls[j] = endpoint->btl_rdma.bml_btls[i_btl].btl;
879 btl_counts[j] = 1;
880 break;
881 }
882 }
883 }
884 }
885 }
886
887 if (OMPI_SUCCESS != rc) {
888 free (possible_btls);
889 free (btl_counts);
890
891
892 return OMPI_ERR_NOT_AVAILABLE;
893 }
894
895 for (int i = 0 ; i < max_btls ; ++i) {
896 int btl_count = btl_counts[i];
897
898 if (NULL == possible_btls[i]) {
899 break;
900 }
901
902 if (possible_btls[i]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) {
903
904 btl_count++;
905 }
906
907 if (btl_count >= comm_size && possible_btls[i]->btl_latency < selected_latency) {
908 selected_btl = possible_btls[i];
909 selected_latency = possible_btls[i]->btl_latency;
910 }
911 }
912
913 free (possible_btls);
914 free (btl_counts);
915
916 if (btl) {
917 *btl = selected_btl;
918 }
919
920 if (NULL == selected_btl) {
921 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no suitable btls found");
922
923 return OMPI_ERR_NOT_AVAILABLE;
924 }
925
926 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "selected btl: %s",
927 selected_btl->btl_component->btl_version.mca_component_name);
928
929 return OMPI_SUCCESS;
930 }
931
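/* Exchange node id and on-node rank for every process in the window, and
 * allgather the leaders' region information across nodes. */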
932 static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module)
933 {
934 ompi_osc_rdma_region_t *my_data;
935 int ret, global_result;
936 int my_rank = ompi_comm_rank (module->comm);
937 int comm_size = ompi_comm_size (module->comm);
938 ompi_osc_rdma_rank_data_t *temp;
939
940 do {
941 temp = malloc (sizeof (*temp) * comm_size);
942 if (NULL == temp) {
943 ret = OMPI_ERR_OUT_OF_RESOURCE;
944 break;
945 }
946
947
948 temp[my_rank].node_id = module->node_id;
949 temp[my_rank].rank = ompi_comm_rank (module->shared_comm);
950
951 ret = module->comm->c_coll->coll_allgather (MPI_IN_PLACE, 1, MPI_2INT, temp, 1, MPI_2INT,
952 module->comm, module->comm->c_coll->coll_allgather_module);
953 if (OMPI_SUCCESS != ret) {
954 break;
955 }
956
957 if (0 == ompi_comm_rank (module->shared_comm)) {
958
959 my_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + ompi_comm_rank (module->local_leaders) *
960 module->region_size);
961
962 my_data->base = (uint64_t) (intptr_t) module->rank_array;
963
964 my_data->len = (osc_rdma_size_t) my_rank;
965
966 if (module->selected_btl->btl_register_mem) {
967 memcpy (my_data->btl_handle_data, module->state_handle, module->selected_btl->btl_registration_handle_size);
968 }
969
970
971 if (ompi_comm_size (module->local_leaders) > 1) {
972 ret = module->local_leaders->c_coll->coll_allgather (MPI_IN_PLACE, module->region_size, MPI_BYTE, module->node_comm_info,
973 module->region_size, MPI_BYTE, module->local_leaders,
974 module->local_leaders->c_coll->coll_allgather_module);
975 if (OMPI_SUCCESS != ret) {
976 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "leader allgather failed with ompi error code %d", ret);
977 break;
978 }
979 }
980
981 int base_rank = ompi_comm_rank (module->local_leaders) * ((comm_size + module->node_count - 1) / module->node_count);
982
983
984 for (int i = 0 ; i < RANK_ARRAY_COUNT(module) ; ++i) {
985 int save_rank = base_rank + i;
986 if (save_rank >= comm_size) {
987 break;
988 }
989
990 module->rank_array[i] = temp[save_rank];
991 }
992 }
993
994 free (temp);
995 } while (0);
996
997
998 ret = module->comm->c_coll->coll_allreduce (&ret, &global_result, 1, MPI_INT, MPI_MIN, module->comm,
999 module->comm->c_coll->coll_allreduce_module);
1000
1001 if (OMPI_SUCCESS != ret) {
1002 global_result = ret;
1003 }
1004
1005
1006 if (MPI_COMM_NULL != module->local_leaders) {
1007 ompi_comm_free (&module->local_leaders);
1008 }
1009
1010 if (MPI_COMM_NULL != module->shared_comm) {
1011 ompi_comm_free (&module->shared_comm);
1012 }
1013
1014 return global_result;
1015 }
1016
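/* Split the window communicator into an on-node (shared-memory) communicator
 * and a communicator containing one leader per node, then broadcast the node
 * count and this node's id to all local processes. */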
1017 static int ompi_osc_rdma_create_groups (ompi_osc_rdma_module_t *module)
1018 {
1019 int comm_rank, ret, local_rank;
1020 int values[2] = {0, 0};
1021
1022
1023 ret = ompi_comm_split_type (module->comm, MPI_COMM_TYPE_SHARED, 0, NULL, &module->shared_comm);
1024 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
1025 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create a shared memory communicator. error code %d", ret);
1026 return ret;
1027 }
1028
1029 local_rank = ompi_comm_rank (module->shared_comm);
1030
1031 comm_rank = ompi_comm_rank (module->comm);
1032
1033 ret = ompi_comm_split (module->comm, (0 == local_rank) ? 0 : MPI_UNDEFINED, comm_rank, &module->local_leaders,
1034 false);
1035 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
1036 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create local leaders communicator. error code %d", ret);
1037 return ret;
1038 }
1039
1040 if (0 == local_rank) {
1041 values[0] = ompi_comm_size (module->local_leaders);
1042 values[1] = ompi_comm_rank (module->local_leaders);
1043 }
1044
1045 if (ompi_comm_size (module->shared_comm) > 1) {
1046 ret = module->shared_comm->c_coll->coll_bcast (values, 2, MPI_INT, 0, module->shared_comm,
1047 module->shared_comm->c_coll->coll_bcast_module);
1048 if (OMPI_SUCCESS != ret) {
1049 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to broadcast local data. error code %d", ret);
1050 return ret;
1051 }
1052 }
1053
1054 module->node_count = values[0];
1055 module->node_id = values[1];
1056
1057 return OMPI_SUCCESS;
1058 }
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
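/* Detect whether all ranks use the same disp_unit and size with a single
 * MPI_MIN allreduce over (disp_unit, -disp_unit, size, -size): for any value x,
 * min(x) == -min(-x) == max(x) only when every rank contributed the same x.
 * For example, with sizes {4, 4, 8}: min(size) = 4 but -min(-size) = 8. */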
1070 static int ompi_osc_rdma_check_parameters (ompi_osc_rdma_module_t *module, int disp_unit, size_t size)
1071 {
1072 long values[4];
1073 int ret;
1074
1075 if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor || (module->same_size && module->same_disp_unit)) {
1076
1077 return OMPI_SUCCESS;
1078 }
1079
1080
1081 values[0] = disp_unit;
1082 values[1] = -disp_unit;
1083 values[2] = size;
1084 values[3] = -(ssize_t) size;
1085
1086 ret = module->comm->c_coll->coll_allreduce (MPI_IN_PLACE, values, 4, MPI_LONG, MPI_MIN, module->comm,
1087 module->comm->c_coll->coll_allreduce_module);
1088 if (OMPI_SUCCESS != ret) {
1089 return ret;
1090 }
1091
1092 if (values[0] == -values[1]) {
1093
1094 module->same_disp_unit = true;
1095 }
1096
1097 if (values[2] == -values[3]) {
1098
1099 module->same_size = true;
1100 }
1101
1102 return OMPI_SUCCESS;
1103 }
1104
1105
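/* Main window-creation entry point: allocate and initialize the module, select
 * a BTL, create the node/leader communicators, allocate the (shared) state,
 * and publish the window data to all peers. */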
1106 static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, size_t size, int disp_unit,
1107 struct ompi_communicator_t *comm, struct opal_info_t *info,
1108 int flavor, int *model)
1109 {
1110 ompi_osc_rdma_module_t *module = NULL;
1111 int world_size = ompi_comm_size (comm);
1112 int init_limit = 256;
1113 int ret;
1114 char *name;
1115
1116
1117
1118 if (MPI_WIN_FLAVOR_SHARED == flavor) {
1119 return OMPI_ERR_NOT_SUPPORTED;
1120 }
1121
1122
1123 module = (ompi_osc_rdma_module_t *) calloc (1, sizeof (ompi_osc_rdma_module_t));
1124 if (NULL == module) {
1125 return OMPI_ERR_OUT_OF_RESOURCE;
1126 }
1127
1128
1129 OBJ_CONSTRUCT(&module->lock, opal_recursive_mutex_t);
1130 OBJ_CONSTRUCT(&module->outstanding_locks, opal_hash_table_t);
1131 OBJ_CONSTRUCT(&module->pending_posts, opal_list_t);
1132 OBJ_CONSTRUCT(&module->peer_lock, opal_mutex_t);
1133 OBJ_CONSTRUCT(&module->all_sync, ompi_osc_rdma_sync_t);
1134
1135 module->same_disp_unit = check_config_value_bool ("same_disp_unit", info);
1136 module->same_size = check_config_value_bool ("same_size", info);
1137 module->no_locks = check_config_value_bool ("no_locks", info);
1138 module->locking_mode = mca_osc_rdma_component.locking_mode;
1139 module->acc_single_intrinsic = check_config_value_bool ("acc_single_intrinsic", info);
1140 module->acc_use_amo = mca_osc_rdma_component.acc_use_amo;
1141
1142 module->all_sync.module = module;
1143
1144 module->flavor = flavor;
1145 module->win = win;
1146 module->disp_unit = disp_unit;
1147 module->size = size;
1148
1149
1150 win->w_osc_module = (ompi_osc_base_module_t*) module;
1151
1152 if (!module->no_locks) {
1153 if (world_size > init_limit) {
1154 ret = opal_hash_table_init (&module->outstanding_locks, init_limit);
1155 if (OPAL_SUCCESS != ret) {
1156 ompi_osc_rdma_free (win);
1157 return ret;
1158 }
1159 } else {
1160 module->outstanding_lock_array = calloc (world_size, sizeof (module->outstanding_lock_array[0]));
1161 if (NULL == module->outstanding_lock_array) {
1162 ompi_osc_rdma_free (win);
1163 return OMPI_ERR_OUT_OF_RESOURCE;
1164 }
1165 }
1166 }
1167
1168 ret = ompi_comm_dup(comm, &module->comm);
1169 if (OMPI_SUCCESS != ret) {
1170 ompi_osc_rdma_free (win);
1171 return ret;
1172 }
1173
1174 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "creating osc/rdma window of flavor %d with id %d",
1175 flavor, ompi_comm_get_cid(module->comm));
1176
1177
1178 if (world_size > init_limit) {
1179 OBJ_CONSTRUCT(&module->peer_hash, opal_hash_table_t);
1180 ret = opal_hash_table_init (&module->peer_hash, init_limit);
1181 } else {
1182 module->peer_array = calloc (world_size, sizeof (ompi_osc_rdma_peer_t *));
1183 if (NULL == module->peer_array) {
1184 ret = OMPI_ERR_OUT_OF_RESOURCE;
1185 }
1186 }
1187
1188 if (OPAL_SUCCESS != ret) {
1189 ompi_osc_rdma_free (win);
1190 return ret;
1191 }
1192
1193
1194 ret = ompi_osc_rdma_query_btls (module->comm, &module->selected_btl);
1195 if (OMPI_SUCCESS != ret) {
1196 ompi_osc_rdma_free (win);
1197 return ret;
1198 }
1199
1200
1201
1202 module->region_size = module->selected_btl->btl_registration_handle_size + sizeof (ompi_osc_rdma_region_t);
1203
1204 module->state_size = sizeof (ompi_osc_rdma_state_t);
1205
1206 if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
1207 module->state_size += module->region_size;
1208 } else {
1209 module->state_size += mca_osc_rdma_component.max_attach * module->region_size;
1210 }
1211
1212
1213
1214 opal_infosubscribe_subscribe(&win->super, "no_locks", "false", ompi_osc_rdma_set_no_lock_info);
1215
1216
1217
1218
1219
1220
1221
1222 memcpy(&module->super, &ompi_osc_rdma_module_rdma_template, sizeof(module->super));
1223
1224 ret = ompi_osc_rdma_check_parameters (module, disp_unit, size);
1225 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
1226 ompi_osc_rdma_free (win);
1227 return ret;
1228 }
1229
1230 ret = ompi_osc_rdma_create_groups (module);
1231 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
1232 ompi_osc_rdma_free (win);
1233 return ret;
1234 }
1235
1236
1237 ret = allocate_state_shared (module, base, size);
1238 if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
1239 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to allocate internal state");
1240 ompi_osc_rdma_free (win);
1241 return ret;
1242 }
1243
1244 if (MPI_WIN_FLAVOR_DYNAMIC == flavor) {
1245
1246 module->dynamic_handles = (ompi_osc_rdma_handle_t *) calloc (mca_osc_rdma_component.max_attach,
1247 sizeof (module->dynamic_handles[0]));
1248 if (NULL == module->dynamic_handles) {
1249 ompi_osc_rdma_free (win);
1250 return OMPI_ERR_OUT_OF_RESOURCE;
1251 }
1252 }
1253
1254
1255 if (module->no_locks) {
1256 win->w_flags |= OMPI_WIN_NO_LOCKS;
1257 }
1258
1259 if (module->same_size) {
1260 win->w_flags |= OMPI_WIN_SAME_SIZE;
1261 }
1262
1263 if (module->same_disp_unit) {
1264 win->w_flags |= OMPI_WIN_SAME_DISP;
1265 }
1266
1267
1268 OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
1269 ret = opal_hash_table_set_value_uint32(&mca_osc_rdma_component.modules,
1270 ompi_comm_get_cid(module->comm),
1271 module);
1272 OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
1273 if (OMPI_SUCCESS != ret) {
1274 ompi_osc_rdma_free (win);
1275 return ret;
1276 }
1277
1278
1279 *model = MPI_WIN_UNIFIED;
1280 win->w_osc_module = (ompi_osc_base_module_t*) module;
1281 opal_asprintf(&name, "rdma window %d", ompi_comm_get_cid(module->comm));
1282 ompi_win_set_name(win, name);
1283 free(name);
1284
1285
1286 opal_atomic_mb();
1287
1288 ret = ompi_osc_rdma_share_data (module);
1289 if (OMPI_SUCCESS != ret) {
1290 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to share window data with peers");
1291 ompi_osc_rdma_free (win);
1292 } else {
1293
1294 module->leader = ompi_osc_rdma_module_peer (module, 0);
1295
1296 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "finished creating osc/rdma window with id %d",
1297 ompi_comm_get_cid(module->comm));
1298 }
1299
1300 return ret;
1301 }
1302
1303
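/* Info subscriber callback for the "no_locks" key; enables or disables lock
 * support on the window and returns the value actually in effect. */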
1304 static char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, char *key, char *value)
1305 {
1306
1307 struct ompi_win_t *win = (struct ompi_win_t*) obj;
1308 ompi_osc_rdma_module_t *module = GET_MODULE(win);
1309 bool temp;
1310
1311 temp = opal_str_to_bool(value);
1312 if (temp && !module->no_locks) {
1313
1314
1315 OBJ_DESTRUCT(&module->outstanding_locks);
1316 OBJ_CONSTRUCT(&module->outstanding_locks, opal_hash_table_t);
1317
1318 module->no_locks = true;
1319 } else if (!temp && module->no_locks) {
1320 int world_size = ompi_comm_size (module->comm);
1321 int init_limit = world_size > 256 ? 256 : world_size;
1322 int ret;
1323
1324 ret = opal_hash_table_init (&module->outstanding_locks, init_limit);
1325 if (OPAL_SUCCESS != ret) {
1326 module->no_locks = true;
1327 } else {
1328 module->no_locks = false;
1329 }
1330 }
1331
1332 module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module);
1333
1334
1335
1336 return module->no_locks ? "true" : "false";
1337 }