root/ompi/mca/osc/rdma/osc_rdma_component.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. check_config_value_bool
  2. ompi_osc_rdma_pvar_read
  3. ompi_osc_rdma_component_register
  4. ompi_osc_rdma_component_init
  5. ompi_osc_rdma_component_finalize
  6. ompi_osc_rdma_component_query
  7. ompi_osc_rdma_initialize_region
  8. allocate_state_single
  9. allocate_state_shared
  10. ompi_osc_rdma_query_mtls
  11. ompi_osc_rdma_query_btls
  12. ompi_osc_rdma_share_data
  13. ompi_osc_rdma_create_groups
  14. ompi_osc_rdma_check_parameters
  15. ompi_osc_rdma_component_select
  16. ompi_osc_rdma_set_no_lock_info

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2007 The Trustees of Indiana University.
   4  *                         All rights reserved.
   5  * Copyright (c) 2004-2017 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2007-2018 Los Alamos National Security, LLC.  All rights
  13  *                         reserved.
  14  * Copyright (c) 2006-2008 University of Houston.  All rights reserved.
  15  * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
  16  * Copyright (c) 2012-2015 Sandia National Laboratories.  All rights reserved.
  17  * Copyright (c) 2015      NVIDIA Corporation.  All rights reserved.
  18  * Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
  19  * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
  20  * Copyright (c) 2018      Cisco Systems, Inc.  All rights reserved
  21  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
  22  * Copyright (c) 2019      Research Organization for Information Science
  23  *                         and Technology (RIST).  All rights reserved.
  24  * $COPYRIGHT$
  25  *
  26  * Additional copyrights may follow
  27  *
  28  * $HEADER$
  29  */
  30 
  31 #include "ompi_config.h"
  32 
  33 #include <string.h>
  34 
  35 #include "osc_rdma.h"
  36 #include "osc_rdma_frag.h"
  37 #include "osc_rdma_request.h"
  38 #include "osc_rdma_active_target.h"
  39 #include "osc_rdma_passive_target.h"
  40 #include "osc_rdma_comm.h"
  41 #include "osc_rdma_dynamic.h"
  42 #include "osc_rdma_accumulate.h"
  43 
  44 #include "opal/threads/mutex.h"
  45 #include "opal/util/arch.h"
  46 #include "opal/util/argv.h"
  47 #include "opal/util/printf.h"
  48 #include "opal/align.h"
  49 #if OPAL_CUDA_SUPPORT
  50 #include "opal/datatype/opal_datatype_cuda.h"
  51 #endif /* OPAL_CUDA_SUPPORT */
  52 #include "opal/util/info_subscriber.h"
  53 
  54 #include "ompi/info/info.h"
  55 #include "ompi/communicator/communicator.h"
  56 #include "ompi/mca/osc/osc.h"
  57 #include "ompi/mca/osc/base/base.h"
  58 #include "ompi/mca/osc/base/osc_base_obj_convert.h"
  59 #include "ompi/mca/pml/pml.h"
  60 #include "opal/mca/btl/base/base.h"
  61 #include "opal/mca/base/mca_base_pvar.h"
  62 #include "ompi/mca/bml/base/base.h"
  63 #include "ompi/mca/mtl/base/base.h"
  64 
  65 static int ompi_osc_rdma_component_register (void);
  66 static int ompi_osc_rdma_component_init (bool enable_progress_threads, bool enable_mpi_threads);
  67 static int ompi_osc_rdma_component_finalize (void);
  68 static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, size_t size, int disp_unit,
  69                                           struct ompi_communicator_t *comm, struct opal_info_t *info,
  70                                           int flavor);
  71 static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, size_t size, int disp_unit,
  72                                            struct ompi_communicator_t *comm, struct opal_info_t *info,
  73                                            int flavor, int *model);
  74 #if 0  // stale code?
  75 static int ompi_osc_rdma_set_info (struct ompi_win_t *win, struct opal_info_t *info);
  76 static int ompi_osc_rdma_get_info (struct ompi_win_t *win, struct opal_info_t **info_used);
  77 #endif
  78 static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_base_module_t **btl);
  79 static int ompi_osc_rdma_query_mtls (void);
  80 
  81 static char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, char *key, char *value);
  82 
  83 static char *ompi_osc_rdma_btl_names;
  84 static char *ompi_osc_rdma_mtl_names;
  85 
  86 static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = {
  87     {.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"},
  88     {.value = OMPI_OSC_RDMA_LOCKING_ON_DEMAND, .string = "on_demand"},
  89     {.string = NULL},
  90 };
  91 
  92 ompi_osc_rdma_component_t mca_osc_rdma_component = {
  93     .super = {
  94         .osc_version = {
  95             OMPI_OSC_BASE_VERSION_3_0_0,
  96             .mca_component_name = "rdma",
  97             MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
  98                                   OMPI_RELEASE_VERSION),
  99             .mca_register_component_params = ompi_osc_rdma_component_register
 100         },
 101         .osc_data = {
 102             /* The component is not checkpoint ready */
 103             MCA_BASE_METADATA_PARAM_NONE
 104         },
 105         .osc_init = ompi_osc_rdma_component_init,
 106         .osc_query = ompi_osc_rdma_component_query,
 107         .osc_select = ompi_osc_rdma_component_select,
 108         .osc_finalize = ompi_osc_rdma_component_finalize
 109     }
 110 };
 111 
 112 ompi_osc_base_module_t ompi_osc_rdma_module_rdma_template = {
 113     .osc_win_attach = ompi_osc_rdma_attach,
 114     .osc_win_detach  = ompi_osc_rdma_detach,
 115     .osc_free = ompi_osc_rdma_free,
 116 
 117     .osc_put = ompi_osc_rdma_put,
 118     .osc_get = ompi_osc_rdma_get,
 119     .osc_accumulate = ompi_osc_rdma_accumulate,
 120     .osc_compare_and_swap = ompi_osc_rdma_compare_and_swap,
 121     .osc_fetch_and_op = ompi_osc_rdma_fetch_and_op,
 122     .osc_get_accumulate = ompi_osc_rdma_get_accumulate,
 123 
 124     .osc_rput = ompi_osc_rdma_rput,
 125     .osc_rget = ompi_osc_rdma_rget,
 126     .osc_raccumulate = ompi_osc_rdma_raccumulate,
 127     .osc_rget_accumulate = ompi_osc_rdma_rget_accumulate,
 128 
 129     .osc_fence = ompi_osc_rdma_fence_atomic,
 130 
 131     .osc_start = ompi_osc_rdma_start_atomic,
 132     .osc_complete = ompi_osc_rdma_complete_atomic,
 133     .osc_post = ompi_osc_rdma_post_atomic,
 134     .osc_wait = ompi_osc_rdma_wait_atomic,
 135     .osc_test = ompi_osc_rdma_test_atomic,
 136 
 137     .osc_lock = ompi_osc_rdma_lock_atomic,
 138     .osc_unlock = ompi_osc_rdma_unlock_atomic,
 139     .osc_lock_all = ompi_osc_rdma_lock_all_atomic,
 140     .osc_unlock_all = ompi_osc_rdma_unlock_all_atomic,
 141 
 142     .osc_sync = ompi_osc_rdma_sync,
 143     .osc_flush = ompi_osc_rdma_flush,
 144     .osc_flush_all = ompi_osc_rdma_flush_all,
 145     .osc_flush_local = ompi_osc_rdma_flush_local,
 146     .osc_flush_local_all = ompi_osc_rdma_flush_local_all,
 147 };
 148 
 149 /* look up parameters for configuring this window.  The code first
 150    looks in the info structure passed by the user, then it checks
 151    for a matching MCA variable. */
 152 static bool check_config_value_bool (char *key, opal_info_t *info)
 153 {
 154     int ret, flag, param;
 155     bool result = false;
 156     const bool *flag_value = &result;
 157 
 158     ret = opal_info_get_bool (info, key, &result, &flag);
 159     if (OMPI_SUCCESS == ret && flag) {
 160         return result;
 161     }
 162 
 163     param = mca_base_var_find("ompi", "osc", "rdma", key);
 164     if (0 <= param) {
 165         (void) mca_base_var_get_value(param, &flag_value, NULL, NULL);
 166     }
 167 
 168     return flag_value[0];
 169 }
 170 
 171 static int ompi_osc_rdma_pvar_read (const struct mca_base_pvar_t *pvar, void *value, void *obj)
 172 {
 173     ompi_win_t *win = (ompi_win_t *) obj;
 174     ompi_osc_rdma_module_t *module = GET_MODULE(win);
 175     int offset = (int) (intptr_t) pvar->ctx;
 176 
 177     memcpy (value, (char *) module + offset, sizeof (unsigned long));
 178 
 179     return OMPI_SUCCESS;
 180 }
 181 
 182 static int ompi_osc_rdma_component_register (void)
 183 {
 184     char *description_str;
 185     mca_base_var_enum_t *new_enum;
 186 
 187     mca_osc_rdma_component.no_locks = false;
 188     opal_asprintf(&description_str, "Enable optimizations available only if MPI_LOCK is "
 189              "not used. Info key of same name overrides this value (default: %s)",
 190              mca_osc_rdma_component.no_locks  ? "true" : "false");
 191     (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "no_locks", description_str,
 192                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
 193                                            MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.no_locks);
 194     free(description_str);
 195 
 196     mca_osc_rdma_component.acc_single_intrinsic = false;
 197     opal_asprintf(&description_str, "Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes "
 198              "that will not use anything more than a single predefined datatype (default: %s)",
 199              mca_osc_rdma_component.acc_single_intrinsic  ? "true" : "false");
 200     (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_single_intrinsic",
 201                                            description_str, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
 202                                            MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_single_intrinsic);
 203     free(description_str);
 204 
 205     mca_osc_rdma_component.acc_use_amo = true;
 206     opal_asprintf(&description_str, "Enable the use of network atomic memory operations when using single "
 207              "intrinsic optimizations. If not set network compare-and-swap will be "
 208              "used instread (default: %s)", mca_osc_rdma_component.acc_use_amo ? "true" : "false");
 209     (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_use_amo", description_str,
 210                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_GROUP,
 211                                            &mca_osc_rdma_component.acc_use_amo);
 212     free(description_str);
 213 
 214     mca_osc_rdma_component.buffer_size = 32768;
 215     opal_asprintf(&description_str, "Size of temporary buffers (default: %d)", mca_osc_rdma_component.buffer_size);
 216     (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size", description_str,
 217                                             MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
 218                                             MCA_BASE_VAR_SCOPE_LOCAL, &mca_osc_rdma_component.buffer_size);
 219     free(description_str);
 220 
 221     mca_osc_rdma_component.max_attach = 32;
 222     opal_asprintf(&description_str, "Maximum number of buffers that can be attached to a dynamic window. "
 223              "Keep in mind that each attached buffer will use a potentially limited "
 224              "resource (default: %d)", mca_osc_rdma_component.max_attach);
 225    (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach", description_str,
 226                                            MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
 227                                            MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach);
 228     free(description_str);
 229 
 230     mca_osc_rdma_component.priority = 101;
 231     opal_asprintf(&description_str, "Priority of the osc/rdma component (default: %d)",
 232              mca_osc_rdma_component.priority);
 233     (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "priority", description_str,
 234                                             MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
 235                                             MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.priority);
 236     free(description_str);
 237 
 238     (void) mca_base_var_enum_create ("osc_rdma_locking_mode", ompi_osc_rdma_locking_modes, &new_enum);
 239 
 240     mca_osc_rdma_component.locking_mode = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL;
 241     (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "locking_mode",
 242                                             "Locking mode to use for passive-target synchronization (default: two_level)",
 243                                             MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, OPAL_INFO_LVL_3,
 244                                             MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.locking_mode);
 245     OBJ_RELEASE(new_enum);
 246 
 247     ompi_osc_rdma_btl_names = "openib,ugni,uct,ucp";
 248     opal_asprintf(&description_str, "Comma-delimited list of BTL component names to allow without verifying "
 249              "connectivity. Do not add a BTL to to this list unless it can reach all "
 250              "processes in any communicator used with an MPI window (default: %s)",
 251              ompi_osc_rdma_btl_names);
 252     (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "btls", description_str,
 253                                             MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
 254                                             MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_names);
 255     free(description_str);
 256 
 257     ompi_osc_rdma_mtl_names = "psm2";
 258     opal_asprintf(&description_str, "Comma-delimited list of MTL component names to lower the priority of rdma "
 259              "osc component favoring pt2pt osc (default: %s)", ompi_osc_rdma_mtl_names);
 260     (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "mtls", description_str,
 261                                             MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
 262                                             MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_mtl_names);
 263     free(description_str);
 264 
 265     if (0 == access ("/dev/shm", W_OK)) {
 266         mca_osc_rdma_component.backing_directory = "/dev/shm";
 267     } else {
 268         mca_osc_rdma_component.backing_directory = ompi_process_info.proc_session_dir;
 269     }
 270 
 271     (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "backing_directory",
 272                                             "Directory to place backing files for memory windows. "
 273                                             "This directory should be on a local filesystem such as /tmp or "
 274                                             "/dev/shm (default: (linux) /dev/shm, (others) session directory)",
 275                                             MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
 276                                             MCA_BASE_VAR_SCOPE_READONLY, &mca_osc_rdma_component.backing_directory);
 277 
 278     /* register performance variables */
 279 
 280     (void) mca_base_component_pvar_register (&mca_osc_rdma_component.super.osc_version, "put_retry_count",
 281                                              "Number of times put transaction were retried due to resource limitations",
 282                                              OPAL_INFO_LVL_4, MCA_BASE_PVAR_CLASS_COUNTER, MCA_BASE_VAR_TYPE_UNSIGNED_LONG,
 283                                              NULL, MCA_BASE_VAR_BIND_MPI_WIN, MCA_BASE_PVAR_FLAG_CONTINUOUS,
 284                                              ompi_osc_rdma_pvar_read, NULL, NULL,
 285                                              (void *) (intptr_t) offsetof (ompi_osc_rdma_module_t, put_retry_count));
 286 
 287     (void) mca_base_component_pvar_register (&mca_osc_rdma_component.super.osc_version, "get_retry_count",
 288                                              "Number of times get transaction were retried due to resource limitations",
 289                                              OPAL_INFO_LVL_4, MCA_BASE_PVAR_CLASS_COUNTER, MCA_BASE_VAR_TYPE_UNSIGNED_LONG,
 290                                              NULL, MCA_BASE_VAR_BIND_MPI_WIN, MCA_BASE_PVAR_FLAG_CONTINUOUS,
 291                                              ompi_osc_rdma_pvar_read, NULL, NULL,
 292                                              (void *) (intptr_t) offsetof (ompi_osc_rdma_module_t, get_retry_count));
 293 
 294     return OMPI_SUCCESS;
 295 }
 296 
 297 static int ompi_osc_rdma_component_init (bool enable_progress_threads,
 298                                          bool enable_mpi_threads)
 299 {
 300     int ret;
 301 
 302     OBJ_CONSTRUCT(&mca_osc_rdma_component.lock, opal_mutex_t);
 303     OBJ_CONSTRUCT(&mca_osc_rdma_component.request_gc, opal_list_t);
 304     OBJ_CONSTRUCT(&mca_osc_rdma_component.buffer_gc, opal_list_t);
 305     OBJ_CONSTRUCT(&mca_osc_rdma_component.modules, opal_hash_table_t);
 306 
 307     opal_hash_table_init(&mca_osc_rdma_component.modules, 2);
 308 
 309     OBJ_CONSTRUCT(&mca_osc_rdma_component.frags, opal_free_list_t);
 310     ret = opal_free_list_init (&mca_osc_rdma_component.frags,
 311                                sizeof(ompi_osc_rdma_frag_t), 8,
 312                                OBJ_CLASS(ompi_osc_rdma_frag_t),
 313                                mca_osc_rdma_component.buffer_size, 8,
 314                                4, -1, 4, NULL, 0, NULL, NULL, NULL);
 315     if (OPAL_SUCCESS != ret) {
 316         opal_output_verbose(1, ompi_osc_base_framework.framework_output,
 317                             "%s:%d: opal_free_list_init_new failed: %d",
 318                             __FILE__, __LINE__, ret);
 319         return ret;
 320     }
 321 
 322     OBJ_CONSTRUCT(&mca_osc_rdma_component.requests, opal_free_list_t);
 323     ret = opal_free_list_init (&mca_osc_rdma_component.requests,
 324                                sizeof(ompi_osc_rdma_request_t), 8,
 325                                OBJ_CLASS(ompi_osc_rdma_request_t), 0, 0,
 326                                0, -1, 32, NULL, 0, NULL, NULL, NULL);
 327     if (OPAL_SUCCESS != ret) {
 328         opal_output_verbose(1, ompi_osc_base_framework.framework_output,
 329                             "%s:%d: opal_free_list_init failed: %d\n",
 330                             __FILE__, __LINE__, ret);
 331     }
 332 
 333     return ret;
 334 }
 335 
 336 
 337 int ompi_osc_rdma_component_finalize (void)
 338 {
 339     size_t num_modules;
 340 
 341     if (0 != (num_modules = opal_hash_table_get_size(&mca_osc_rdma_component.modules))) {
 342         opal_output(ompi_osc_base_framework.framework_output, "WARNING: There were %d Windows created but "
 343                     "not freed.", (int) num_modules);
 344     }
 345 
 346     OBJ_DESTRUCT(&mca_osc_rdma_component.frags);
 347     OBJ_DESTRUCT(&mca_osc_rdma_component.modules);
 348     OBJ_DESTRUCT(&mca_osc_rdma_component.lock);
 349     OBJ_DESTRUCT(&mca_osc_rdma_component.requests);
 350     OBJ_DESTRUCT(&mca_osc_rdma_component.request_gc);
 351     OBJ_DESTRUCT(&mca_osc_rdma_component.buffer_gc);
 352 
 353     return OMPI_SUCCESS;
 354 }
 355 
 356 
 357 static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, size_t size, int disp_unit,
 358                                           struct ompi_communicator_t *comm, struct opal_info_t *info,
 359                                           int flavor)
 360 {
 361 
 362     if (MPI_WIN_FLAVOR_SHARED == flavor) {
 363         return -1;
 364     }
 365 
 366 #if OPAL_CUDA_SUPPORT
 367     /* GPU buffers are not supported by the rdma component */
 368     if (MPI_WIN_FLAVOR_CREATE == flavor) {
 369         if (opal_cuda_check_bufs(*base, NULL)) {
 370             return -1;
 371         }
 372     }
 373 #endif /* OPAL_CUDA_SUPPORT */
 374 
 375     if (OMPI_SUCCESS == ompi_osc_rdma_query_mtls ()) {
 376         return 5; /* this has to be lower that osc pt2pt default priority */
 377     }
 378 
 379     if (OMPI_SUCCESS != ompi_osc_rdma_query_btls (comm, NULL)) {
 380         return -1;
 381     }
 382 
 383 
 384     return mca_osc_rdma_component.priority;
 385 }
 386 
 387 static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void **base, size_t size) {
 388     ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *) module->state->regions;
 389     int ret;
 390 
 391     /* store displacement unit */
 392     module->state->disp_unit = module->disp_unit;
 393 
 394     /* store region info */
 395     module->state->region_count = 1;
 396     region->base = (osc_rdma_base_t) (intptr_t) *base;
 397     region->len = size;
 398 
 399     if (module->selected_btl->btl_register_mem && size) {
 400         if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor || NULL == module->state_handle) {
 401             ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, *base, size, MCA_BTL_REG_FLAG_ACCESS_ANY,
 402                                           &module->base_handle);
 403             if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 404                 return OMPI_ERR_OUT_OF_RESOURCE;
 405             }
 406 
 407             memcpy (region->btl_handle_data, module->base_handle, module->selected_btl->btl_registration_handle_size);
 408         } else {
 409             memcpy (region->btl_handle_data, module->state_handle, module->selected_btl->btl_registration_handle_size);
 410         }
 411     }
 412 
 413     return OMPI_SUCCESS;
 414 }
 415 
/* Allocate the window state privately for this process (used when no other
 * window peers share the node).  The rank array, window state, leader peer
 * data, and — for MPI_Win_allocate — the window memory itself are placed in
 * one calloc'd segment so a single BTL registration covers everything that
 * is accessed remotely. */
static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size)
{
    size_t total_size, local_rank_array_size, leader_peer_data_size;
    ompi_osc_rdma_peer_t *my_peer;
    int ret, my_rank;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating private internal state");

    my_rank = ompi_comm_rank (module->comm);

    local_rank_array_size = sizeof (ompi_osc_rdma_rank_data_t) * RANK_ARRAY_COUNT(module);
    leader_peer_data_size = module->region_size * module->node_count;

    /* allocate anything that will be accessed remotely in the same region. this cuts down on the number of
     * registration handles needed to access this data. */
    total_size = local_rank_array_size + module->region_size +
        module->state_size + leader_peer_data_size;

    if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
        /* the window memory itself is carved out of the same segment */
        total_size += size;
    }

    /* the local data is ordered as follows: rank array (leader, offset mapping), state, leader peer data, and base
     * (if using MPI_Win_allocate). In this case the leader peer data array does not need to be stored in the same
     * segment but placing it there simplifies the peer data fetch and cleanup code. */

    module->rank_array = calloc (total_size, 1);
    if (OPAL_UNLIKELY(NULL == module->rank_array)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

// Note, the extra module->region_size space added after local_rank_array_size
// is unused but is there to match what happens in allocate_state_shared()
// This allows module->state_offset to be uniform across the ranks which
// is part of how they pull peer info from each other.
    module->state_offset = local_rank_array_size + module->region_size;

    module->state = (ompi_osc_rdma_state_t *) ((intptr_t) module->rank_array + module->state_offset);
    module->node_comm_info = (unsigned char *) ((intptr_t) module->state + module->state_size);

    if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
        /* the window memory follows the leader peer data in this segment */
        *base = (void *) ((intptr_t) module->node_comm_info + leader_peer_data_size);
    }

    /* just go ahead and register the whole segment */
    ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->rank_array, total_size,
                                  MCA_BTL_REG_FLAG_ACCESS_ANY, &module->state_handle);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
        /* publish the region (base, length, registration handle) for peers */
        ret = ompi_osc_rdma_initialize_region (module, base, size);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }
    }

    /* create the peer object describing this process itself */
    ret = ompi_osc_rdma_new_peer (module, my_rank, &my_peer);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    ret = ompi_osc_module_add_peer (module, my_peer);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
        OBJ_RELEASE(my_peer);
        return ret;
    }

    module->my_peer = my_peer;
    /* the whole segment is freed via free_after when the module is destroyed */
    module->free_after = module->rank_array;
    my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
    my_peer->state = (uint64_t) (uintptr_t) module->state;

    if (module->use_cpu_atomics) {
        /* all peers are local or it is safe to mix cpu and nic atomics */
        my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE;
    } else {
        /* use my endpoint handle to modify the peer's state */
        my_peer->state_handle = module->state_handle;
        my_peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, my_rank);
    }

    if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
        ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) my_peer;

        ex_peer->super.base = (intptr_t) *base;

        if (!module->same_size) {
            /* per-peer sizes are only tracked when windows may differ in size */
            ex_peer->size = size;
        }

        if (!module->use_cpu_atomics) {
            if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
                /* the base lives inside the state segment; reuse its registration handle */
                ex_peer->super.base_handle = module->state_handle;
            } else {
                ex_peer->super.base_handle = module->base_handle;
            }
        }
    }

    return OMPI_SUCCESS;
}
 520 
/* Per-process record exchanged during shared-state setup: the process's rank
 * in the window communicator and the size of its local window segment. */
struct _local_data {
    int    rank;   /* rank in module->comm */
    size_t size;   /* size of this process's window memory */
};
 525 
 526 static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, size_t size)
 527 {
 528     ompi_communicator_t *shared_comm;
 529     unsigned long offset, total_size;
 530     unsigned long state_base, data_base;
 531     int local_rank, local_size, ret;
 532     size_t local_rank_array_size, leader_peer_data_size, my_base_offset = 0;
 533     int my_rank = ompi_comm_rank (module->comm);
 534     int global_size = ompi_comm_size (module->comm);
 535     ompi_osc_rdma_region_t *state_region;
 536     struct _local_data *temp;
 537     char *data_file;
 538 
 539     shared_comm = module->shared_comm;
 540 
 541     local_rank = ompi_comm_rank (shared_comm);
 542     local_size = ompi_comm_size (shared_comm);
 543 
 544     /* CPU atomics can be used if every process is on the same node or the NIC allows mixing CPU and NIC atomics */
 545     module->use_cpu_atomics = local_size == global_size || (module->selected_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB);
 546 
 547     if (1 == local_size) {
 548         /* no point using a shared segment if there are no other processes on this node */
 549         return allocate_state_single (module, base, size);
 550     }
 551 
 552     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating shared internal state");
 553 
 554     local_rank_array_size = sizeof (ompi_osc_rdma_rank_data_t) * RANK_ARRAY_COUNT (module);
 555     leader_peer_data_size = module->region_size * module->node_count;
 556 
 557     /* calculate base offsets */
 558     module->state_offset = state_base = local_rank_array_size + module->region_size;
 559     data_base = state_base + leader_peer_data_size + module->state_size * local_size;
 560 
 561     do {
 562         temp = calloc (local_size, sizeof (temp[0]));
 563         if (NULL == temp) {
 564             ret = OMPI_ERR_OUT_OF_RESOURCE;
 565             break;
 566         }
 567 
 568         temp[local_rank].rank = my_rank;
 569         temp[local_rank].size = size;
 570 
 571         /* gather the local sizes and ranks */
 572         ret = shared_comm->c_coll->coll_allgather (MPI_IN_PLACE, sizeof (*temp), MPI_BYTE, temp, sizeof (*temp),
 573                                                   MPI_BYTE, shared_comm, shared_comm->c_coll->coll_allgather_module);
 574         if (OMPI_SUCCESS != ret) {
 575             break;
 576         }
 577 
 578         total_size = data_base;
 579 
 580         if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
 581             for (int i = 0 ; i < local_size ; ++i) {
 582                 if (local_rank == i) {
 583                     my_base_offset = total_size;
 584                 }
 585                 total_size += temp[i].size;
 586             }
 587         }
 588 
 589         if (0 == local_rank) {
 590             /* allocate the shared memory segment */
 591             ret = opal_asprintf (&data_file, "%s" OPAL_PATH_SEP "osc_rdma.%s.%x.%d",
 592                             mca_osc_rdma_component.backing_directory, ompi_process_info.nodename,
 593                             OMPI_PROC_MY_NAME->jobid, ompi_comm_get_cid(module->comm));
 594             if (0 > ret) {
 595                 ret = OMPI_ERR_OUT_OF_RESOURCE;
 596                 break;
 597             }
 598 
 599             /* allocate enough space for the state + data for all local ranks */
 600             ret = opal_shmem_segment_create (&module->seg_ds, data_file, total_size);
 601             free (data_file);
 602             if (OPAL_SUCCESS != ret) {
 603                 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create shared memory segment");
 604                 break;
 605             }
 606         }
 607 
 608         ret = module->comm->c_coll->coll_bcast (&module->seg_ds, sizeof (module->seg_ds), MPI_BYTE, 0,
 609                                                shared_comm, shared_comm->c_coll->coll_bcast_module);
 610         if (OMPI_SUCCESS != ret) {
 611             break;
 612         }
 613 
 614         module->segment_base = opal_shmem_segment_attach (&module->seg_ds);
 615         if (NULL == module->segment_base) {
 616             OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to attach to the shared memory segment");
 617             ret = OPAL_ERROR;
 618             break;
 619         }
 620 
 621         if (size && MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
 622             *base = (void *)((intptr_t) module->segment_base + my_base_offset);
 623             memset (*base, 0, size);
 624         }
 625 
 626         module->rank_array = (ompi_osc_rdma_rank_data_t *) module->segment_base;
 627         /* put local state region data after the rank array */
 628         state_region = (ompi_osc_rdma_region_t *) ((uintptr_t) module->segment_base + local_rank_array_size);
 629         module->state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * local_rank);
 630 
 631         /* all local ranks share the array containing the peer data of leader ranks */
 632         module->node_comm_info = (unsigned char *) ((uintptr_t) module->segment_base + state_base + module->state_size * local_size);
 633 
 634         /* initialize my state */
 635         memset (module->state, 0, module->state_size);
 636 
 637         /* barrier to make sure all ranks have attached and initialized */
 638         shared_comm->c_coll->coll_barrier(shared_comm, shared_comm->c_coll->coll_barrier_module);
 639 
 640         if (0 == local_rank) {
 641             /* unlink the shared memory backing file */
 642             opal_shmem_unlink (&module->seg_ds);
 643             /* just go ahead and register the whole segment */
 644             ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, MCA_BTL_REG_FLAG_ACCESS_ANY,
 645                                           &module->state_handle);
 646             if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 647                 break;
 648             }
 649 
 650             state_region->base = (intptr_t) module->segment_base;
 651             if (module->state_handle) {
 652                 memcpy (state_region->btl_handle_data, module->state_handle, module->selected_btl->btl_registration_handle_size);
 653             }
 654         }
 655 
 656         /* barrier to make sure memory is registered */
 657         shared_comm->c_coll->coll_barrier(shared_comm, shared_comm->c_coll->coll_barrier_module);
 658 
 659         if (MPI_WIN_FLAVOR_CREATE == module->flavor) {
 660             ret = ompi_osc_rdma_initialize_region (module, base, size);
 661             if (OMPI_SUCCESS != ret) {
 662                 break;
 663             }
 664         }
 665 
 666         if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
 667             ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *) module->state->regions;
 668             module->state->disp_unit = module->disp_unit;
 669             module->state->region_count = 1;
 670             region->base = state_region->base + my_base_offset;
 671             region->len = size;
 672             if (module->selected_btl->btl_register_mem) {
 673                 memcpy (region->btl_handle_data, state_region->btl_handle_data, module->selected_btl->btl_registration_handle_size);
 674             }
 675         }
 676 
 677         /* barrier to make sure all ranks have set up their region data */
 678         shared_comm->c_coll->coll_barrier(shared_comm, shared_comm->c_coll->coll_barrier_module);
 679 
 680         offset = data_base;
 681         for (int i = 0 ; i < local_size ; ++i) {
 682             /* local pointer to peer's state */
 683             ompi_osc_rdma_state_t *peer_state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * i);
 684             ompi_osc_rdma_region_t *peer_region = (ompi_osc_rdma_region_t *) peer_state->regions;
 685             ompi_osc_rdma_peer_extended_t *ex_peer;
 686             ompi_osc_rdma_peer_t *peer;
 687             int peer_rank = temp[i].rank;
 688 
 689             ret = ompi_osc_rdma_new_peer (module, peer_rank, &peer);
 690             if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 691                 break;
 692             }
 693 
 694             ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
 695 
 696             /* set up peer state */
 697             if (module->use_cpu_atomics) {
 698                 /* all peers are local or it is safe to mix cpu and nic atomics */
 699                 peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE;
 700                 peer->state = (osc_rdma_counter_t) peer_state;
 701                 peer->state_endpoint = NULL;
 702             } else {
 703                 /* use my endpoint handle to modify the peer's state */
 704                 if (module->selected_btl->btl_register_mem) {
 705                     peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
 706                 }
 707                 peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
 708                 peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, temp[0].rank);
 709             }
 710 
 711             if (my_rank == peer_rank) {
 712                 module->my_peer = peer;
 713             }
 714 
 715             if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor || MPI_WIN_FLAVOR_CREATE == module->flavor) {
 716                 /* use the peer's BTL endpoint directly */
 717                 peer->data_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_rank);
 718             } else if (!module->use_cpu_atomics && temp[i].size) {
 719                 /* use the local leader's endpoint */
 720                 peer->data_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, temp[0].rank);
 721             }
 722 
 723             ompi_osc_module_add_peer (module, peer);
 724 
 725             if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
 726                 if (module->use_cpu_atomics && peer_rank == my_rank) {
 727                     peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
 728                 }
 729                 /* nothing more to do */
 730                 continue;
 731             } else if (0 == temp[i].size) {
 732                 /* nothing more to do */
 733                 continue;
 734             }
 735 
 736             /* finish setting up the local peer structure for win allocate/create */
 737             if (!(module->same_disp_unit && module->same_size)) {
 738                 ex_peer->disp_unit = peer_state->disp_unit;
 739                 ex_peer->size = temp[i].size;
 740             }
 741 
 742             if (module->use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) {
 743                 /* base is local and cpu atomics are available */
 744                 if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
 745                     ex_peer->super.base = (uintptr_t) module->segment_base + offset;
 746                 } else {
 747                     ex_peer->super.base = (uintptr_t) *base;
 748                 }
 749 
 750                 peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
 751                 offset += temp[i].size;
 752             } else {
 753                 ex_peer->super.base = peer_region->base;
 754 
 755                 if (module->selected_btl->btl_register_mem) {
 756                     ex_peer->super.base_handle = (mca_btl_base_registration_handle_t *) peer_region->btl_handle_data;
 757                 }
 758             }
 759         }
 760     } while (0);
 761 
 762     free (temp);
 763 
 764     return ret;
 765 }
 766 
 767 static int ompi_osc_rdma_query_mtls (void)
 768 {
 769     char **mtls_to_use;
 770 
 771     mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names, ',');
 772     if (mtls_to_use && ompi_mtl_base_selected_component) {
 773         for (int i = 0 ; mtls_to_use[i] ; ++i) {
 774             if (0 == strcmp (mtls_to_use[i], ompi_mtl_base_selected_component->mtl_version.mca_component_name)) {
 775                 opal_argv_free(mtls_to_use);
 776                 return OMPI_SUCCESS;
 777             }
 778         }
 779     }
 780     opal_argv_free(mtls_to_use);
 781     return -1;
 782 }
 783 
/**
 * @brief select a BTL module to use for RDMA and atomic operations
 *
 * @param[in]  comm  communicator spanning all ranks in the window
 * @param[out] btl   selected BTL module, or NULL if none qualifies
 *                   (ignored when btl itself is NULL)
 *
 * @return OMPI_SUCCESS if a suitable BTL was found, OMPI_ERR_UNREACH if a
 *         peer has no BML endpoint, OMPI_ERR_OUT_OF_RESOURCE on allocation
 *         failure, or OMPI_ERR_NOT_AVAILABLE if no BTL qualifies
 *
 * Selection happens in two phases. Phase 1 scans the explicitly allowed BTL
 * list (ompi_osc_rdma_btl_names) and picks the lowest-latency initialized
 * BTL that supports RDMA plus atomic ops. If that yields nothing, phase 2
 * walks every peer's RDMA-capable endpoints, counts how many peers each BTL
 * can reach, and picks the lowest-latency BTL that reaches all of them.
 */
static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_base_module_t **btl)
{
    struct mca_btl_base_module_t **possible_btls = NULL;
    int comm_size = ompi_comm_size (comm);
    int rc = OMPI_SUCCESS, max_btls = 0;
    unsigned int selected_latency = INT_MAX;
    struct mca_btl_base_module_t *selected_btl = NULL;
    mca_btl_base_selected_module_t *item;
    int *btl_counts = NULL;   /* btl_counts[j] = number of peers reachable via possible_btls[j] */
    char **btls_to_use;
    void *tmp;

    /* phase 1: try the explicitly configured BTL list first */
    btls_to_use = opal_argv_split (ompi_osc_rdma_btl_names, ',');
    if (btls_to_use) {
        /* rdma and atomics are only supported with BTLs at the moment */
        OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) {
            for (int i = 0 ; btls_to_use[i] ; ++i) {
                if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) {
                    continue;
                }

                /* require RDMA support plus either fetching or non-fetching atomics;
                 * among qualifying BTLs keep the one with the lowest latency */
                if ((item->btl_module->btl_flags & (MCA_BTL_FLAGS_RDMA)) == MCA_BTL_FLAGS_RDMA &&
                    (item->btl_module->btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS))) {
                    if (!selected_btl || item->btl_module->btl_latency < selected_btl->btl_latency) {
                        selected_btl = item->btl_module;
                    }
                }
            }
        }

        opal_argv_free (btls_to_use);
    }

    if (btl) {
        *btl = selected_btl;
    }

    if (NULL != selected_btl) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "selected btl: %s",
                         selected_btl->btl_component->btl_version.mca_component_name);
        return OMPI_SUCCESS;
    }

    /* phase 2: no configured BTL qualified. count, per candidate BTL, how many
     * peers it can reach and later require full coverage of the communicator */
    for (int i = 0 ; i < comm_size ; ++i) {
        ompi_proc_t *proc = ompi_comm_peer_lookup (comm, i);
        mca_bml_base_endpoint_t *endpoint;
        int num_btls, prev_max;

        endpoint = mca_bml_base_get_endpoint (proc);
        if (NULL == endpoint) {
            /* can't continue if some peer is unreachable */
            rc = OMPI_ERR_UNREACH;
            break;
        }

        num_btls = mca_bml_base_btl_array_get_size (&endpoint->btl_rdma);
        if (0 == num_btls) {
            rc = OMPI_ERR_NOT_AVAILABLE;
            /* at least one rank doesn't have an RDMA capable btl */
            break;
        }

        prev_max = max_btls;

        /* grow the candidate arrays to hold this peer's BTL count if needed */
        max_btls = (max_btls > num_btls) ? max_btls : num_btls;

        tmp = realloc (possible_btls, sizeof (void *) * max_btls);
        if (NULL == tmp) {
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            break;
        }
        possible_btls = tmp;

        /* NULL-fill the newly added slots; NULL marks the first free slot below */
        for (int j = prev_max ; j < max_btls ; ++j) {
            possible_btls[j] = NULL;
        }

        tmp = realloc (btl_counts, sizeof (int) * max_btls);
        if (NULL == tmp) {
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            break;
        }
        btl_counts = tmp;

        for (int i_btl = 0 ; i_btl < num_btls ; ++i_btl) {
            /* for this implementation we need only compare-and-swap and fetch-and-add */
            if ((endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) ==
                (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS) && (endpoint->btl_rdma.bml_btls[i_btl].btl->btl_atomic_flags &
                                                                     MCA_BTL_ATOMIC_SUPPORTS_ADD)) {
                /* either bump the count of an already-seen BTL or claim the
                 * first free (NULL) slot for a newly seen one */
                for (int j = 0 ; j < max_btls ; ++j) {
                    if (endpoint->btl_rdma.bml_btls[i_btl].btl == possible_btls[j]) {
                        ++btl_counts[j];
                        break;
                    } else if (NULL == possible_btls[j]) {
                        possible_btls[j] = endpoint->btl_rdma.bml_btls[i_btl].btl;
                        btl_counts[j] = 1;
                        break;
                    }
                }
            }
        }
    }

    if (OMPI_SUCCESS != rc) {
        free (possible_btls);
        free (btl_counts);

        /* no btl = no rdma/atomics */
        return OMPI_ERR_NOT_AVAILABLE;
    }

    /* pick the lowest-latency candidate that can reach every rank */
    for (int i = 0 ; i < max_btls ; ++i) {
        int btl_count = btl_counts[i];

        if (NULL == possible_btls[i]) {
            break;
        }

        if (possible_btls[i]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) {
            /* do not need to use the btl for self communication */
            btl_count++;
        }

        if (btl_count >= comm_size && possible_btls[i]->btl_latency < selected_latency) {
            selected_btl = possible_btls[i];
            selected_latency = possible_btls[i]->btl_latency;
        }
    }

    free (possible_btls);
    free (btl_counts);

    if (btl) {
        *btl = selected_btl;
    }

    if (NULL == selected_btl) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no suitable btls found");
        /* no btl = no rdma/atomics */
        return OMPI_ERR_NOT_AVAILABLE;
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "selected btl: %s",
                     selected_btl->btl_component->btl_version.mca_component_name);

    return OMPI_SUCCESS;
}
 931 
 932 static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module)
 933 {
 934     ompi_osc_rdma_region_t *my_data;
 935     int ret, global_result;
 936     int my_rank = ompi_comm_rank (module->comm);
 937     int comm_size = ompi_comm_size (module->comm);
 938     ompi_osc_rdma_rank_data_t *temp;
 939 
 940     do {
 941         temp = malloc (sizeof (*temp) * comm_size);
 942         if (NULL == temp) {
 943             ret = OMPI_ERR_OUT_OF_RESOURCE;
 944             break;
 945         }
 946 
 947         /* fill in rank -> node translation */
 948         temp[my_rank].node_id = module->node_id;
 949         temp[my_rank].rank = ompi_comm_rank (module->shared_comm);
 950 
 951         ret = module->comm->c_coll->coll_allgather (MPI_IN_PLACE, 1, MPI_2INT, temp, 1, MPI_2INT,
 952                                                    module->comm, module->comm->c_coll->coll_allgather_module);
 953         if (OMPI_SUCCESS != ret) {
 954             break;
 955         }
 956 
 957         if (0 == ompi_comm_rank (module->shared_comm)) {
 958             /* fill in my part of the node array */
 959             my_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + ompi_comm_rank (module->local_leaders) *
 960                                                   module->region_size);
 961 
 962             my_data->base = (uint64_t) (intptr_t) module->rank_array;
 963             /* store my rank in the length field */
 964             my_data->len = (osc_rdma_size_t) my_rank;
 965 
 966             if (module->selected_btl->btl_register_mem) {
 967                 memcpy (my_data->btl_handle_data, module->state_handle, module->selected_btl->btl_registration_handle_size);
 968             }
 969 
 970             /* gather state data at each node leader */
 971             if (ompi_comm_size (module->local_leaders) > 1) {
 972                 ret = module->local_leaders->c_coll->coll_allgather (MPI_IN_PLACE, module->region_size, MPI_BYTE, module->node_comm_info,
 973                                                                     module->region_size, MPI_BYTE, module->local_leaders,
 974                                                                     module->local_leaders->c_coll->coll_allgather_module);
 975                 if (OMPI_SUCCESS != ret) {
 976                     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "leader allgather failed with ompi error code %d", ret);
 977                     break;
 978                 }
 979             }
 980 
 981             int base_rank = ompi_comm_rank (module->local_leaders) * ((comm_size + module->node_count - 1) / module->node_count);
 982 
 983             /* fill in the local part of the rank -> node map */
 984             for (int i = 0 ; i < RANK_ARRAY_COUNT(module) ; ++i) {
 985                 int save_rank = base_rank + i;
 986                 if (save_rank >= comm_size) {
 987                     break;
 988                 }
 989 
 990                 module->rank_array[i] = temp[save_rank];
 991             }
 992         }
 993 
 994         free (temp);
 995     } while (0);
 996 
 997 
 998     ret = module->comm->c_coll->coll_allreduce (&ret, &global_result, 1, MPI_INT, MPI_MIN, module->comm,
 999                                                module->comm->c_coll->coll_allreduce_module);
1000 
1001     if (OMPI_SUCCESS != ret) {
1002         global_result = ret;
1003     }
1004 
1005     /* none of these communicators are needed anymore so free them now*/
1006     if (MPI_COMM_NULL != module->local_leaders) {
1007         ompi_comm_free (&module->local_leaders);
1008     }
1009 
1010     if (MPI_COMM_NULL != module->shared_comm) {
1011         ompi_comm_free (&module->shared_comm);
1012     }
1013 
1014     return global_result;
1015 }
1016 
1017 static int ompi_osc_rdma_create_groups (ompi_osc_rdma_module_t *module)
1018 {
1019     int comm_rank, ret, local_rank;
1020     int values[2] = {0, 0};
1021 
1022     /* create a shared communicator to handle communication about the local segment */
1023     ret = ompi_comm_split_type (module->comm, MPI_COMM_TYPE_SHARED, 0, NULL, &module->shared_comm);
1024     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
1025         OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create a shared memory communicator. error code %d", ret);
1026         return ret;
1027     }
1028 
1029     local_rank = ompi_comm_rank (module->shared_comm);
1030 
1031     comm_rank = ompi_comm_rank (module->comm);
1032 
1033     ret = ompi_comm_split (module->comm, (0 == local_rank) ? 0 : MPI_UNDEFINED, comm_rank, &module->local_leaders,
1034                            false);
1035     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
1036         OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create local leaders communicator. error code %d", ret);
1037         return ret;
1038     }
1039 
1040     if (0 == local_rank) {
1041         values[0] = ompi_comm_size (module->local_leaders);
1042         values[1] = ompi_comm_rank (module->local_leaders);
1043     }
1044 
1045     if (ompi_comm_size (module->shared_comm) > 1) {
1046         ret = module->shared_comm->c_coll->coll_bcast (values, 2, MPI_INT, 0, module->shared_comm,
1047                                                       module->shared_comm->c_coll->coll_bcast_module);
1048         if (OMPI_SUCCESS != ret) {
1049             OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to broadcast local data. error code %d", ret);
1050             return ret;
1051         }
1052     }
1053 
1054     module->node_count = values[0];
1055     module->node_id = values[1];
1056 
1057     return OMPI_SUCCESS;
1058 }
1059 
1060 /**
1061  * @brief check the displacement unit and size against peers
1062  *
1063  * @param[in] module      osc rdma module
1064  * @param[in] disp_unit   the displacement unit for this process
1065  * @param[in] size        the window size for this process
1066  *
1067  * This function checks if all ranks have the same displacement unit or size and sets the appropriate
1068  * flags on the module.
1069  */
1070 static int ompi_osc_rdma_check_parameters (ompi_osc_rdma_module_t *module, int disp_unit, size_t size)
1071 {
1072     long values[4];
1073     int ret;
1074 
1075     if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor || (module->same_size && module->same_disp_unit)) {
1076         /* done */
1077         return OMPI_SUCCESS;
1078     }
1079 
1080     /* check displacements and sizes */
1081     values[0] = disp_unit;
1082     values[1] = -disp_unit;
1083     values[2] = size;
1084     values[3] = -(ssize_t) size;
1085 
1086     ret = module->comm->c_coll->coll_allreduce (MPI_IN_PLACE, values, 4, MPI_LONG, MPI_MIN, module->comm,
1087                                                module->comm->c_coll->coll_allreduce_module);
1088     if (OMPI_SUCCESS != ret) {
1089         return ret;
1090     }
1091 
1092     if (values[0] == -values[1]) {
1093         /* same displacement */
1094         module->same_disp_unit = true;
1095     }
1096 
1097     if (values[2] == -values[3]) {
1098         /* same size */
1099         module->same_size = true;
1100     }
1101 
1102     return OMPI_SUCCESS;
1103 }
1104 
1105 
/**
 * @brief osc component select function: build and initialize an osc/rdma
 *        window module
 *
 * @param[in]  win        window object the module will be attached to
 * @param[in,out] base    window base pointer (set for MPI_WIN_FLAVOR_ALLOCATE)
 * @param[in]  size       size of the local window
 * @param[in]  disp_unit  displacement unit of the local window
 * @param[in]  comm       communicator the window spans
 * @param[in]  info       user-supplied info object (hints)
 * @param[in]  flavor     window flavor (create/allocate/dynamic; shared is rejected)
 * @param[out] model      set to MPI_WIN_UNIFIED on success
 *
 * @return OMPI_SUCCESS on success or an ompi error code on failure
 *
 * On any failure after the module is attached to the window, cleanup goes
 * through ompi_osc_rdma_free(win), which tears down whatever was constructed
 * so far. This function is collective over comm.
 */
static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, size_t size, int disp_unit,
                                           struct ompi_communicator_t *comm, struct opal_info_t *info,
                                           int flavor, int *model)
{
    ompi_osc_rdma_module_t *module = NULL;
    int world_size = ompi_comm_size (comm);
    int init_limit = 256;   /* threshold between flat arrays and hash tables for per-peer data */
    int ret;
    char *name;

    /* the osc/sm component is the exclusive provider for support for shared
     * memory windows */
    if (MPI_WIN_FLAVOR_SHARED == flavor) {
        return OMPI_ERR_NOT_SUPPORTED;
    }

    /* create module structure with all fields initialized to zero */
    module = (ompi_osc_rdma_module_t *) calloc (1, sizeof (ompi_osc_rdma_module_t));
    if (NULL == module) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* initialize the objects, so that always free in cleanup */
    OBJ_CONSTRUCT(&module->lock, opal_recursive_mutex_t);
    OBJ_CONSTRUCT(&module->outstanding_locks, opal_hash_table_t);
    OBJ_CONSTRUCT(&module->pending_posts, opal_list_t);
    OBJ_CONSTRUCT(&module->peer_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&module->all_sync, ompi_osc_rdma_sync_t);

    /* pick up user hints from the info object and component-level settings */
    module->same_disp_unit = check_config_value_bool ("same_disp_unit", info);
    module->same_size      = check_config_value_bool ("same_size", info);
    module->no_locks       = check_config_value_bool ("no_locks", info);
    module->locking_mode   = mca_osc_rdma_component.locking_mode;
    module->acc_single_intrinsic = check_config_value_bool ("acc_single_intrinsic", info);
    module->acc_use_amo = mca_osc_rdma_component.acc_use_amo;

    module->all_sync.module = module;

    module->flavor = flavor;
    module->win = win;
    module->disp_unit = disp_unit;
    module->size = size;

    /* set the module so we properly cleanup */
    win->w_osc_module = (ompi_osc_base_module_t*) module;

    /* outstanding-lock bookkeeping: hash table for large communicators,
     * flat array for small ones */
    if (!module->no_locks) {
        if (world_size > init_limit) {
            ret = opal_hash_table_init (&module->outstanding_locks, init_limit);
            if (OPAL_SUCCESS != ret) {
                ompi_osc_rdma_free (win);
                return ret;
            }
        } else {
            module->outstanding_lock_array = calloc (world_size, sizeof (module->outstanding_lock_array[0]));
            if (NULL == module->outstanding_lock_array) {
                ompi_osc_rdma_free (win);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
        }
    }

    ret = ompi_comm_dup(comm, &module->comm);
    if (OMPI_SUCCESS != ret) {
        ompi_osc_rdma_free (win);
        return ret;
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "creating osc/rdma window of flavor %d with id %d",
                     flavor, ompi_comm_get_cid(module->comm));

    /* peer data: same array-vs-hash split as the lock bookkeeping above.
     * NB: in the array branch ret still holds OMPI_SUCCESS from ompi_comm_dup
     * unless the calloc fails, so the combined check below is correct */
    if (world_size > init_limit) {
        OBJ_CONSTRUCT(&module->peer_hash, opal_hash_table_t);
        ret = opal_hash_table_init (&module->peer_hash, init_limit);
    } else {
        module->peer_array = calloc (world_size, sizeof (ompi_osc_rdma_peer_t *));
        if (NULL == module->peer_array) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
        }
    }

    if (OPAL_SUCCESS != ret) {
        ompi_osc_rdma_free (win);
        return ret;
    }

    /* find rdma capable endpoints */
    ret = ompi_osc_rdma_query_btls (module->comm, &module->selected_btl);
    if (OMPI_SUCCESS != ret) {
        ompi_osc_rdma_free (win);
        return ret;
    }

    /* calculate and store various structure sizes */

    /* a region record is the fixed struct plus the BTL's registration handle */
    module->region_size = module->selected_btl->btl_registration_handle_size + sizeof (ompi_osc_rdma_region_t);

    module->state_size = sizeof (ompi_osc_rdma_state_t);

    /* dynamic windows reserve room for up to max_attach attached regions;
     * other flavors have exactly one region */
    if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
        module->state_size += module->region_size;
    } else {
        module->state_size += mca_osc_rdma_component.max_attach * module->region_size;
    }
/*
 * These are the info's that this module is interested in
 */
    opal_infosubscribe_subscribe(&win->super, "no_locks", "false", ompi_osc_rdma_set_no_lock_info);

/*
 * TODO: same_size, same_disp_unit have w_flag entries, but do not appear
 * to be used anywhere.  If that changes, they should be subscribed
 */

    /* fill in the function pointer part */
    memcpy(&module->super, &ompi_osc_rdma_module_rdma_template, sizeof(module->super));

    /* collectively determine whether all ranks share disp_unit/size */
    ret = ompi_osc_rdma_check_parameters (module, disp_unit, size);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        ompi_osc_rdma_free (win);
        return ret;
    }

    /* set up shared_comm / local_leaders and the node layout */
    ret = ompi_osc_rdma_create_groups (module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        ompi_osc_rdma_free (win);
        return ret;
    }

    /* fill in our part */
    ret = allocate_state_shared (module, base, size);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to allocate internal state");
        ompi_osc_rdma_free (win);
        return ret;
    }

    if (MPI_WIN_FLAVOR_DYNAMIC == flavor) {
        /* allocate space to store local btl handles for attached regions */
        module->dynamic_handles = (ompi_osc_rdma_handle_t *) calloc (mca_osc_rdma_component.max_attach,
                                                                     sizeof (module->dynamic_handles[0]));
        if (NULL == module->dynamic_handles) {
            ompi_osc_rdma_free (win);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }

    /* lock data */
    if (module->no_locks) {
        win->w_flags |= OMPI_WIN_NO_LOCKS;
    }

    if (module->same_size) {
        win->w_flags |= OMPI_WIN_SAME_SIZE;
    }

    if (module->same_disp_unit) {
        win->w_flags |= OMPI_WIN_SAME_DISP;
    }

    /* update component data: register this module under the communicator's cid */
    OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
    ret = opal_hash_table_set_value_uint32(&mca_osc_rdma_component.modules,
                                           ompi_comm_get_cid(module->comm),
                                           module);
    OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
    if (OMPI_SUCCESS != ret) {
        ompi_osc_rdma_free (win);
        return ret;
    }

    /* fill in window information */
    *model = MPI_WIN_UNIFIED;
    win->w_osc_module = (ompi_osc_base_module_t*) module;
    opal_asprintf(&name, "rdma window %d", ompi_comm_get_cid(module->comm));
    ompi_win_set_name(win, name);
    free(name);

    /* sync memory - make sure all initialization completed */
    opal_atomic_mb();

    /* exchange rank/node data with all peers; this also frees the
     * shared_comm and local_leaders communicators */
    ret = ompi_osc_rdma_share_data (module);
    if (OMPI_SUCCESS != ret) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to share window data with peers");
        ompi_osc_rdma_free (win);
    } else {
        /* for now the leader is always rank 0 in the communicator */
        module->leader = ompi_osc_rdma_module_peer (module, 0);

        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "finished creating osc/rdma window with id %d",
                         ompi_comm_get_cid(module->comm));
    }

    return ret;
}
1302 
1303 
1304 static char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, char *key, char *value)
1305 {
1306 
1307     struct ompi_win_t *win = (struct ompi_win_t*) obj;
1308     ompi_osc_rdma_module_t *module = GET_MODULE(win);
1309     bool temp;
1310 
1311     temp = opal_str_to_bool(value);
1312     if (temp && !module->no_locks) {
1313         /* clean up the lock hash. it is up to the user to ensure no lock is
1314          * outstanding from this process when setting the info key */
1315         OBJ_DESTRUCT(&module->outstanding_locks);
1316         OBJ_CONSTRUCT(&module->outstanding_locks, opal_hash_table_t);
1317 
1318         module->no_locks = true;
1319     } else if (!temp && module->no_locks) {
1320         int world_size = ompi_comm_size (module->comm);
1321         int init_limit = world_size > 256 ? 256 : world_size;
1322         int ret;
1323 
1324         ret = opal_hash_table_init (&module->outstanding_locks, init_limit);
1325         if (OPAL_SUCCESS != ret) {
1326             module->no_locks = true;
1327         }
1328 
1329         module->no_locks = false;
1330     }
1331     /* enforce collectiveness... */
1332     module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module);
1333 /*
1334  * Accept any value
1335  */
1336     return module->no_locks ? "true" : "false";
1337 }

/* [<][>][^][v][top][bottom][index][help] */