root/ompi/mca/osc/rdma/osc_rdma_peer.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. ompi_osc_rdma_peer_btl_endpoint
  2. ompi_osc_rdma_new_peer
  3. ompi_osc_rdma_peer_setup
  4. ompi_osc_rdma_peer_lookup_internal
  5. ompi_osc_rdma_peer_lookup
  6. ompi_osc_rdma_peer_construct
  7. ompi_osc_rdma_peer_destruct
  8. ompi_osc_rdma_peer_basic_construct
  9. ompi_osc_rdma_peer_basic_destruct
  10. ompi_osc_rdma_peer_dynamic_construct
  11. ompi_osc_rdma_peer_dynamic_destruct

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2007-2016 Los Alamos National Security, LLC.  All rights
   4  *                         reserved.
   5  * Copyright (c) 2015      Research Organization for Information Science
   6  *                         and Technology (RIST). All rights reserved.
   7  * Copyright (c) 2016 Cisco Systems, Inc.  All rights reserved.
   8  * $COPYRIGHT$
   9  *
  10  * Additional copyrights may follow
  11  *
  12  * $HEADER$
  13  */
  14 
  15 #include "ompi_config.h"
  16 
  17 #ifdef HAVE_ALLOCA_H
  18 #include <alloca.h>
  19 #endif
  20 
  21 #include "osc_rdma_comm.h"
  22 
  23 #include "ompi/mca/bml/base/base.h"
  24 
/* Map a node id to the rank of that node's leader. The leader rank is read from the
 * len field of the node's region descriptor (peer_data); the module and node_id
 * arguments are accepted but not used by the expansion. */
#define NODE_ID_TO_RANK(module, peer_data, node_id) ((int)(peer_data)->len)
  26 
  27 /**
  28  * @brief find the btl endpoint for a process
  29  *
  30  * @param[in] module         osc rdma module
  31  * @param[in] peer_id        process rank in the module communicator
  32  *
  33  * @returns NULL on error
  34  * @returns btl endpoint on success
  35  */
  36 struct mca_btl_base_endpoint_t *ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *module, int peer_id)
  37 {
  38     ompi_proc_t *proc = ompi_comm_peer_lookup (module->comm, peer_id);
  39     mca_bml_base_endpoint_t *bml_endpoint;
  40     int num_btls;
  41 
  42     /* for not just use the bml to get the btl endpoint */
  43     bml_endpoint = mca_bml_base_get_endpoint (proc);
  44 
  45     num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
  46 
  47     for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) {
  48         if (bml_endpoint->btl_rdma.bml_btls[btl_index].btl == module->selected_btl) {
  49             return bml_endpoint->btl_rdma.bml_btls[btl_index].btl_endpoint;
  50         }
  51     }
  52 
  53     /* very unlikely. if this happened the btl section process is broken */
  54     return NULL;
  55 }
  56 
  57 int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, ompi_osc_rdma_peer_t **peer_out) {
  58     struct mca_btl_base_endpoint_t *endpoint;
  59     ompi_osc_rdma_peer_t *peer;
  60 
  61     *peer_out = NULL;
  62 
  63     endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_id);
  64     if (OPAL_UNLIKELY(NULL == endpoint && !((module->selected_btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) &&
  65                                             peer_id == ompi_comm_rank (module->comm)))) {
  66         return OMPI_ERR_UNREACH;
  67     }
  68 
  69     if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
  70         peer = (ompi_osc_rdma_peer_t *) OBJ_NEW(ompi_osc_rdma_peer_dynamic_t);
  71     } else if (module->same_size && module->same_disp_unit) {
  72         /* use a smaller peer object when same_size and same_disp_unit are set */
  73         peer = (ompi_osc_rdma_peer_t *) OBJ_NEW(ompi_osc_rdma_peer_basic_t);
  74     } else {
  75         peer = (ompi_osc_rdma_peer_t *) OBJ_NEW(ompi_osc_rdma_peer_extended_t);
  76     }
  77 
  78     peer->data_endpoint = endpoint;
  79     peer->rank          = peer_id;
  80 
  81     *peer_out = peer;
  82 
  83     return OMPI_SUCCESS;
  84 }
  85 
  86 /**
  87  * @brief finish initializing a peer object
  88  *
  89  * @param[in] module         osc rdma module
  90  * @param[in] peer           peer object to set up
  91  *
  92  * This function reads the registration handle and state pointer from the peer that holds that data. If necessary
  93  * it will then ready information about the peer from its state data structure. This information includes the
  94  * displacement unit, base pointer, window size, and registation handle (if applicable).
  95  */
  96 static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer)
  97 {
  98     ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
  99     uint64_t peer_data_size;
 100     uint64_t peer_data_offset, array_pointer;
 101     struct mca_btl_base_endpoint_t *array_endpoint;
 102     ompi_osc_rdma_region_t *array_peer_data, *node_peer_data;
 103     ompi_osc_rdma_rank_data_t rank_data;
 104     int registration_handle_size = 0;
 105     int node_id, node_rank, array_index;
 106     int ret, disp_unit;
 107     char *peer_data;
 108 
 109     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "configuring peer for rank %d", peer->rank);
 110 
 111     if (module->selected_btl->btl_register_mem) {
 112         registration_handle_size = module->selected_btl->btl_registration_handle_size;
 113     }
 114 
 115     /* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code
 116      * calculates the node and offset the mapping can be found. once the mapping has been read the state
 117      * part of the peer structure can be initialized. */
 118     node_id = peer->rank / RANK_ARRAY_COUNT(module);
 119     array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size);
 120 
 121     /* the node leader rank is stored in the length field */
 122     node_rank = NODE_ID_TO_RANK(module, array_peer_data, node_id);
 123     array_index = peer->rank % RANK_ARRAY_COUNT(module);
 124 
 125     array_pointer = array_peer_data->base + array_index * sizeof (rank_data);
 126 
 127     /* lookup the btl endpoint needed to retrieve the mapping */
 128     array_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, node_rank);
 129     if (OPAL_UNLIKELY(NULL == array_endpoint)) {
 130         return OMPI_ERR_UNREACH;
 131     }
 132 
 133     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data for %d from rank: %d, index: %d, pointer: 0x%" PRIx64
 134                      ", size: %lu", peer->rank, node_rank, array_index, array_pointer, sizeof (rank_data));
 135 
 136     ret = ompi_osc_get_data_blocking (module, array_endpoint, array_pointer, (mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data,
 137                                       &rank_data, sizeof (rank_data));
 138     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 139         return ret;
 140     }
 141 
 142     /* initialize the state part of the peer object. NTH: for now the state data is for every node is stored on
 143      * every node. this gives a good balance of code complexity and memory usage at this time. we take advantage
 144      * of this by re-using the endpoint and pointer stored in the node_comm_info array. */
 145     node_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + rank_data.node_id * module->region_size);
 146 
 147     peer->state = node_peer_data->base + module->state_offset + module->state_size * rank_data.rank;
 148 
 149     if (registration_handle_size) {
 150         peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data;
 151     }
 152 
 153     peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, node_peer_data, rank_data.node_id));
 154     if (OPAL_UNLIKELY(NULL == peer->state_endpoint)) {
 155         return OPAL_ERR_UNREACH;
 156     }
 157 
 158     /* nothing more to do for dynamic memory windows */
 159     if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
 160         return OMPI_SUCCESS;
 161     }
 162 
 163     /* read window data from the target rank */
 164     if (module->same_disp_unit) {
 165         /* do not bother reading the displacement unit as it is already known */
 166         peer_data_offset = offsetof (ompi_osc_rdma_state_t, regions);
 167     } else {
 168         peer_data_offset = offsetof (ompi_osc_rdma_state_t, disp_unit);
 169     }
 170 
 171     peer_data_size = module->state_size - peer_data_offset;
 172     peer_data = alloca (peer_data_size);
 173 
 174     /* read window data from the end of the target's state structure */
 175     ret = ompi_osc_get_data_blocking (module, peer->state_endpoint, peer->state + peer_data_offset, peer->state_handle,
 176                                       peer_data, peer_data_size);
 177     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 178         return ret;
 179     }
 180 
 181     if (!module->same_disp_unit) {
 182         /* unpack displacement */
 183         memcpy (&ex_peer->disp_unit, peer_data, sizeof (ex_peer->disp_unit));
 184         peer_data += offsetof (ompi_osc_rdma_state_t, regions) - offsetof (ompi_osc_rdma_state_t, disp_unit);
 185         disp_unit = ex_peer->disp_unit;
 186     } else {
 187         disp_unit = module->disp_unit;
 188     }
 189 
 190     ompi_osc_rdma_region_t *base_region = (ompi_osc_rdma_region_t *) peer_data;
 191 
 192     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "peer %d: remote base region: 0x%" PRIx64 ", size: %" PRId64
 193                      ", flags: 0x%x, disp_unit: %d", peer->rank, base_region->base, base_region->len,
 194                      peer->flags, disp_unit);
 195     (void)disp_unit;  // silence compiler warning
 196 
 197     if (ompi_osc_rdma_peer_local_base (peer)) {
 198         /* for now we store the local address in the standard place. do no overwrite it */
 199         return OMPI_SUCCESS;
 200     }
 201 
 202     ex_peer->super.base = base_region->base;
 203 
 204     /* save size and base */
 205     if (!module->same_size) {
 206         ex_peer->size = base_region->len;
 207     }
 208 
 209     if (base_region->len) {
 210         if (registration_handle_size) {
 211             ex_peer->super.base_handle = malloc (registration_handle_size);
 212             if (OPAL_UNLIKELY(NULL == ex_peer->super.base_handle)) {
 213                 return OMPI_ERR_OUT_OF_RESOURCE;
 214             }
 215 
 216             peer->flags |= OMPI_OSC_RDMA_PEER_BASE_FREE;
 217 
 218             memcpy (ex_peer->super.base_handle, base_region->btl_handle_data, registration_handle_size);
 219         }
 220 
 221         if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
 222             ex_peer->super.super.data_endpoint = ex_peer->super.super.state_endpoint;
 223         }
 224     }
 225 
 226     return OMPI_SUCCESS;
 227 }
 228 
 229 /**
 230  * @brief lookup (or allocate) a peer for a rank (internal)
 231  *
 232  * @param[in] module         osc rdma module
 233  * @param[in] peer_id        rank of remote peer (in module communicator)
 234  *
 235  * @returns peer object on success
 236  * @returns NULL on error
 237  *
 238  * This is an internal function for looking up or allocating a peer object for a window rank. This
 239  * function requires the peer lock to be held and is only expected to be called from itself or
 240  * the ompi_osc_rdma_peer_lookup() helper function.
 241  */
 242 static struct ompi_osc_rdma_peer_t *ompi_osc_rdma_peer_lookup_internal (struct ompi_osc_rdma_module_t *module, int peer_id)
 243 {
 244     ompi_osc_rdma_peer_t *peer;
 245     int ret;
 246 
 247     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "looking up peer data for rank %d", peer_id);
 248 
 249     peer = ompi_osc_module_get_peer (module, peer_id);
 250     if (NULL != peer) {
 251         return peer;
 252     }
 253 
 254     ret = ompi_osc_rdma_new_peer (module, peer_id, &peer);
 255     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 256         return NULL;
 257     }
 258 
 259     ret = ompi_osc_rdma_peer_setup (module, peer);
 260     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 261         OBJ_RELEASE(peer);
 262         return NULL;
 263     }
 264 
 265     ret = ompi_osc_module_add_peer (module, peer);
 266     if (OPAL_SUCCESS != ret) {
 267         /* out of memory */
 268         OBJ_RELEASE(peer);
 269         return NULL;
 270     }
 271 
 272     /* ensure the peer hash is updated before we drop the lock */
 273     opal_atomic_wmb ();
 274 
 275     return peer;
 276 }
 277 
 278 struct ompi_osc_rdma_peer_t *ompi_osc_rdma_peer_lookup (struct ompi_osc_rdma_module_t *module, int peer_id)
 279 {
 280     struct ompi_osc_rdma_peer_t *peer;
 281 
 282     opal_mutex_lock (&module->peer_lock);
 283     peer = ompi_osc_rdma_peer_lookup_internal (module, peer_id);
 284     opal_mutex_unlock (&module->peer_lock);
 285 
 286     return peer;
 287 }
 288 
 289 
 290 /******* peer objects *******/
 291 
 292 static void ompi_osc_rdma_peer_construct (ompi_osc_rdma_peer_t *peer)
 293 {
 294     memset ((char *) peer + sizeof (peer->super), 0, sizeof (*peer) - sizeof (peer->super));
 295 }
 296 
 297 static void ompi_osc_rdma_peer_destruct (ompi_osc_rdma_peer_t *peer)
 298 {
 299     if (peer->state_handle && (peer->flags & OMPI_OSC_RDMA_PEER_STATE_FREE)) {
 300         free (peer->state_handle);
 301     }
 302 }
 303 
/* Class instance for ompi_osc_rdma_peer_t: the parent class is opal_list_item_t and
 * ompi_osc_rdma_peer_construct/ompi_osc_rdma_peer_destruct are supplied as its
 * constructor and destructor. */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_t, opal_list_item_t,
                   ompi_osc_rdma_peer_construct,
                   ompi_osc_rdma_peer_destruct);
 307 
 308 static void ompi_osc_rdma_peer_basic_construct (ompi_osc_rdma_peer_basic_t *peer)
 309 {
 310     memset ((char *) peer + sizeof (peer->super), 0, sizeof (*peer) - sizeof (peer->super));
 311 }
 312 
 313 static void ompi_osc_rdma_peer_basic_destruct (ompi_osc_rdma_peer_basic_t *peer)
 314 {
 315     if (peer->base_handle && (peer->super.flags & OMPI_OSC_RDMA_PEER_BASE_FREE)) {
 316         free (peer->base_handle);
 317     }
 318 }
 319 
/* Class instance for ompi_osc_rdma_peer_basic_t: the parent class is
 * ompi_osc_rdma_peer_t and ompi_osc_rdma_peer_basic_construct/
 * ompi_osc_rdma_peer_basic_destruct are supplied as constructor and destructor. */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_basic_t, ompi_osc_rdma_peer_t,
                   ompi_osc_rdma_peer_basic_construct,
                   ompi_osc_rdma_peer_basic_destruct);

/* Class instance for ompi_osc_rdma_peer_extended_t: the parent class is
 * ompi_osc_rdma_peer_basic_t. NULL is passed for both the constructor and the
 * destructor, so the class supplies none of its own. */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_extended_t, ompi_osc_rdma_peer_basic_t,
                   NULL, NULL);
 326 
 327 static void ompi_osc_rdma_peer_dynamic_construct (ompi_osc_rdma_peer_dynamic_t *peer)
 328 {
 329     memset ((char *) peer + sizeof (peer->super), 0, sizeof (*peer) - sizeof (peer->super));
 330 }
 331 
 332 static void ompi_osc_rdma_peer_dynamic_destruct (ompi_osc_rdma_peer_dynamic_t *peer)
 333 {
 334     if (peer->regions) {
 335         free (peer->regions);
 336     }
 337 }
 338 
/* Class instance for ompi_osc_rdma_peer_dynamic_t: the parent class is
 * ompi_osc_rdma_peer_t and ompi_osc_rdma_peer_dynamic_construct/
 * ompi_osc_rdma_peer_dynamic_destruct are supplied as constructor and destructor. */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_dynamic_t, ompi_osc_rdma_peer_t,
                   ompi_osc_rdma_peer_dynamic_construct,
                   ompi_osc_rdma_peer_dynamic_destruct);

/* [<][>][^][v][top][bottom][index][help] */