root/ompi/mca/osc/rdma/osc_rdma_dynamic.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. ompi_osc_rdma_find_region_containing
  2. find_insertion_point
  3. ompi_osc_rdma_attach
  4. ompi_osc_rdma_detach
  5. ompi_osc_rdma_refresh_dynamic_region
  6. ompi_osc_rdma_find_dynamic_region

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2014-2016 Los Alamos National Security, LLC.  All rights
   4  *                         reserved.
   5  * $COPYRIGHT$
   6  *
   7  * Additional copyrights may follow
   8  *
   9  * $HEADER$
  10  */
  11 
  12 #include "osc_rdma_comm.h"
  13 #include "osc_rdma_lock.h"
  14 
  15 #include "mpi.h"
  16 
  17 #include "opal/util/sys_limits.h"
  18 
  19 /**
  20  * ompi_osc_rdma_find_region_containing:
  21  *
  22  * @param[in]  regions      sorted list of regions
  23  * @param[in]  min_index    minimum index to search (call with 0)
  24  * @param[in]  max_index    maximum index to search (call with length - 1)
  25  * @param[in]  base         base of region to search for
  26  * @param[in]  bound        bound of region to search for
  27  * @param[in]  region_size  size of an ompi_osc_rdma_region_t object
  28  * @param[out] region_index index of region if found (may be NULL)
  29  *
  30  * @returns an index on success or -1 on failure
  31  *
  32  * This function searches through a sorted list of rdma regions {regions} and finds
  33  * the region that contains the region specified by {base} and {bound}. If a
  34  * matching region is found the index of that region is returned else the function
  35  * returns -1.
  36  */
  37 static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi_osc_rdma_region_t *regions, int min_index,
  38                                                                             int max_index, intptr_t base, intptr_t bound,
  39                                                                             size_t region_size, int *region_index)
  40 {
  41     int mid_index = (max_index + min_index) >> 1;
  42     ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *)((intptr_t) regions + mid_index * region_size);
  43     intptr_t region_bound;
  44 
  45     if (min_index > max_index) {
  46         return NULL;
  47     }
  48 
  49     region_bound = (intptr_t) (region->base + region->len);
  50 
  51     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, max_index = %d)",
  52                          (void *) base, (void *) bound, (void *) region->base, (void *)(region->base + region->len), mid_index,
  53                          min_index, max_index);
  54 
  55     if (region->base > base) {
  56         return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size, region_index);
  57     } else if (bound <= region_bound) {
  58         if (region_index) {
  59             *region_index = mid_index;
  60         }
  61 
  62         return region;
  63     }
  64 
  65     return ompi_osc_rdma_find_region_containing (regions, mid_index+1, max_index, base, bound, region_size, region_index);
  66 }
  67 
  68 /* binary search for insertion point */
  69 static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index, intptr_t base,
  70                                                      size_t region_size, int *region_index)
  71 {
  72     int mid_index = (max_index + min_index) >> 1;
  73     ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *)((intptr_t) regions + mid_index * region_size);
  74 
  75     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "find_insertion_point (%d, %d, %lx, %lu)\n", min_index, max_index, base, region_size);
  76 
  77     if (max_index < min_index) {
  78         *region_index = min_index;
  79         return (ompi_osc_rdma_region_t *)((intptr_t) regions + min_index * region_size);
  80     }
  81 
  82     if (region->base > base) {
  83         return find_insertion_point (regions, min_index, mid_index-1, base, region_size, region_index);
  84     } else {
  85         return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index);
  86     }
  87 }
  88 
  89 int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
  90 {
  91     ompi_osc_rdma_module_t *module = GET_MODULE(win);
  92     const int my_rank = ompi_comm_rank (module->comm);
  93     ompi_osc_rdma_peer_t *my_peer = ompi_osc_rdma_module_peer (module, my_rank);
  94     ompi_osc_rdma_region_t *region;
  95     osc_rdma_counter_t region_count;
  96     osc_rdma_counter_t region_id;
  97     void *bound;
  98     intptr_t page_size = opal_getpagesize ();
  99     int region_index;
 100     int ret;
 101 
 102     if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
 103         return OMPI_ERR_RMA_FLAVOR;
 104     }
 105 
 106     if (0 == len) {
 107         /* shot-circuit 0-byte case */
 108         return OMPI_SUCCESS;
 109     }
 110 
 111     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach: %s, %p, %lu", win->w_name, base, (unsigned long) len);
 112 
 113     OPAL_THREAD_LOCK(&module->lock);
 114 
 115     region_count = module->state->region_count & 0xffffffffL;
 116     region_id    = module->state->region_count >> 32;
 117 
 118     if (region_count == mca_osc_rdma_component.max_attach) {
 119         OPAL_THREAD_UNLOCK(&module->lock);
 120         return OMPI_ERR_RMA_ATTACH;
 121     }
 122 
 123     /* it is wasteful to register less than a page. this may allow the remote side to access more
 124      * memory but the MPI standard covers this with calling the calling behavior erroneous */
 125     bound = (void *)OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t);
 126     base = (void *)((intptr_t) base & ~(page_size - 1));
 127     len = (size_t)((intptr_t) bound - (intptr_t) base);
 128 
 129     /* see if a matching region already exists */
 130     region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
 131                                                    (intptr_t) bound, module->region_size, &region_index);
 132     if (NULL != region) {
 133         ++module->dynamic_handles[region_index].refcnt;
 134         OPAL_THREAD_UNLOCK(&module->lock);
 135         /* no need to invalidate remote caches */
 136         return OMPI_SUCCESS;
 137     }
 138 
 139     /* region is in flux */
 140     module->state->region_count = -1;
 141     opal_atomic_wmb ();
 142 
 143     ompi_osc_rdma_lock_acquire_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
 144 
 145     /* do a binary seach for where the region should be inserted */
 146     if (region_count) {
 147         region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
 148                                        module->region_size, &region_index);
 149 
 150         if (region_index < region_count) {
 151             memmove ((void *) ((intptr_t) region + module->region_size), region, (region_count - region_index) * module->region_size);
 152 
 153             if (module->selected_btl->btl_register_mem) {
 154                 memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
 155                          (region_count - region_index) * sizeof (module->dynamic_handles[0]));
 156             }
 157         }
 158     } else {
 159         region_index = 0;
 160         region = (ompi_osc_rdma_region_t *) module->state->regions;
 161     }
 162 
 163     region->base = (intptr_t) base;
 164     region->len  = len;
 165 
 166     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} at index %d",
 167                      base, (void *)((intptr_t) base + len), region_index);
 168 
 169     if (module->selected_btl->btl_register_mem) {
 170         mca_btl_base_registration_handle_t *handle;
 171 
 172         ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len, MCA_BTL_REG_FLAG_ACCESS_ANY,
 173                                       &handle);
 174         if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 175             OPAL_THREAD_UNLOCK(&module->lock);
 176             return OMPI_ERR_RMA_ATTACH;
 177         }
 178 
 179         memcpy (region->btl_handle_data, handle, module->selected_btl->btl_registration_handle_size);
 180         module->dynamic_handles[region_index].btl_handle = handle;
 181     } else {
 182         module->dynamic_handles[region_index].btl_handle = NULL;
 183     }
 184 
 185     module->dynamic_handles[region_index].refcnt = 1;
 186 
 187 #if OPAL_ENABLE_DEBUG
 188     for (int i = 0 ; i < region_count + 1 ; ++i) {
 189         region = (ompi_osc_rdma_region_t *) ((intptr_t) module->state->regions + i * module->region_size);
 190 
 191         OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, " dynamic region %d: {%p, %lu}", i,
 192                          (void *) region->base, (unsigned long) region->len);
 193     }
 194 #endif
 195 
 196     opal_atomic_mb ();
 197     /* the region state has changed */
 198     module->state->region_count = ((region_id + 1) << 32) | (region_count + 1);
 199 
 200     ompi_osc_rdma_lock_release_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
 201     OPAL_THREAD_UNLOCK(&module->lock);
 202 
 203     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach complete");
 204 
 205     return OMPI_SUCCESS;
 206 }
 207 
 208 
 209 int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
 210 {
 211     ompi_osc_rdma_module_t *module = GET_MODULE(win);
 212     const int my_rank = ompi_comm_rank (module->comm);
 213     ompi_osc_rdma_peer_dynamic_t *my_peer = (ompi_osc_rdma_peer_dynamic_t *) ompi_osc_rdma_module_peer (module, my_rank);
 214     osc_rdma_counter_t region_count, region_id;
 215     ompi_osc_rdma_region_t *region;
 216     int region_index;
 217 
 218     if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
 219         return OMPI_ERR_WIN;
 220     }
 221 
 222     OPAL_THREAD_LOCK(&module->lock);
 223 
 224     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach: %s, %p", win->w_name, base);
 225 
 226     /* the upper 4 bytes of the region count are an instance counter */
 227     region_count = module->state->region_count & 0xffffffffL;
 228     region_id    = module->state->region_count >> 32;
 229 
 230     region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0,
 231                                                    region_count - 1, (intptr_t) base, (intptr_t) base + 1,
 232                                                    module->region_size, &region_index);
 233     if (NULL == region) {
 234         OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory region starting at %p", base);
 235         OPAL_THREAD_UNLOCK(&module->lock);
 236         return OMPI_ERROR;
 237     }
 238 
 239     if (--module->dynamic_handles[region_index].refcnt > 0) {
 240         OPAL_THREAD_UNLOCK(&module->lock);
 241         OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach complete");
 242         return OMPI_SUCCESS;
 243     }
 244 
 245     /* lock the region so it can't change while a peer is reading it */
 246     ompi_osc_rdma_lock_acquire_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));
 247 
 248     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "detaching dynamic memory region {%p, %p} from index %d",
 249                      base, (void *)((intptr_t) base + region->len), region_index);
 250 
 251     if (module->selected_btl->btl_register_mem) {
 252         ompi_osc_rdma_deregister (module, module->dynamic_handles[region_index].btl_handle);
 253 
 254         if (region_index < region_count - 1) {
 255             memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1,
 256                      (region_count - region_index - 1) * sizeof (void *));
 257         }
 258 
 259         memset (module->dynamic_handles + region_count - 1, 0, sizeof (module->dynamic_handles[0]));
 260     }
 261 
 262     if (region_index < region_count - 1) {
 263         memmove (region, (void *)((intptr_t) region + module->region_size),
 264                  (region_count - region_index - 1) * module->region_size);;
 265     }
 266 
 267     module->state->region_count = ((region_id + 1) << 32) | (region_count - 1);
 268 
 269     ompi_osc_rdma_lock_release_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));
 270 
 271     OPAL_THREAD_UNLOCK(&module->lock);
 272 
 273     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach complete");
 274 
 275     return OMPI_SUCCESS;
 276 }
 277 
 278 /**
 279  * @brief refresh the local view of the dynamic memory region
 280  *
 281  * @param[in] module         osc rdma module
 282  * @param[in] peer           peer object to refresh
 283  *
 284  * This function does the work of keeping the local view of a remote peer in sync with what is attached
 285  * to the remote window. It is called on every address translation since there is no way (currently) to
 286  * detect that the attached regions have changed. To reduce the amount of data read we first read the
 287  * region count (which contains an id). If that hasn't changed the region data is not updated. If the
 288  * list of attached regions has changed then all valid regions are read from the peer while holding
 289  * their region lock.
 290  */
 291 static int ompi_osc_rdma_refresh_dynamic_region (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_dynamic_t *peer) {
 292     osc_rdma_counter_t region_count, region_id;
 293     uint64_t source_address;
 294     int ret;
 295 
 296     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "refreshing dynamic memory regions for target %d", peer->super.rank);
 297 
 298     /* this loop is meant to prevent us from reading data while the remote side is in attach */
 299     do {
 300         osc_rdma_counter_t remote_value;
 301 
 302         source_address = (uint64_t)(intptr_t) peer->super.state + offsetof (ompi_osc_rdma_state_t, region_count);
 303         ret = ompi_osc_get_data_blocking (module, peer->super.state_endpoint, source_address, peer->super.state_handle,
 304                                           &remote_value, sizeof (remote_value));
 305         if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 306             return ret;
 307         }
 308 
 309         region_id = remote_value >> 32;
 310         region_count = remote_value & 0xffffffffl;
 311         /* check if the region is changing */
 312     } while (0xffffffffl == region_count);
 313 
 314     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "target region: id 0x%lx, count 0x%lx (cached: 0x%x, 0x%x)",
 315                      (unsigned long) region_id, (unsigned long) region_count, peer->region_id, peer->region_count);
 316 
 317     if (0 == region_count) {
 318         return OMPI_ERR_RMA_RANGE;
 319     }
 320 
 321     /* check if the cached copy is out of date */
 322     OPAL_THREAD_LOCK(&module->lock);
 323 
 324     if (peer->region_id != region_id) {
 325         unsigned region_len = module->region_size * region_count;
 326         void *temp;
 327 
 328         OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "dynamic memory cache is out of data. reloading from peer");
 329 
 330         /* allocate only enough space for the remote regions */
 331         temp = realloc (peer->regions, region_len);
 332         if (NULL == temp) {
 333             OPAL_THREAD_UNLOCK(&module->lock);
 334             return OMPI_ERR_OUT_OF_RESOURCE;
 335         }
 336         peer->regions = temp;
 337 
 338         /* lock the region */
 339         ompi_osc_rdma_lock_acquire_shared (module, &peer->super, 1, offsetof (ompi_osc_rdma_state_t, regions_lock),
 340                                            OMPI_OSC_RDMA_LOCK_EXCLUSIVE);
 341 
 342         source_address = (uint64_t)(intptr_t) peer->super.state + offsetof (ompi_osc_rdma_state_t, regions);
 343         ret = ompi_osc_get_data_blocking (module, peer->super.state_endpoint, source_address, peer->super.state_handle,
 344                                           peer->regions, region_len);
 345         if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
 346             OPAL_THREAD_UNLOCK(&module->lock);
 347             return ret;
 348         }
 349 
 350         /* release the region lock */
 351         ompi_osc_rdma_lock_release_shared (module, &peer->super, -1, offsetof (ompi_osc_rdma_state_t, regions_lock));
 352 
 353         /* update cached region ids */
 354         peer->region_id = region_id;
 355         peer->region_count = region_count;
 356     }
 357 
 358     OPAL_THREAD_UNLOCK(&module->lock);
 359 
 360     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "finished refreshing dynamic memory regions for target %d", peer->super.rank);
 361 
 362     return OMPI_SUCCESS;
 363 }
 364 
 365 int ompi_osc_rdma_find_dynamic_region (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t base, size_t len,
 366                                        ompi_osc_rdma_region_t **region)
 367 {
 368     ompi_osc_rdma_peer_dynamic_t *dy_peer = (ompi_osc_rdma_peer_dynamic_t *) peer;
 369     intptr_t bound = (intptr_t) base + len;
 370     ompi_osc_rdma_region_t *regions;
 371     int ret, region_count;
 372 
 373     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "locating dynamic memory region matching: {%" PRIx64 ", %" PRIx64 "}"
 374                      " (len %lu)", base, base + len, (unsigned long) len);
 375 
 376     if (!ompi_osc_rdma_peer_local_state (peer)) {
 377         ret = ompi_osc_rdma_refresh_dynamic_region (module, dy_peer);
 378         if (OMPI_SUCCESS != ret) {
 379             return ret;
 380         }
 381 
 382         regions = dy_peer->regions;
 383         region_count = dy_peer->region_count;
 384     } else {
 385         ompi_osc_rdma_state_t *peer_state = (ompi_osc_rdma_state_t *) peer->state;
 386         regions = (ompi_osc_rdma_region_t *) peer_state->regions;
 387         region_count = peer_state->region_count;
 388     }
 389 
 390     *region = ompi_osc_rdma_find_region_containing (regions, 0, region_count - 1, (intptr_t) base, bound, module->region_size, NULL);
 391     if (!*region) {
 392         return OMPI_ERR_RMA_RANGE;
 393     }
 394 
 395     /* round a matching region */
 396     return OMPI_SUCCESS;
 397 }

/* [<][>][^][v][top][bottom][index][help] */