root/opal/mca/btl/uct/btl_uct_endpoint.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mca_btl_uct_endpoint_construct
  2. mca_btl_uct_endpoint_destruct
  3. mca_btl_uct_endpoint_create
  4. mca_btl_uct_process_modex_tl
  5. mca_btl_uct_process_modex
  6. mca_btl_uct_ep_create_connected_compat
  7. mca_btl_uct_ep_create_compat
  8. mca_btl_uct_endpoint_connect_iface
  9. mca_btl_uct_connection_ep_construct
  10. mca_btl_uct_connection_ep_destruct
  11. mca_btl_uct_endpoint_flush_complete
  12. mca_btl_uct_endpoint_send_conn_req
  13. mca_btl_uct_endpoint_connect_endpoint
  14. mca_btl_uct_endpoint_connect

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2018      Los Alamos National Security, LLC. All rights
   4  *                         reserved.
   5  * Copyright (c) 2018      Triad National Security, LLC. All rights
   6  *                         reserved.
   7  * Copyright (c) 2019      Google, LLC. All rights reserved.
   8  * $COPYRIGHT$
   9  *
  10  * Additional copyrights may follow
  11  *
  12  * $HEADER$
  13  */
  14 
  15 #include "btl_uct.h"
  16 #include "btl_uct_endpoint.h"
  17 #include "btl_uct_device_context.h"
  18 #include "btl_uct_am.h"
  19 #include "opal/util/proc.h"
  20 
  21 static void mca_btl_uct_endpoint_construct (mca_btl_uct_endpoint_t *endpoint)
  22 {
  23     memset (endpoint->uct_eps, 0, sizeof (endpoint->uct_eps[0]) * mca_btl_uct_component.num_contexts_per_module);
  24     endpoint->conn_ep = NULL;
  25     OBJ_CONSTRUCT(&endpoint->ep_lock, opal_recursive_mutex_t);
  26 }
  27 
  28 static void mca_btl_uct_endpoint_destruct (mca_btl_uct_endpoint_t *endpoint)
  29 {
  30     for (int tl_index = 0 ; tl_index < 2 ; ++tl_index) {
  31         for (int i = 0 ; i < mca_btl_uct_component.num_contexts_per_module ; ++i) {
  32             if (NULL != endpoint->uct_eps[i][tl_index].uct_ep) {
  33                 uct_ep_destroy (endpoint->uct_eps[i][tl_index].uct_ep);
  34             }
  35         }
  36     }
  37 
  38     OBJ_DESTRUCT(&endpoint->ep_lock);
  39 }
  40 
/* Register mca_btl_uct_endpoint_t as an opal_object_t subclass using the
 * constructor and destructor defined above. */
OBJ_CLASS_INSTANCE(mca_btl_uct_endpoint_t, opal_object_t,
                   mca_btl_uct_endpoint_construct,
                   mca_btl_uct_endpoint_destruct);
  44 
  45 mca_btl_base_endpoint_t *mca_btl_uct_endpoint_create (opal_proc_t *proc)
  46 {
  47     mca_btl_uct_endpoint_t *endpoint = calloc (1, sizeof (*endpoint) + sizeof (endpoint->uct_eps[0]) *
  48                                                mca_btl_uct_component.num_contexts_per_module);
  49 
  50     if (OPAL_UNLIKELY(NULL == endpoint)) {
  51         return NULL;
  52     }
  53 
  54     OBJ_CONSTRUCT(endpoint, mca_btl_uct_endpoint_t);
  55     endpoint->ep_proc = proc;
  56 
  57     return (mca_btl_base_endpoint_t *) endpoint;
  58 }
  59 
  60 static unsigned char *mca_btl_uct_process_modex_tl (unsigned char *modex_data)
  61 {
  62     BTL_VERBOSE(("processing modex for tl %s. size: %u", modex_data + 4, *((uint32_t *) modex_data)));
  63 
  64     /* skip size and name */
  65     return modex_data + 4 + strlen ((char *) modex_data + 4) + 1;
  66 }
  67 
  68 static void mca_btl_uct_process_modex (mca_btl_uct_module_t *uct_btl, unsigned char *modex_data,
  69                                        unsigned char **rdma_tl_data, unsigned char **am_tl_data,
  70                                        unsigned char **conn_tl_data)
  71 {
  72     BTL_VERBOSE(("processing remote modex data"));
  73 
  74     if (uct_btl->rdma_tl) {
  75         BTL_VERBOSE(("modex contains RDMA data"));
  76         if (rdma_tl_data) {
  77             *rdma_tl_data = mca_btl_uct_process_modex_tl (modex_data);
  78         }
  79         modex_data += *((uint32_t *) modex_data);
  80     } else if (rdma_tl_data) {
  81         *rdma_tl_data = NULL;
  82     }
  83 
  84     if (uct_btl->am_tl && uct_btl->am_tl != uct_btl->rdma_tl) {
  85         BTL_VERBOSE(("modex contains active message data"));
  86         if (am_tl_data) {
  87             *am_tl_data = mca_btl_uct_process_modex_tl (modex_data);
  88         }
  89         modex_data += *((uint32_t *) modex_data);
  90     } else if (am_tl_data) {
  91         *am_tl_data = NULL;
  92     }
  93 
  94     if (uct_btl->conn_tl && uct_btl->conn_tl != uct_btl->rdma_tl && uct_btl->conn_tl != uct_btl->am_tl) {
  95         BTL_VERBOSE(("modex contains connection data"));
  96         if (conn_tl_data) {
  97             *conn_tl_data = mca_btl_uct_process_modex_tl (modex_data);
  98         }
  99         modex_data += *((uint32_t *) modex_data);
 100     } else if (conn_tl_data) {
 101         *conn_tl_data = NULL;
 102     }
 103 }
 104 
 105 static inline ucs_status_t mca_btl_uct_ep_create_connected_compat (uct_iface_h iface, uct_device_addr_t *device_addr,
 106                                                                    uct_iface_addr_t *iface_addr, uct_ep_h *uct_ep)
 107 {
 108 #if UCT_API >= UCT_VERSION(1, 6)
 109     uct_ep_params_t ep_params = {.field_mask = UCT_EP_PARAM_FIELD_IFACE | UCT_EP_PARAM_FIELD_DEV_ADDR | UCT_EP_PARAM_FIELD_IFACE_ADDR,
 110                                  .iface = iface, .dev_addr = device_addr, .iface_addr = iface_addr};
 111     return uct_ep_create (&ep_params, uct_ep);
 112 #else
 113     return uct_ep_create_connected (iface, device_addr, iface_addr, uct_ep);
 114 #endif
 115 }
 116 
 117 static inline ucs_status_t mca_btl_uct_ep_create_compat (uct_iface_h iface, uct_ep_h *uct_ep)
 118 {
 119 #if UCT_API >= UCT_VERSION(1, 6)
 120     uct_ep_params_t ep_params = {.field_mask = UCT_EP_PARAM_FIELD_IFACE, .iface = iface};
 121     return uct_ep_create (&ep_params, uct_ep);
 122 #else
 123     return uct_ep_create (iface, uct_ep);
 124 #endif
 125 }
 126 
 127 static int mca_btl_uct_endpoint_connect_iface (mca_btl_uct_module_t *uct_btl, mca_btl_uct_tl_t *tl,
 128                                                mca_btl_uct_device_context_t *tl_context,
 129                                                mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data)
 130 {
 131     uct_device_addr_t *device_addr = NULL;
 132     uct_iface_addr_t *iface_addr;
 133     ucs_status_t ucs_status;
 134 
 135     /* easy case. just connect to the interface */
 136     iface_addr = (uct_iface_addr_t *) tl_data;
 137     device_addr = (uct_device_addr_t *) ((uintptr_t) iface_addr + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).iface_addr_len);
 138 
 139     BTL_VERBOSE(("connecting endpoint to interface"));
 140 
 141     mca_btl_uct_context_lock (tl_context);
 142     ucs_status = mca_btl_uct_ep_create_connected_compat (tl_context->uct_iface, device_addr, iface_addr, &tl_endpoint->uct_ep);
 143     tl_endpoint->flags = MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY;
 144     mca_btl_uct_context_unlock (tl_context);
 145 
 146     return (UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERROR;
 147 }
 148 
 149 static void mca_btl_uct_connection_ep_construct (mca_btl_uct_connection_ep_t *ep)
 150 {
 151     ep->uct_ep = NULL;
 152 }
 153 
 154 static void mca_btl_uct_connection_ep_destruct (mca_btl_uct_connection_ep_t *ep)
 155 {
 156     if (ep->uct_ep) {
 157         uct_ep_destroy (ep->uct_ep);
 158         ep->uct_ep = NULL;
 159     }
 160 }
 161 
/* Register mca_btl_uct_connection_ep_t as an opal_object_t subclass using the
 * constructor and destructor defined above. */
OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_connection_ep_construct,
                   mca_btl_uct_connection_ep_destruct);
 164 
 165 struct mca_btl_uct_conn_completion_t {
 166     uct_completion_t super;
 167     volatile bool complete;
 168 };
 169 typedef struct mca_btl_uct_conn_completion_t mca_btl_uct_conn_completion_t;
 170 
 171 static void mca_btl_uct_endpoint_flush_complete (uct_completion_t *self, ucs_status_t status)
 172 {
 173     mca_btl_uct_conn_completion_t *completion = (mca_btl_uct_conn_completion_t *) self;
 174     BTL_VERBOSE(("connection flush complete"));
 175     completion->complete = true;
 176 }
 177 
 178 static int mca_btl_uct_endpoint_send_conn_req (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint,
 179                                                mca_btl_uct_device_context_t *conn_tl_context,
 180                                                mca_btl_uct_conn_req_t *request, size_t request_length)
 181 {
 182     mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep;
 183     mca_btl_uct_conn_completion_t completion = {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete},
 184                                                 .complete = false};
 185     ucs_status_t ucs_status;
 186 
 187     BTL_VERBOSE(("sending connection request to peer. context id: %d, type: %d, length: %" PRIsize_t,
 188                  request->context_id, request->type, request_length));
 189 
 190     OBJ_RETAIN(endpoint->conn_ep);
 191 
 192     /* need to drop the lock to avoid hold-and-wait */
 193     opal_mutex_unlock (&endpoint->ep_lock);
 194 
 195     do {
 196         MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, {
 197                 ucs_status = uct_ep_am_short (conn_ep->uct_ep, MCA_BTL_UCT_CONNECT_RDMA, request->type, request,
 198                                               request_length);
 199             });
 200         if (OPAL_LIKELY(UCS_OK == ucs_status)) {
 201             break;
 202         }
 203 
 204         if (OPAL_UNLIKELY(UCS_ERR_NO_RESOURCE != ucs_status)) {
 205             return OPAL_ERROR;
 206         }
 207 
 208         /* some TLs (UD for example) need to be progressed to get resources */
 209         mca_btl_uct_context_progress (conn_tl_context);
 210     } while (1);
 211 
 212     /* for now we just wait for the connection request to complete before continuing */
 213     ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, &completion.super);
 214     if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status) {
 215         /* NTH: I don't know if this path is needed. For some networks we must use a completion. */
 216         do {
 217             ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, NULL);
 218             mca_btl_uct_context_progress (conn_tl_context);
 219         } while (UCS_INPROGRESS == ucs_status);
 220     } else {
 221         do {
 222             mca_btl_uct_context_progress (conn_tl_context);
 223         } while (!completion.complete);
 224     }
 225 
 226     opal_mutex_lock (&endpoint->ep_lock);
 227 
 228     OBJ_RELEASE(endpoint->conn_ep);
 229 
 230     return OPAL_SUCCESS;
 231 }
 232 
 233 static int mca_btl_uct_endpoint_connect_endpoint (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint,
 234                                                   mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context,
 235                                                   mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data,
 236                                                   uint8_t *conn_tl_data, void *ep_addr)
 237 {
 238     size_t request_length = sizeof (mca_btl_uct_conn_req_t) + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len;
 239     mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep;
 240     mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl;
 241     mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0];
 242     mca_btl_uct_conn_req_t *request = alloca (request_length);
 243     uct_device_addr_t *device_addr = NULL;
 244     uct_iface_addr_t *iface_addr;
 245     ucs_status_t ucs_status;
 246     int rc;
 247 
 248     assert (NULL != conn_tl);
 249 
 250     BTL_VERBOSE(("connecting endpoint to remote endpoint"));
 251 
 252     if (NULL == conn_ep) {
 253         BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p",
 254                      opal_process_name_print (endpoint->ep_proc->proc_name)));
 255 
 256         iface_addr = (uct_iface_addr_t *) conn_tl_data;
 257         device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data + MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len);
 258 
 259         endpoint->conn_ep = conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t);
 260         if (OPAL_UNLIKELY(NULL == conn_ep)) {
 261             return OPAL_ERR_OUT_OF_RESOURCE;
 262         }
 263 
 264         /* create a temporary endpoint for setting up the rdma endpoint */
 265         MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, {
 266                 ucs_status = mca_btl_uct_ep_create_connected_compat (conn_tl_context->uct_iface, device_addr, iface_addr,
 267                                                                      &conn_ep->uct_ep);
 268             });
 269         if (UCS_OK != ucs_status) {
 270             BTL_VERBOSE(("could not create an endpoint for forming connection to remote peer. code = %d",
 271                          ucs_status));
 272             return OPAL_ERROR;
 273         }
 274     } else {
 275         OBJ_RETAIN(conn_ep);
 276     }
 277 
 278     /* fill in common request parameters */
 279     request->proc_name = OPAL_PROC_MY_NAME;
 280     request->context_id = tl_context->context_id;
 281     request->tl_index = tl->tl_index;
 282     request->type = !!(ep_addr);
 283 
 284     if (NULL == tl_endpoint->uct_ep) {
 285         BTL_VERBOSE(("allocating endpoint for peer %s and sending connection data",
 286                      opal_process_name_print (endpoint->ep_proc->proc_name)));
 287 
 288         MCA_BTL_UCT_CONTEXT_SERIALIZE(tl_context, {
 289                 ucs_status = mca_btl_uct_ep_create_compat (tl_context->uct_iface, &tl_endpoint->uct_ep);
 290             });
 291         if (UCS_OK != ucs_status) {
 292             OBJ_RELEASE(endpoint->conn_ep);
 293             return OPAL_ERROR;
 294         }
 295     }
 296 
 297     if (ep_addr) {
 298         BTL_VERBOSE(("using remote endpoint address to connect endpoint for tl %s, index %d. ep_addr = %p",
 299                      tl->uct_tl_name, tl_context->context_id, ep_addr));
 300 
 301         /* NTH: there is no need to lock the device context in this case */
 302         ucs_status = uct_ep_connect_to_ep (tl_endpoint->uct_ep, (uct_device_addr_t *) tl_data, ep_addr);
 303         if (UCS_OK != ucs_status) {
 304             return OPAL_ERROR;
 305         }
 306     }
 307 
 308     /* fill in connection request */
 309     ucs_status = uct_ep_get_address (tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr);
 310     if (UCS_OK != ucs_status) {
 311         /* this is a fatal a fatal error */
 312         OBJ_RELEASE(endpoint->conn_ep);
 313         uct_ep_destroy (tl_endpoint->uct_ep);
 314         tl_endpoint->uct_ep = NULL;
 315         return OPAL_ERROR;
 316     }
 317 
 318     /* let the remote side know that the connection has been established and
 319      * wait for the message to be sent */
 320     rc = mca_btl_uct_endpoint_send_conn_req (uct_btl, endpoint, conn_tl_context, request, request_length);
 321     if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
 322         OBJ_RELEASE(endpoint->conn_ep);
 323         uct_ep_destroy (tl_endpoint->uct_ep);
 324         tl_endpoint->uct_ep = NULL;
 325         return OPAL_ERROR;
 326     }
 327 
 328     return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS : OPAL_ERR_OUT_OF_RESOURCE;
 329 }
 330 
/**
 * Start or continue connecting one transport endpoint of a BTL endpoint.
 *
 * Reads the peer's modex, finds the entry for this module's memory domain,
 * and connects either directly to the remote interface or through the
 * connection transport, depending on what the transport requires.
 *
 * @param uct_btl     module being connected
 * @param endpoint    BTL endpoint for the peer
 * @param context_id  index of the device context to connect
 * @param ep_addr     remote endpoint address, or NULL when none has been received
 * @param tl_index    index of the transport endpoint slot (must be less than 2)
 *
 * @return OPAL_SUCCESS if the endpoint is (already) ready,
 *         OPAL_ERR_UNREACH if there is no matching local transport or no
 *         matching transport data in the modex,
 *         OPAL_ERR_OUT_OF_RESOURCE if the request duplicates one in progress,
 *         otherwise the result of the modex read or of the connection helper
 */
int mca_btl_uct_endpoint_connect (mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint, int context_id,
                                  void *ep_addr, int tl_index)
{
    mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[context_id] + tl_index;
    /* tl_index selects the RDMA transport when it matches; otherwise the active-message one */
    mca_btl_uct_tl_t *tl = (uct_btl->rdma_tl && tl_index == uct_btl->rdma_tl->tl_index) ?
        uct_btl->rdma_tl : uct_btl->am_tl;
    mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_tl_context_specific (uct_btl, tl, context_id);
    uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data;
    mca_btl_uct_connection_ep_t *conn_ep = NULL;
    mca_btl_uct_modex_t *modex;
    uint8_t *modex_data;
    size_t msg_size;
    int rc;

    /* only two types of endpoints at this time */
    assert (tl_index < 2);

    if (OPAL_UNLIKELY(NULL == tl)) {
        return OPAL_ERR_UNREACH;
    }

    BTL_VERBOSE(("checking endpoint %p with context id %d. cached uct ep: %p, ready: %d", (void *) endpoint, context_id,
                 (void *) tl_endpoint->uct_ep, !!(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags)));

    opal_mutex_lock (&endpoint->ep_lock);
    if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) {
        opal_mutex_unlock (&endpoint->ep_lock);
        /* nothing more to do. someone else completed the connection */
        return OPAL_SUCCESS;
    }

    /* duplicate connection request. nothing to do until the endpoint data is received */
    if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) {
        opal_mutex_unlock (&endpoint->ep_lock);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* single-pass block: "break" jumps to the common cleanup below */
    do {
        /* read the modex. this is done both to start the connection and to process endpoint data */
        OPAL_MODEX_RECV(rc, &mca_btl_uct_component.super.btl_version,
                        &endpoint->ep_proc->proc_name, (void **)&modex, &msg_size);
        if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
            BTL_ERROR(("error receiving modex"));
            break;
        }

        BTL_VERBOSE(("received modex of size %lu for proc %s. module count %d", (unsigned long) msg_size,
                     OPAL_NAME_PRINT(endpoint->ep_proc->proc_name), modex->module_count));
        modex_data = modex->data;

        /* look for matching transport in the modex */
        for (int i = 0 ; i < modex->module_count ; ++i) {
            /* each module entry starts with its 32-bit size followed by the md name */
            uint32_t modex_size = *((uint32_t *) modex_data);

            BTL_VERBOSE(("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name));

            modex_data += 4;

            if (0 != strcmp ((char *) modex_data, uct_btl->md_name)) {
                /* modex belongs to a different module, skip it and continue */
                modex_data += modex_size - 4;
                continue;
            }

            /* step over the md name and its terminating NUL */
            modex_data += strlen ((char *) modex_data) + 1;

            mca_btl_uct_process_modex (uct_btl, modex_data, &rdma_tl_data, &am_tl_data, &conn_tl_data);
            break;
        }

        tl_data = (tl == uct_btl->rdma_tl) ? rdma_tl_data : am_tl_data;

        /* still NULL when no module entry matched or the peer has no data for this transport */
        if (NULL == tl_data) {
            opal_mutex_unlock (&endpoint->ep_lock);
            return OPAL_ERR_UNREACH;
        }

        /* connect the endpoint */
        if (!mca_btl_uct_tl_requires_connection_tl (tl)) {
            rc = mca_btl_uct_endpoint_connect_iface (uct_btl, tl, tl_context, tl_endpoint, tl_data);
        } else {
            rc = mca_btl_uct_endpoint_connect_endpoint (uct_btl, endpoint, tl, tl_context, tl_endpoint,
                                                        tl_data, conn_tl_data, ep_addr);
        }

    } while (0);

    /* to avoid a possible hold-and wait deadlock. destroy the endpoint after dropping the endpoint lock. */
    /* the connection endpoint is only detached when this is the last reference to it */
    if (endpoint->conn_ep && 1 == endpoint->conn_ep->super.obj_reference_count) {
        conn_ep = endpoint->conn_ep;
        endpoint->conn_ep = NULL;
    }

    opal_mutex_unlock (&endpoint->ep_lock);

    if (conn_ep) {
        OBJ_RELEASE(conn_ep);
    }

    BTL_VERBOSE(("endpoint%s ready for use", (OPAL_ERR_OUT_OF_RESOURCE != rc) ? "" : " not yet"));

    return rc;
}

/* [<][>][^][v][top][bottom][index][help] */