1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ 2 /* 3 * Copyright (c) 2004-2006 The Regents of the University of California. 4 * All rights reserved. 5 * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. 6 * Copyright (c) 2015 Los Alamos National Security, LLC. All rights 7 * reserved. 8 * Copyright (c) 2017 Intel, Inc. All rights reserved 9 * $COPYRIGHT$ 10 * 11 * Additional copyrights may follow 12 * 13 * $HEADER$ 14 */ 15 16 /** 17 * @file 18 * 19 * Matching Transport Layer 20 * 21 * The Matching Transport Layer (MTL) provides device-layer support 22 * for transfer of MPI point-to-point messages over devices that 23 * support hardware / library message matching. This layer is used 24 * with the MTL PML component to provide lowest latency and highest 25 * bandwidth on given architectures. Features found in other PML 26 * interfaces, such as message fragmenting, multi-device support, and 27 * NIC failover are not provided by the upper layers. 28 * 29 * In general, this interface should not be used for transport layer 30 * support. Instead, the BTL interface should be used. The BTL 31 * interface allows for multiplexing between multiple users 32 * (point-to-point, one-sided, etc.) and provides many features not 33 * found in this interface (RDMA from arbitrary buffers, active 34 * messaging, reasonable pinned memory caching, etc.) 35 */ 36 37 #ifndef OMPI_MTL_H 38 #define OMPI_MTL_H 39 40 #include "ompi_config.h" 41 #include "mpi.h" /* needed for MPI_ANY_TAG */ 42 #include "ompi/mca/mca.h" 43 #include "ompi/mca/pml/pml_constants.h" /* for send_mode enum */ 44 #include "ompi/request/request.h" 45 46 BEGIN_C_DECLS 47 48 struct ompi_request_t; 49 struct opal_convertor_t; 50 51 struct mca_mtl_base_module_t; 52 53 struct mca_mtl_request_t { 54 /** pointer to associated ompi_request_t */ 55 struct ompi_request_t *ompi_req; 56 void (*completion_callback)(struct mca_mtl_request_t* mtl_request); 57 }; 58 typedef struct mca_mtl_request_t mca_mtl_request_t; 59 60 61 /** 62 * MTL module flags 63 */ 64 #define MCA_MTL_BASE_FLAG_REQUIRE_WORLD 0x00000001 65 #if OPAL_CUDA_SUPPORT 66 #define MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE 0x00000002 67 #endif 68 69 /** 70 * Initialization routine for MTL component 71 * 72 * Initialization routine for MTL component. This function should 73 * allocate resources for communication and try to do all local setup. 74 * It should not attempt to contact it's peers, as that should be 75 * done at add_procs time. Contact information should be published 76 * during this initialization function. It will be made available 77 * during add_procs(). 78 * 79 * @param enable_progress_threads (IN) Progress threads have been 80 * enabled by the user and the component must be 81 * capable of making asycnhronous progress (either 82 * with its own thread, with the kernel, or with 83 * the event library. 84 * @param enable_mpi_threads (IN) MPI threads have been enabled by the 85 * user and the component must be capable of coping 86 * with threads. If the component can cope with 87 * MPI_THREAD_MULTIPLE, enable_mpi_thread_multiple 88 * should be set to true. Otherwise, it is assumed 89 * that only THREAD_FUNNELLED and THREAD_SERIALIZED 90 * can be used. 91 * @param enable_mpi_thread_multiple (OUT) Component does / does not 92 * support MPI_THREAD_MULTIPLE. This variable only 93 * needs to be set if enable_mpi_threads is true. 94 * Otherwise, the return value will be ignored. 95 * 96 * @retval NULL component can not operate on the current machine 97 * @retval non-NULL component interface function 98 */ 99 typedef struct mca_mtl_base_module_t* 100 (*mca_mtl_base_component_init_fn_t)(bool enable_progress_threads, 101 bool enable_mpi_threads); 102 103 104 struct mca_mtl_base_component_2_0_0_t { 105 mca_base_component_t mtl_version; 106 mca_base_component_data_t mtl_data; 107 mca_mtl_base_component_init_fn_t mtl_init; 108 }; 109 typedef struct mca_mtl_base_component_2_0_0_t mca_mtl_base_component_2_0_0_t; 110 typedef struct mca_mtl_base_component_2_0_0_t mca_mtl_base_component_t; 111 112 113 /** 114 * MCA->MTL Clean up any resources held by MTL module 115 * 116 * Opposite of module_init. Called when communication will no longer 117 * be necessary. ussually this is during MPI_FINALIZE, but it can be 118 * earlier if the component was not selected to run. Assuming 119 * module_init was called, finalize will always be called before the 120 * component_close function is called. 121 * 122 * @param mtl (IN) MTL module returned from call to initialize 123 * 124 * @retval OMPI_SUCCESS cleanup finished successfully 125 * @retval other failure during cleanup 126 * 127 */ 128 typedef int (*mca_mtl_base_module_finalize_fn_t)(struct mca_mtl_base_module_t* mtl); 129 130 131 /** 132 * PML->MTL notification of change in the process list. 133 * 134 * The mca_mtl_base_module_add_procs_fn_t() is used by the PML to 135 * notify the MTL that new processes are connected to the current 136 * process. Any addressing information exported by the peer via the 137 * ompi_modex_send() function should be available during this 138 * call via the corresponding ompi_modex_recv() function. The 139 * MTL may utilize this information to determine reachability of each 140 * peer process. 141 * 142 * It is an error for a proc to not be reachable by the given MTL, and 143 * an error should be returned if that case is detected. If a MTL 144 * requires per-endpoint data, it must handle storage, either using a 145 * static endpoint tag (MTL is the default tag that should generally 146 * be used) or a dynamic endpoint tag (although it should be noted 147 * that OMPI can be built without dynamic endpoint tag support). 148 * 149 * @param mtl (IN) MTL module 150 * @param nprocs (IN) Number of processes 151 * @param procs (IN) Set of processes 152 * 153 * @retval OMPI_SUCCESS successfully connected to processes 154 * @retval other failure during setup 155 */ 156 typedef int (*mca_mtl_base_module_add_procs_fn_t)( 157 struct mca_mtl_base_module_t* mtl, 158 size_t nprocs, 159 struct ompi_proc_t** procs); 160 161 162 /** 163 * Notification of change to the process list. 164 * 165 * When the process list changes, the PML notifies the MTL of the 166 * change, to provide the opportunity to cleanup or release any 167 * resources associated with the peer. The MTL is responsible for 168 * releasing any memory associated with the endpoint data it may have 169 * stored during add_procs(). 170 * 171 * @param mtl (IN) MTL module 172 * @param nprocs (IN) Number of processes 173 * @param proc (IN) Set of processes 174 * @param peer (IN) Set of peer addressing information. 175 * 176 * @return Status indicating if cleanup was successful 177 */ 178 typedef int (*mca_mtl_base_module_del_procs_fn_t)( 179 struct mca_mtl_base_module_t* mtl, 180 size_t nprocs, 181 struct ompi_proc_t** procs); 182 183 184 /** 185 * Blocking send to peer 186 * 187 * Blocking send (Call should not return until the user buffer may be 188 * used again). Standard MPI semantics must be met by this call, as 189 * mandated in the mode argument. There is one special mode argument, 190 * MCA_PML_BASE_SEND_COMPLETE, which requires local completion before 191 * the function can return. This is an optimization for coillective 192 * routines that can otherwise lead to degenerate performance for 193 * broadcast-based collectives. 194 * 195 * @param comm (IN) Communicator used for operation 196 * @param dest (IN) Destination rank for send (relative to comm) 197 * @param tag (IN) MPI tag used for sending. See note below. 198 * @param convertor (IN) Datatype convertor describing send datatype. 199 * Already prepared for send. 200 * @param mode (IN) Mode for send operation 201 * 202 * @return OMPI_SUCCESS or error value 203 * 204 * \note Open MPI is built around non-blocking operations. This 205 * function is provided for networks where progressing events outside 206 * of point-to-point (for example, collectives, I/O, one-sided) can 207 * occur without a progress function regularily being triggered. 208 * 209 * \note While MPI does not allow users to specify negative tags, they 210 * are used internally in Open MPI to provide a unique channel for 211 * collective operations. Therefore, the MTL can *not* cause an error 212 * if a negative tag is used. 213 */ 214 typedef int (*mca_mtl_base_module_send_fn_t)( 215 struct mca_mtl_base_module_t* mtl, 216 struct ompi_communicator_t *comm, 217 int dest, 218 int tag, 219 struct opal_convertor_t *convertor, 220 mca_pml_base_send_mode_t mode); 221 222 223 /** 224 * Non-blocking send to peer 225 * 226 * Non-blocking send to peer. Standard MPI semantics must be met by 227 * this call, as mandated in the mode argument. There is one special 228 * mode argument, MCA_PML_BASE_SEND_COMPLETE, which requires local 229 * completion before the request is marked as complete. 230 * 231 * The PML will handle creation of the request, leaving the number of 232 * bytes requested in the module structure available for the MTL 233 * directly after the ompi_request_t structure. The PML will handle 234 * proper destruction of the request once it can safely be destructed 235 * (it has been completed and freeed by a call to REQUEST_FReE or 236 * TEST/WAIT). The MTL should remove all resources associated with 237 * the request when it is marked as completed. 238 * 239 * @param comm (IN) Communicator used for operation 240 * @param dest (IN) Destination rank for send (relative to comm) 241 * @param tag (IN) MPI tag used for sending. See note below. 242 * @param convertor (IN) Datatype convertor describing send datatype. 243 * Already prepared for send. 244 * @param mode (IN) Mode for send operation (see pml.h) 245 * @param blocking (IN) True if the call originated from a blocking 246 * call, but the PML decided to use a 247 * non-blocking operation, likely for 248 * internal performance decisions This is an 249 * optimization flag and is not needed for 250 * correctness. 251 * @param mtl_request (IN) Pointer to mtl_request. The ompi_req field 252 * will be populated with an initialized 253 * ompi_request_t before calling. 254 * 255 * @return OMPI_SUCCESS or error value 256 * 257 * \note While MPI does not allow users to specify negative tags, they 258 * are used internally in Open MPI to provide a unique channel for 259 * collective operations. Therefore, the MTL can *not* cause an error 260 * if a negative tag is used. 261 */ 262 typedef int (*mca_mtl_base_module_isend_fn_t)( 263 struct mca_mtl_base_module_t* mtl, 264 struct ompi_communicator_t *comm, 265 int dest, 266 int tag, 267 struct opal_convertor_t *convertor, 268 mca_pml_base_send_mode_t mode, 269 bool blocking, 270 mca_mtl_request_t *mtl_request); 271 272 273 /** 274 * Non-blocking receive 275 * 276 * Non-blocking receive function. Standard MPI semantics for 277 * MPI_Irecv must be implemented by this call. 278 * 279 * The PML will handle creation of the request, leaving the number of 280 * bytes requested in teh module structure available for the MTL, 281 * directly after the ompi_request_t structure. The PML will handle 282 * proper destruction of the request once it can safely be destroyed 283 * (it has been completed and free'ed by a call to REQUEST_FREE or 284 * TEST/WAIT). The MTL should remove all resources associated with 285 * the request when it is marked as completed. 286 * 287 * @param comm (IN) Communicator used for operation 288 * @param src (IN) Source rank for send (relative to comm) 289 * @param tag (IN) MPI tag used for sending. See note below. 290 * @param convertor (IN) Datatype convertor describing receive datatype. 291 * Already prepared for receive. 292 * @param mtl_request (IN) Pointer to mtl_request. The ompi_req field 293 * will be populated with an initialized 294 * ompi_request_t before calling. 295 * 296 * @return OMPI_SUCCESS or error value 297 * 298 * \note While MPI does not allow users to specify negative tags, they 299 * are used internally in Open MPI to provide a unique channel for 300 * collective operations. Therefore, the MTL can *not* cause an error 301 * if a negative tag is used. Further, MPI_ANY_TAG should *not* match 302 * against negative tags. 303 */ 304 typedef int (*mca_mtl_base_module_irecv_fn_t)( 305 struct mca_mtl_base_module_t* mtl, 306 struct ompi_communicator_t *comm, 307 int src, 308 int tag, 309 struct opal_convertor_t *convertor, 310 struct mca_mtl_request_t *mtl_request); 311 312 313 /** 314 * Non-blocking probe 315 * 316 * Non-blocking probe function. Standard MPI semantics for MPI_IPROBE 317 * must be implemented by this call. 318 * 319 * @param comm (IN) Communicator used for operation 320 * @param src (IN) Source rank for send (relative to comm) 321 * @param tag (IN) MPI tag used for sending. See note below. 322 * @param flag (OUT) true if message available, false otherwise 323 * @param status (OUT) Status structure for information on 324 * available message 325 * 326 * \note While MPI does not allow users to specify negative tags, they 327 * are used internally in Open MPI to provide a unique channel for 328 * collective operations. Therefore, the MTL can *not* cause an error 329 * if a negative tag is used. Further, MPI_ANY_TAG should *not* match 330 * against negative tags. 331 */ 332 typedef int (*mca_mtl_base_module_iprobe_fn_t)( 333 struct mca_mtl_base_module_t* mtl, 334 struct ompi_communicator_t *comm, 335 int src, 336 int tag, 337 int *flag, 338 struct ompi_status_public_t *status); 339 340 341 typedef int (*mca_mtl_base_module_imrecv_fn_t)(struct mca_mtl_base_module_t* mtl, 342 struct opal_convertor_t *convertor, 343 struct ompi_message_t **message, 344 struct mca_mtl_request_t *mtl_request); 345 346 typedef int (*mca_mtl_base_module_improbe_fn_t)(struct mca_mtl_base_module_t *mtl, 347 struct ompi_communicator_t *comm, 348 int src, 349 int tag, 350 int *matched, 351 struct ompi_message_t **message, 352 struct ompi_status_public_t *status); 353 354 /** 355 * Cancel an existing request 356 * 357 * Attempt to cancel an existing request. The (poorly defined) 358 * semantics for MPI_CANCEL must be implemented by this call. This, 359 * of course, allows the MTL module to do nothing at all. 360 * Implementations of the MTL should make a good faith effort to 361 * cancel receive requests that have not been started, as the "post a 362 * receive for control messages" paradigm is a common one in loosely 363 * coupled MPI applications. 364 * 365 * @param request(IN) Request that should be cancelled 366 * @param flag Unknown exactly what this does. 367 * 368 */ 369 typedef int (*mca_mtl_base_module_cancel_fn_t)( 370 struct mca_mtl_base_module_t* mtl, 371 mca_mtl_request_t *mtl_request, 372 int flag); 373 374 375 /** 376 * Downcall from PML layer when a new communicator is created. 377 * 378 * @param comm Communicator 379 * @return OMPI_SUCCESS or failure status. 380 * 381 * Provides the MTL the opportunity to initialize/cache a data structure 382 * on the communicator. 383 */ 384 typedef int (*mca_mtl_base_module_add_comm_fn_t)( 385 struct mca_mtl_base_module_t* mtl, 386 struct ompi_communicator_t* comm); 387 388 389 /** 390 * Downcall from PML layer when a communicator is destroyed. 391 * 392 * @param comm Communicator 393 * @return OMPI_SUCCESS or failure status. 394 * 395 * Provides the MTL the opportunity to cleanup any datastructures 396 * associated with the communicator. 397 */ 398 typedef int (*mca_mtl_base_module_del_comm_fn_t)( 399 struct mca_mtl_base_module_t* mtl, 400 struct ompi_communicator_t* comm); 401 402 403 /** 404 * MTL module interface functions and attributes. 405 */ 406 struct mca_mtl_base_module_t { 407 int mtl_max_contextid; /**< maximum allowable contextid */ 408 int mtl_max_tag; /**< maximum tag value. note that negative tags must be allowed */ 409 size_t mtl_request_size; /**< number of bytes to reserve with request structure */ 410 411 uint32_t mtl_flags; /**< flags (put/get...) */ 412 413 /* MTL function table */ 414 mca_mtl_base_module_add_procs_fn_t mtl_add_procs; 415 mca_mtl_base_module_del_procs_fn_t mtl_del_procs; 416 mca_mtl_base_module_finalize_fn_t mtl_finalize; 417 418 mca_mtl_base_module_send_fn_t mtl_send; 419 mca_mtl_base_module_isend_fn_t mtl_isend; 420 mca_mtl_base_module_irecv_fn_t mtl_irecv; 421 mca_mtl_base_module_iprobe_fn_t mtl_iprobe; 422 mca_mtl_base_module_imrecv_fn_t mtl_imrecv; 423 mca_mtl_base_module_improbe_fn_t mtl_improbe; 424 425 /* Optional MTL functions */ 426 mca_mtl_base_module_cancel_fn_t mtl_cancel; 427 mca_mtl_base_module_add_comm_fn_t mtl_add_comm; 428 mca_mtl_base_module_del_comm_fn_t mtl_del_comm; 429 }; 430 typedef struct mca_mtl_base_module_t mca_mtl_base_module_t; 431 432 /* 433 * Macro for use in modules that are of type mtl 434 */ 435 #define MCA_MTL_BASE_VERSION_2_0_0 \ 436 OMPI_MCA_BASE_VERSION_2_1_0("mtl", 2, 0, 0) 437 438 OMPI_DECLSPEC extern mca_mtl_base_module_t *ompi_mtl; 439 440 /* 441 * macro for doing direct call / call through struct 442 */ 443 #if MCA_ompi_mtl_DIRECT_CALL 444 445 446 #define OMPI_MTL_CALL_STAMP(a, b) ompi_mtl_ ## a ## _ ## b 447 #define OMPI_MTL_CALL_EXPANDER(a, b) OMPI_MTL_CALL_STAMP(a,b) 448 #define OMPI_MTL_CALL(a) OMPI_MTL_CALL_EXPANDER(MCA_ompi_mtl_DIRECT_CALL_COMPONENT, a) 449 450 #include MCA_ompi_mtl_DIRECT_CALL_HEADER 451 452 #else 453 #define OMPI_MTL_CALL(a) ompi_mtl->mtl_ ## a 454 #endif 455 456 457 END_C_DECLS 458 #endif