1 /* 2 * Copyright (c) 2013-2018 Intel, Inc. All rights reserved. 3 * Copyright (c) 2016 Research Organization for Information Science 4 * and Technology (RIST). All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are 8 * met: 9 * 10 * - Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * - Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer listed 15 * in this license in the documentation and/or other materials 16 * provided with the distribution. 17 * 18 * - Neither the name of the copyright holders nor the names of its 19 * contributors may be used to endorse or promote products derived from 20 * this software without specific prior written permission. 21 * 22 * The copyright holders provide no reassurances that the source code 23 * provided does not infringe any patent, copyright, or any other 24 * intellectual property rights of third parties. The copyright holders 25 * disclaim any liability to any recipient for claims brought against 26 * recipient by any third party for infringement of that parties 27 * intellectual property rights. 28 * 29 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 30 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 31 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 32 * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 33 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 34 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 35 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 36 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 37 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 38 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 39 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 40 * 41 * $COPYRIGHT$ 42 * 43 * Additional copyrights may follow 44 * 45 * $HEADER$ 46 */ 47 48 #ifndef PMIx_H 49 #define PMIx_H 50 51 /* Structure and constant definitions */ 52 #include <pmix_common.h> 53 54 #if defined(c_plusplus) || defined(__cplusplus) 55 extern "C" { 56 #endif 57 58 /**** PMIX API ****/ 59 60 /* Initialize the PMIx client, returning the process identifier assigned 61 * to this client's application in the provided pmix_proc_t struct. 62 * Passing a parameter of _NULL_ for this parameter is allowed if the user 63 * wishes solely to initialize the PMIx system and does not require 64 * return of the identifier at that time. 65 * 66 * When called the PMIx client will check for the required connection 67 * information of the local PMIx server and will establish the connection. 68 * If the information is not found, or the server connection fails, then 69 * an appropriate error constant will be returned. 70 * 71 * If successful, the function will return PMIX_SUCCESS and will fill the 72 * provided structure with the server-assigned namespace and rank of the 73 * process within the application. 74 * 75 * Note that the PMIx client library is referenced counted, and so multiple 76 * calls to PMIx_Init are allowed. Thus, one way to obtain the namespace and 77 * rank of the process is to simply call PMIx_Init with a non-NULL parameter. 
78 * 79 * The info array is used to pass user requests pertaining to the init 80 * and subsequent operations. Pass a _NULL_ value for the array pointer 81 * is supported if no directives are desired. 82 */ 83 PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, 84 pmix_info_t info[], size_t ninfo); 85 86 /* Finalize the PMIx client, closing the connection to the local server. 87 * An error code will be returned if, for some reason, the connection 88 * cannot be closed. 89 * 90 * The info array is used to pass user requests regarding the finalize 91 * operation. This can include: 92 * 93 * (a) PMIX_EMBED_BARRIER - By default, PMIx_Finalize does not include an 94 * internal barrier operation. This attribute directs PMIx_Finalize to 95 * execute a barrier as part of the finalize operation. 96 */ 97 PMIX_EXPORT pmix_status_t PMIx_Finalize(const pmix_info_t info[], size_t ninfo); 98 99 100 /* Returns _true_ if the PMIx client has been successfully initialized, 101 * returns _false_ otherwise. Note that the function only reports the 102 * internal state of the PMIx client - it does not verify an active 103 * connection with the server, nor that the server is functional. */ 104 PMIX_EXPORT int PMIx_Initialized(void); 105 106 107 /* Request that the provided array of procs be aborted, returning the 108 * provided _status_ and printing the provided message. A _NULL_ 109 * for the proc array indicates that all processes in the caller's 110 * nspace are to be aborted. 111 * 112 * The response to this request is somewhat dependent on the specific resource 113 * manager and its configuration (e.g., some resource managers will 114 * not abort the application if the provided _status_ is zero unless 115 * specifically configured to do so), and thus lies outside the control 116 * of PMIx itself. However, the client will inform the RM of 117 * the request that the application be aborted, regardless of the 118 * value of the provided _status_. 
119 * 120 * Passing a _NULL_ msg parameter is allowed. Note that race conditions 121 * caused by multiple processes calling PMIx_Abort are left to the 122 * server implementation to resolve with regard to which status is 123 * returned and what messages (if any) are printed. */ 124 PMIX_EXPORT pmix_status_t PMIx_Abort(int status, const char msg[], 125 pmix_proc_t procs[], size_t nprocs); 126 127 128 /* Push a value into the client's namespace. The client library will cache 129 * the information locally until _PMIx_Commit_ is called. The provided scope 130 * value is passed to the local PMIx server, which will distribute the data 131 * as directed. */ 132 PMIX_EXPORT pmix_status_t PMIx_Put(pmix_scope_t scope, const pmix_key_t key, pmix_value_t *val); 133 134 135 /* Push all previously _PMIx_Put_ values to the local PMIx server. 136 * This is an asynchronous operation - the library will immediately 137 * return to the caller while the data is transmitted to the local 138 * server in the background */ 139 PMIX_EXPORT pmix_status_t PMIx_Commit(void); 140 141 142 /* Execute a blocking barrier across the processes identified in the 143 * specified array. Passing a _NULL_ pointer as the _procs_ parameter 144 * indicates that the barrier is to span all processes in the client's 145 * namespace. Each provided pmix_proc_t struct can pass PMIX_RANK_WILDCARD to 146 * indicate that all processes in the given namespace are 147 * participating. 148 * 149 * The info array is used to pass user requests regarding the fence 150 * operation. This can include: 151 * 152 * (a) PMIX_COLLECT_DATA - a boolean indicating whether or not the barrier 153 * operation is to return the _put_ data from all participating processes. 154 * A value of _false_ indicates that the callback is just used as a release 155 * and no data is to be returned at that time. A value of _true_ indicates 156 * that all _put_ data is to be collected by the barrier. 
Returned data is 157 * cached at the server to reduce memory footprint, and can be retrieved 158 * as needed by calls to PMIx_Get(nb). 159 * 160 * Note that for scalability reasons, the default behavior for PMIx_Fence 161 * is to _not_ collect the data. 162 * 163 * (b) PMIX_COLLECTIVE_ALGO - a comma-delimited string indicating the algos 164 * to be used for executing the barrier, in priority order. 165 * 166 * (c) PMIX_COLLECTIVE_ALGO_REQD - instructs the host RM that it should return 167 * an error if none of the specified algos are available. Otherwise, the RM 168 * is to use one of the algos if possible, but is otherwise free to use any 169 * of its available methods to execute the operation. 170 * 171 * (d) PMIX_TIMEOUT - maximum time for the fence to execute before declaring 172 * an error. By default, the RM shall terminate the operation and notify participants 173 * if one or more of the indicated procs fails during the fence. However, 174 * the timeout parameter can help avoid "hangs" due to programming errors 175 * that prevent one or more procs from reaching the "fence". 176 */ 177 PMIX_EXPORT pmix_status_t PMIx_Fence(const pmix_proc_t procs[], size_t nprocs, 178 const pmix_info_t info[], size_t ninfo); 179 180 /* Non-blocking version of PMIx_Fence. Note that the function will return 181 * an error if a _NULL_ callback function is given. */ 182 PMIX_EXPORT pmix_status_t PMIx_Fence_nb(const pmix_proc_t procs[], size_t nprocs, 183 const pmix_info_t info[], size_t ninfo, 184 pmix_op_cbfunc_t cbfunc, void *cbdata); 185 186 187 /* Retrieve information for the specified _key_ as published by the process 188 * identified in the given pmix_proc_t, returning a pointer to the value in the 189 * given address. 190 * 191 * This is a blocking operation - the caller will block until 192 * the specified data has been _PMIx_Put_ by the specified rank. The caller is 193 * responsible for freeing all memory associated with the returned value when 194 * no longer required. 
195 * 196 * The info array is used to pass user requests regarding the get 197 * operation. This can include: 198 * 199 * (a) PMIX_TIMEOUT - maximum time for the get to execute before declaring 200 * an error. The timeout parameter can help avoid "hangs" due to programming 201 * errors that prevent the target proc from ever exposing its data. 202 */ 203 PMIX_EXPORT pmix_status_t PMIx_Get(const pmix_proc_t *proc, const char key[], 204 const pmix_info_t info[], size_t ninfo, 205 pmix_value_t **val); 206 207 /* A non-blocking operation version of PMIx_Get - the callback function will 208 * be executed once the specified data has been _PMIx_Put_ 209 * by the identified process and retrieved by the local server. The info 210 * array is used as described above for the blocking form of this call. */ 211 PMIX_EXPORT pmix_status_t PMIx_Get_nb(const pmix_proc_t *proc, const pmix_key_t key, 212 const pmix_info_t info[], size_t ninfo, 213 pmix_value_cbfunc_t cbfunc, void *cbdata); 214 215 216 /* Publish the data in the info array for lookup. By default, 217 * the data will be published into the PMIX_SESSION range and 218 * with PMIX_PERSIST_APP persistence. Changes to those values, 219 * and any additional directives, can be included in the pmix_info_t 220 * array. 221 * 222 * Note that the keys must be unique within the specified 223 * data range or else an error will be returned (first published 224 * wins). Attempts to access the data by procs outside of 225 * the provided data range will be rejected. 226 * 227 * The persistence parameter instructs the server as to how long 228 * the data is to be retained. 229 * 230 * The blocking form will block until the server confirms that the 231 * data has been posted and is available. The non-blocking form will 232 * return immediately, executing the callback when the server confirms 233 * availability of the data. 
234 */ 235 PMIX_EXPORT pmix_status_t PMIx_Publish(const pmix_info_t info[], size_t ninfo); 236 PMIX_EXPORT pmix_status_t PMIx_Publish_nb(const pmix_info_t info[], size_t ninfo, 237 pmix_op_cbfunc_t cbfunc, void *cbdata); 238 239 240 /* Lookup information published by this or another process. By default, 241 * the search will be conducted across the PMIX_SESSION range. Changes 242 * to the range, and any additional directives, can be provided 243 * in the pmix_info_t array. Note that the search is also constrained 244 * to only data published by the current user ID - i.e., the search 245 * will not return data published by an application being executed 246 * by another user. There currently is no option to override this 247 * behavior - such an option may become available later via an 248 * appropriate pmix_info_t directive. 249 * 250 * The "data" parameter consists of an array of pmix_pdata_t struct with the 251 * keys specifying the requested information. Data will be returned 252 * for each key in the associated info struct - any key that cannot 253 * be found will return with a data type of "PMIX_UNDEF". The function 254 * will return SUCCESS if _any_ values can be found, so the caller 255 * must check each data element to ensure it was returned. 256 * 257 * The proc field in each pmix_pdata_t struct will contain the 258 * nspace/rank of the process that published the data. 259 * 260 * Note: although this is a blocking function, it will _not_ wait 261 * by default for the requested data to be published. Instead, it 262 * will block for the time required by the server to lookup its current 263 * data and return any found items. Thus, the caller is responsible for 264 * ensuring that data is published prior to executing a lookup, or 265 * for retrying until the requested data is found 266 * 267 * Optionally, the info array can be used to modify this behavior 268 * by including: 269 * 270 * (a) PMIX_WAIT - wait for the requested data to be published. 
The 271 * server is to wait until all data has become available. 272 * 273 * (b) PMIX_TIMEOUT - max time to wait for data to become available. 274 * 275 */ 276 PMIX_EXPORT pmix_status_t PMIx_Lookup(pmix_pdata_t data[], size_t ndata, 277 const pmix_info_t info[], size_t ninfo); 278 279 /* Non-blocking form of the _PMIx_Lookup_ function. Data for 280 * the provided NULL-terminated keys array will be returned 281 * in the provided callback function. As above, the default 282 * behavior is to _not_ wait for data to be published. The 283 * info keys can be used to modify the behavior as previously 284 * described */ 285 PMIX_EXPORT pmix_status_t PMIx_Lookup_nb(char **keys, const pmix_info_t info[], size_t ninfo, 286 pmix_lookup_cbfunc_t cbfunc, void *cbdata); 287 288 289 /* Unpublish data posted by this process using the given keys. 290 * The function will block until the data has been removed by 291 * the server. A value of _NULL_ for the keys parameter instructs 292 * the server to remove _all_ data published by this process. 293 * 294 * By default, the range is assumed to be PMIX_SESSION. Changes 295 * to the range, and any additional directives, can be provided 296 * in the pmix_info_t array */ 297 PMIX_EXPORT pmix_status_t PMIx_Unpublish(char **keys, 298 const pmix_info_t info[], size_t ninfo); 299 300 /* Non-blocking form of the _PMIx_Unpublish_ function. The 301 * callback function will be executed once the server confirms 302 * removal of the specified data. */ 303 PMIX_EXPORT pmix_status_t PMIx_Unpublish_nb(char **keys, 304 const pmix_info_t info[], size_t ninfo, 305 pmix_op_cbfunc_t cbfunc, void *cbdata); 306 307 308 /* Spawn a new job. The assigned namespace of the spawned applications 309 * is returned in the nspace parameter - a _NULL_ value in that 310 * location indicates that the caller doesn't wish to have the 311 * namespace returned. The nspace array must be at least of size 312 * PMIX_MAX_NSLEN+1. 
Behavior of individual resource managers 313 * may differ, but it is expected that failure of any application 314 * process to start will result in termination/cleanup of _all_ 315 * processes in the newly spawned job and return of an error 316 * code to the caller. 317 * 318 * By default, the spawned processes will be PMIx "connected" to 319 * the parent process upon successful launch (see PMIx_Connect 320 * description for details). Note that this only means that the 321 * parent process (a) will be given a copy of the new job's 322 * information so it can query job-level info without 323 * incurring any communication penalties, and (b) will receive 324 * notification of errors from process in the child job. 325 * 326 * Job-level directives can be specified in the job_info array. This 327 * can include: 328 * 329 * (a) PMIX_NON_PMI - processes in the spawned job will 330 * not be calling PMIx_Init 331 * 332 * (b) PMIX_TIMEOUT - declare the spawn as having failed if the launched 333 * procs do not call PMIx_Init within the specified time 334 * 335 * (c) PMIX_NOTIFY_COMPLETION - notify the parent process when the 336 * child job terminates, either normally or with error 337 */ 338 PMIX_EXPORT pmix_status_t PMIx_Spawn(const pmix_info_t job_info[], size_t ninfo, 339 const pmix_app_t apps[], size_t napps, 340 pmix_nspace_t nspace); 341 342 343 /* Non-blocking form of the _PMIx_Spawn_ function. The callback 344 * will be executed upon launch of the specified applications, 345 * or upon failure to launch any of them. */ 346 PMIX_EXPORT pmix_status_t PMIx_Spawn_nb(const pmix_info_t job_info[], size_t ninfo, 347 const pmix_app_t apps[], size_t napps, 348 pmix_spawn_cbfunc_t cbfunc, void *cbdata); 349 350 /* Record the specified processes as "connected". Both blocking and non-blocking 351 * versions are provided. 
This means that the resource manager should treat the 352 * failure of any process in the specified group as a reportable event, and take 353 * appropriate action. Note that different resource managers may respond to 354 * failures in different manners. 355 * 356 * The callback function is to be called once all participating processes have 357 * called connect. The server is required to return any job-level info for the 358 * connecting processes that might not already have - i.e., if the connect 359 * request involves procs from different nspaces, then each proc shall receive 360 * the job-level info from those nspaces other than their own. 361 * 362 * Note: a process can only engage in _one_ connect operation involving the identical 363 * set of processes at a time. However, a process _can_ be simultaneously engaged 364 * in multiple connect operations, each involving a different set of processes 365 * 366 * As in the case of the fence operation, the info array can be used to pass 367 * user-level directives regarding the algorithm to be used for the collective 368 * operation involved in the "connect", timeout constraints, and other options 369 * available from the host RM */ 370 PMIX_EXPORT pmix_status_t PMIx_Connect(const pmix_proc_t procs[], size_t nprocs, 371 const pmix_info_t info[], size_t ninfo); 372 373 PMIX_EXPORT pmix_status_t PMIx_Connect_nb(const pmix_proc_t procs[], size_t nprocs, 374 const pmix_info_t info[], size_t ninfo, 375 pmix_op_cbfunc_t cbfunc, void *cbdata); 376 377 /* Disconnect a previously connected set of processes. An error will be returned 378 * if the specified set of procs was not previously "connected". As above, a process 379 * may be involved in multiple simultaneous disconnect operations. However, a process 380 * is not allowed to reconnect to a set of procs that has not fully completed 381 * disconnect - i.e., you have to fully disconnect before you can reconnect to the 382 * _same_ group of processes. 
The info array is used as above. */ 383 PMIX_EXPORT pmix_status_t PMIx_Disconnect(const pmix_proc_t procs[], size_t nprocs, 384 const pmix_info_t info[], size_t ninfo); 385 386 PMIX_EXPORT pmix_status_t PMIx_Disconnect_nb(const pmix_proc_t ranges[], size_t nprocs, 387 const pmix_info_t info[], size_t ninfo, 388 pmix_op_cbfunc_t cbfunc, void *cbdata); 389 390 /* Given a node name, return an array of processes within the specified nspace 391 * on that node. If the nspace is NULL, then all processes on the node will 392 * be returned. If the specified node does not currently host any processes, 393 * then the returned array will be NULL, and nprocs=0. The caller is responsible 394 * for releasing the array when done with it - the PMIX_PROC_FREE macro is 395 * provided for this purpose. 396 */ 397 PMIX_EXPORT pmix_status_t PMIx_Resolve_peers(const char *nodename, 398 const pmix_nspace_t nspace, 399 pmix_proc_t **procs, size_t *nprocs); 400 401 402 /* Given an nspace, return the list of nodes hosting processes within 403 * that nspace. The returned string will contain a comma-delimited list 404 * of nodenames. The caller is responsible for releasing the string 405 * when done with it */ 406 PMIX_EXPORT pmix_status_t PMIx_Resolve_nodes(const pmix_nspace_t nspace, char **nodelist); 407 408 /* Query information about the system in general - can include 409 * a list of active nspaces, network topology, etc. Also can be 410 * used to query node-specific info such as the list of peers 411 * executing on a given node. We assume that the host RM will 412 * exercise appropriate access control on the information. 
413 * 414 * NOTE: there is no blocking form of this API as the structures 415 * passed to query info differ from those for receiving the results 416 * 417 * The following return status codes are provided in the callback: 418 * 419 * PMIX_SUCCESS - all data has been returned 420 * PMIX_ERR_NOT_FOUND - none of the requested data was available 421 * PMIX_ERR_PARTIAL_SUCCESS - some of the data has been returned 422 * PMIX_ERR_NOT_SUPPORTED - the host RM does not support this function 423 */ 424 PMIX_EXPORT pmix_status_t PMIx_Query_info_nb(pmix_query_t queries[], size_t nqueries, 425 pmix_info_cbfunc_t cbfunc, void *cbdata); 426 427 /* Log data to a central data service/store, subject to the 428 * services offered by the host resource manager. The data to 429 * be logged is provided in the data array. The (optional) directives 430 * can be used to request specific storage options and direct 431 * the choice of storage option. 432 * 433 * The callback function will be executed when the log operation 434 * has been completed. The data array must be maintained until 435 * the callback is provided 436 */ 437 PMIX_EXPORT pmix_status_t PMIx_Log(const pmix_info_t data[], size_t ndata, 438 const pmix_info_t directives[], size_t ndirs); 439 440 PMIX_EXPORT pmix_status_t PMIx_Log_nb(const pmix_info_t data[], size_t ndata, 441 const pmix_info_t directives[], size_t ndirs, 442 pmix_op_cbfunc_t cbfunc, void *cbdata); 443 444 /* Request an allocation operation from the host resource manager. 445 * Several broad categories are envisioned, including the ability to: 446 * 447 * - request allocation of additional resources, including memory, 448 * bandwidth, and compute. This should be accomplished in a 449 * non-blocking manner so that the application can continue to 450 * progress while waiting for resources to become available. 
Note 451 * that the new allocation will be disjoint from (i.e., not 452 * affiliated with) the allocation of the requestor - thus the 453 * termination of one allocation will not impact the other. 454 * 455 * - extend the reservation on currently allocated resources, subject 456 * to scheduling availability and priorities. This includes extending 457 * the time limit on current resources, and/or requesting additional 458 * resources be allocated to the requesting job. Any additional 459 * allocated resources will be considered as part of the current 460 * allocation, and thus will be released at the same time. 461 * 462 * - release currently allocated resources that are no longer required. 463 * This is intended to support partial release of resources since all 464 * resources are normally released upon termination of the job. The 465 * identified use-cases include resource variations across discrete steps 466 * of a workflow, as well as applications that spawn sub-jobs and/or 467 * dynamically grow/shrink over time 468 * 469 * - "lend" resources back to the scheduler with an expectation of getting 470 * them back at some later time in the job. This can be a proactive 471 * operation (e.g., to save on computing costs when resources are 472 * temporarily not required), or in response to scheduler requests in 473 * lieue of preemption. A corresponding ability to "reacquire" resources 474 * previously released is included. 475 */ 476 PMIX_EXPORT pmix_status_t PMIx_Allocation_request(pmix_alloc_directive_t directive, 477 pmix_info_t *info, size_t ninfo); 478 479 PMIX_EXPORT pmix_status_t PMIx_Allocation_request_nb(pmix_alloc_directive_t directive, 480 pmix_info_t *info, size_t ninfo, 481 pmix_info_cbfunc_t cbfunc, void *cbdata); 482 483 /* Request a job control action. The targets array identifies the 484 * processes to which the requested job control action is to be applied. 485 * A NULL value can be used to indicate all processes in the caller's 486 * nspace. 
The use of PMIX_RANK_WILDARD can also be used to indicate 487 * that all processes in the given nspace are to be included. 488 * 489 * The directives are provided as pmix_info_t structs in the directives 490 * array. The callback function provides a status to indicate whether or 491 * not the request was granted, and to provide some information as to 492 * the reason for any denial in the pmix_info_cbfunc_t array of pmix_info_t 493 * structures. If non-NULL, then the specified release_fn must be called 494 * when the callback function completes - this will be used to release 495 * any provided pmix_info_t array. 496 */ 497 PMIX_EXPORT pmix_status_t PMIx_Job_control(const pmix_proc_t targets[], size_t ntargets, 498 const pmix_info_t directives[], size_t ndirs); 499 500 PMIX_EXPORT pmix_status_t PMIx_Job_control_nb(const pmix_proc_t targets[], size_t ntargets, 501 const pmix_info_t directives[], size_t ndirs, 502 pmix_info_cbfunc_t cbfunc, void *cbdata); 503 504 /* Request that something be monitored - e.g., that the server monitor 505 * this process for periodic heartbeats as an indication that the process 506 * has not become "wedged". When a monitor detects the specified alarm 507 * condition, it will generate an event notification using the provided 508 * error code and passing along any available relevant information. It is 509 * up to the caller to register a corresponding event handler. 510 * 511 * Params: 512 * 513 * monitor: attribute indicating the type of monitor being requested - e.g., 514 * PMIX_MONITOR_FILE to indicate that the requestor is asking that 515 * a file be monitored. 516 * 517 * error: the status code to be used when generating an event notification 518 * alerting that the monitor has been triggered. 
The range of the 519 * notification defaults to PMIX_RANGE_NAMESPACE - this can be 520 * changed by providing a PMIX_RANGE directive 521 * 522 * directives: characterize the monitoring request (e.g., monitor file size) 523 * and frequency of checking to be done 524 * 525 * cbfunc: provides a status to indicate whether or not the request was granted, 526 * and to provide some information as to the reason for any denial in 527 * the pmix_info_cbfunc_t array of pmix_info_t structures. 528 * 529 * Note: a process can send a heartbeat to the server using the PMIx_Heartbeat 530 * macro provided below*/ 531 PMIX_EXPORT pmix_status_t PMIx_Process_monitor(const pmix_info_t *monitor, pmix_status_t error, 532 const pmix_info_t directives[], size_t ndirs); 533 534 PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pmix_status_t error, 535 const pmix_info_t directives[], size_t ndirs, 536 pmix_info_cbfunc_t cbfunc, void *cbdata); 537 538 /* define a special macro to simplify sending of a heartbeat */ 539 #define PMIx_Heartbeat() \ 540 do { \ 541 pmix_info_t _in; \ 542 PMIX_INFO_CONSTRUCT(&_in); \ 543 PMIX_INFO_LOAD(&_in, PMIX_SEND_HEARTBEAT, NULL, PMIX_POINTER); \ 544 PMIx_Process_monitor_nb(&_in, PMIX_SUCCESS, NULL, 0, NULL, NULL); \ 545 PMIX_INFO_DESTRUCT(&_in); \ 546 } while(0) 547 548 /* Request a credential from the PMIx server/SMS. 549 * Input values include: 550 * 551 * info - an array of pmix_info_t structures containing any directives the 552 * caller may wish to pass. 
Typical usage might include: 553 * PMIX_TIMEOUT - how long to wait (in seconds) for a credential 554 * before timing out and returning an error 555 * PMIX_CRED_TYPE - a prioritized, comma-delimited list of desired 556 * credential types for use in environments where 557 * multiple authentication mechanisms may be 558 * available 559 * 560 * ninfo - number of elements in the info array 561 * 562 * cbfunc - the pmix_credential_cbfunc_t function to be called upon completion 563 * of the request 564 * 565 * cbdata - pointer to an object to be returned when cbfunc is called 566 * 567 * Returned values: 568 * PMIX_SUCCESS - indicates that the request has been successfully communicated to 569 * the local PMIx server. The response will be coming in the provided 570 * callback function. 571 * 572 * Any other value indicates an appropriate error condition. The callback function 573 * will _not_ be called in such cases. 574 */ 575 PMIX_EXPORT pmix_status_t PMIx_Get_credential(const pmix_info_t info[], size_t ninfo, 576 pmix_credential_cbfunc_t cbfunc, void *cbdata); 577 578 579 /* Request validation of a credential by the PMIx server/SMS 580 * Input values include: 581 * 582 * cred - pointer to a pmix_byte_object_t containing the credential 583 * 584 * info - an array of pmix_info_t structures containing any directives the 585 * caller may wish to pass. 
Typical usage might include: 586 * PMIX_TIMEOUT - how long to wait (in seconds) for validation 587 * before timing out and returning an error 588 * PMIX_USERID - the expected effective userid of the credential 589 * to be validated 590 * PMIX_GROUPID - the expected effective group id of the credential 591 * to be validated 592 * 593 * ninfo - number of elements in the info array 594 * 595 * cbfunc - the pmix_validation_cbfunc_t function to be called upon completion 596 * of the request 597 * 598 * cbdata - pointer to an object to be returned when cbfunc is called 599 * 600 * Returned values: 601 * PMIX_SUCCESS - indicates that the request has been successfully communicated to 602 * the local PMIx server. The response will be coming in the provided 603 * callback function. 604 * 605 * Any other value indicates an appropriate error condition. The callback function 606 * will _not_ be called in such cases. 607 */ 608 PMIX_EXPORT pmix_status_t PMIx_Validate_credential(const pmix_byte_object_t *cred, 609 const pmix_info_t info[], size_t ninfo, 610 pmix_validation_cbfunc_t cbfunc, void *cbdata); 611 612 /* Define a callback function for delivering forwarded IO to a process 613 * This function will be called whenever data becomes available, or a 614 * specified buffering size and/or time has been met. The function 615 * will be passed the following values: 616 * 617 * iofhdlr - the returned registration number of the handler being invoked. 618 * This is required when deregistering the handler. 619 * 620 * channel - a bitmask identifying the channel the data arrived on 621 * 622 * source - the nspace/rank of the process that generated the data 623 * 624 * payload - pointer to character array containing the data. Note that 625 * multiple strings may be included, and that the array may 626 * _not_ be NULL terminated 627 * 628 * info - an optional array of info provided by the source containing 629 * metadata about the payload. 
This could include PMIX_IOF_COMPLETE 630 * 631 * ninfo - number of elements in the optional info array 632 */ 633 typedef void (*pmix_iof_cbfunc_t)(size_t iofhdlr, pmix_iof_channel_t channel, 634 pmix_proc_t *source, char *payload, 635 pmix_info_t info[], size_t ninfo); 636 637 638 /* Register to receive output forwarded from a remote process. 639 * 640 * procs - array of identifiers for sources whose IO is being 641 * requested. Wildcard rank indicates that all procs 642 * in the specified nspace are included in the request 643 * 644 * nprocs - number of identifiers in the procs array 645 * 646 * directives - optional array of attributes to control the 647 * behavior of the request. For example, this 648 * might include directives on buffering IO 649 * before delivery, and/or directives to include 650 * or exclude any backlogged data 651 * 652 * ndirs - number of elements in the directives array 653 * 654 * channel - bitmask of IO channels included in the request. 655 * NOTE: STDIN is not supported as it will always 656 * be delivered to the stdin file descriptor 657 * 658 * cbfunc - function to be called when relevant IO is received 659 * 660 * regcbfunc - since registration is async, this is the 661 * function to be called when registration is 662 * completed. The function itself will return 663 * a non-success error if the registration cannot 664 * be submitted - in this case, the regcbfunc 665 * will _not_ be called. 666 * 667 * cbdata - pointer to object to be returned in regcbfunc 668 */ 669 PMIX_EXPORT pmix_status_t PMIx_IOF_pull(const pmix_proc_t procs[], size_t nprocs, 670 const pmix_info_t directives[], size_t ndirs, 671 pmix_iof_channel_t channel, pmix_iof_cbfunc_t cbfunc, 672 pmix_hdlr_reg_cbfunc_t regcbfunc, void *regcbdata); 673 674 /* Deregister from output forwarded from a remote process. 
675 * 676 * iofhdlr - the registration number returned from the 677 * call to PMIx_IOF_pull 678 * 679 * directives - optional array of attributes to control the 680 * behavior of the request. For example, this 681 * might include directives regarding what to 682 * do with any data currently in the IO buffer 683 * for this process 684 * 685 * cbfunc - function to be called when deregistration has 686 * been completed. Note that any IO to be flushed 687 * may continue to be received after deregistration 688 * has completed. 689 * 690 * cbdata - pointer to object to be returned in cbfunc 691 */ 692 PMIX_EXPORT pmix_status_t PMIx_IOF_deregister(size_t iofhdlr, 693 const pmix_info_t directives[], size_t ndirs, 694 pmix_op_cbfunc_t cbfunc, void *cbdata); 695 696 /* Push data collected locally (typically from stdin) to 697 * stdin of target recipients. 698 * 699 * targets - array of process identifiers to which the data is to be delivered. Note 700 * that a WILDCARD rank indicates that all procs in the given nspace are 701 * to receive a copy of the data 702 * 703 * ntargets - number of procs in the targets array 704 * 705 * directives - optional array of attributes to control the 706 * behavior of the request. For example, this 707 * might include directives on buffering IO 708 * before delivery, and/or directives to include 709 * or exclude any backlogged data 710 * 711 * ndirs - number of elements in the directives array 712 * 713 * bo - pointer to a byte object containing the stdin data 714 * 715 * cbfunc - callback function when the data has been forwarded 716 * 717 * cbdata - object to be returned in cbfunc 718 */ 719 PMIX_EXPORT pmix_status_t PMIx_IOF_push(const pmix_proc_t targets[], size_t ntargets, 720 pmix_byte_object_t *bo, 721 const pmix_info_t directives[], size_t ndirs, 722 pmix_op_cbfunc_t cbfunc, void *cbdata); 723 724 /* Construct a new group composed of the specified processes and identified with 725 * the provided group identifier. 
* Both blocking and non-blocking versions
 * are provided (the callback function for the non-blocking form will be called
 * once all specified processes have joined the group). The group identifier is
 * a user-defined, NULL-terminated character array of length less than or equal
 * to PMIX_MAX_NSLEN. Only characters accepted by standard string comparison
 * functions (e.g., strncmp) are supported.
 *
 * Processes may engage in multiple simultaneous group construct operations as
 * desired so long as each is provided with a unique group ID. The info array
 * can be used to pass user-level directives regarding timeout constraints and
 * other options available from the PMIx server.
 *
 * The construct leader (if PMIX_GROUP_LEADER is provided) or all participants
 * will receive events (if registered for the PMIX_GROUP_MEMBER_FAILED event)
 * whenever a process fails or terminates prior to calling
 * PMIx_Group_construct(_nb) - the events will contain the identifier of the
 * process that failed to join plus any other information that the resource
 * manager provided. This provides an opportunity for the leader to react to
 * the event - e.g., to invite an alternative member to the group or to decide
 * to proceed with a smaller group. The decision to proceed with a smaller group
 * is communicated to the PMIx library in the results array at the end of the
 * event handler. This allows PMIx to properly adjust accounting for procedure
 * completion. When construct is complete, the participating PMIx servers will
 * be alerted to any change in participants and each group member will (if
 * registered) receive a PMIX_GROUP_MEMBERSHIP_UPDATE event updating the group
 * membership.
 *
 * Processes in a group under construction are not allowed to leave the group
 * until group construction is complete.
* Upon completion of the construct
 * procedure, each group member will have access to the job-level information
 * of all nspaces represented in the group and the contact information for
 * every group member.
 *
 * Failure of the leader at any time will cause a PMIX_GROUP_LEADER_FAILED event
 * to be delivered to all participants so they can optionally declare a new leader.
 * A new leader is identified by providing the PMIX_GROUP_LEADER attribute in
 * the results array in the return of the event handler. Only one process is
 * allowed to return that attribute, declaring itself as the new leader. Results
 * of the leader selection will be communicated to all participants via a
 * PMIX_GROUP_LEADER_SELECTED event identifying the new leader. If no leader
 * was selected, then the status code provided in the event handler will provide
 * an error value so the participants can take appropriate action.
 *
 * Any participant that returns PMIX_GROUP_CONSTRUCT_ABORT from the leader failed
 * event handler will cause the construct process to abort. Those processes
 * engaged in the blocking construct will return from the call with the
 * PMIX_GROUP_CONSTRUCT_ABORT status. Non-blocking participants will have
 * their callback function executed with that status.
 *
 * Some relevant attributes for this operation:
 *    PMIX_GROUP_LEADER - declare this process to be the leader of the construction
 *                        procedure. If a process provides this attribute, then
 *                        failure notification for any participating process will
 *                        go only to that one process. In the absence of a
 *                        declared leader, failure events go to all participants.
780 * PMIX_GROUP_OPTIONAL - participation is optional - do not return an error if 781 * any of the specified processes terminate 782 * without having joined (default=false) 783 * PMIX_GROUP_NOTIFY_TERMINATION - notify remaining members when another member 784 * terminates without first leaving the 785 * group (default=false) 786 * PMIX_GROUP_ASSIGN_CONTEXT_ID - requests that the RM assign a unique context 787 * ID (size_t) to the group. The value is returned 788 * in the PMIX_GROUP_CONSTRUCT_COMPLETE event 789 * PMIX_TIMEOUT - return an error if the group doesn't assemble within the 790 * specified number of seconds. Targets the scenario where a 791 * process fails to call PMIx_Group_connect due to hanging 792 * 793 */ 794 PMIX_EXPORT pmix_status_t PMIx_Group_construct(const char grp[], 795 const pmix_proc_t procs[], size_t nprocs, 796 const pmix_info_t directives[], size_t ndirs, 797 pmix_info_t **results, size_t *nresults); 798 799 PMIX_EXPORT pmix_status_t PMIx_Group_construct_nb(const char grp[], 800 const pmix_proc_t procs[], size_t nprocs, 801 const pmix_info_t info[], size_t ninfo, 802 pmix_info_cbfunc_t cbfunc, void *cbdata); 803 804 /* Explicitly invite specified processes to join a group. 805 * 806 * Each invited process will be notified of the invitation via the PMIX_GROUP_INVITED 807 * event. The processes being invited must have registered for the PMIX_GROUP_INVITED 808 * event in order to be notified of the invitation. When ready to respond, each invited 809 * process provides a response using the appropriate form of PMIx_Group_join. This will 810 * notify the inviting process that the invitation was either accepted (via the 811 * PMIX_GROUP_INVITE_ACCEPTED event) or declined (via the PMIX_GROUP_INVITE_DECLINED event). 812 * The inviting process will also receive PMIX_GROUP_MEMBER_FAILED events whenever a 813 * process fails or terminates prior to responding to the invitation. 
*
 * Upon accepting the invitation, both the inviting and invited process will receive
 * access to the job-level information of each other's nspaces and the contact
 * information of the other process.
 *
 * Some relevant attributes for this operation:
 *    PMIX_GROUP_ASSIGN_CONTEXT_ID - requests that the RM assign a unique context
 *                                   ID (size_t) to the group. The value is returned
 *                                   in the PMIX_GROUP_CONSTRUCT_COMPLETE event
 *    PMIX_TIMEOUT (int): return an error if the group doesn't assemble within the
 *                        specified number of seconds. Targets the scenario where a
 *                        process fails to call PMIx_Group_join due to hanging
 *
 * The inviting process is automatically considered the leader of the asynchronous
 * group construction procedure and will receive all failure or termination events
 * for invited members prior to completion. The inviting process is required to
 * provide a PMIX_GROUP_CONSTRUCT_COMPLETE event once the group has been fully
 * assembled - this event will be distributed to all participants along with the
 * final membership.
 *
 * Failure of the leader at any time will cause a PMIX_GROUP_LEADER_FAILED event
 * to be delivered to all participants so they can optionally declare a new leader.
 * A new leader is identified by providing the PMIX_GROUP_LEADER attribute in
 * the results array in the return of the event handler. Only one process is
 * allowed to return that attribute, declaring itself as the new leader. Results
 * of the leader selection will be communicated to all participants via a
 * PMIX_GROUP_LEADER_SELECTED event identifying the new leader. If no leader
 * was selected, then the status code provided in the event handler will provide
 * an error value so the participants can take appropriate action.
843 * 844 * Any participant that returns PMIX_GROUP_CONSTRUCT_ABORT from the event 845 * handler will cause all participants to receive an event notifying them 846 * of that status. 847 */ 848 PMIX_EXPORT pmix_status_t PMIx_Group_invite(const char grp[], 849 const pmix_proc_t procs[], size_t nprocs, 850 const pmix_info_t info[], size_t ninfo, 851 pmix_info_t **results, size_t *nresult); 852 853 PMIX_EXPORT pmix_status_t PMIx_Group_invite_nb(const char grp[], 854 const pmix_proc_t procs[], size_t nprocs, 855 const pmix_info_t info[], size_t ninfo, 856 pmix_info_cbfunc_t cbfunc, void *cbdata); 857 858 /* Respond to an invitation to join a group that is being asynchronously constructed. 859 * 860 * The process must have registered for the PMIX_GROUP_INVITED event in order to be 861 * notified of the invitation. When ready to respond, the process provides a response 862 * using the appropriate form of PMIx_Group_join. 863 * 864 * Critical Note: Since the process is alerted to the invitation in a PMIx event handler, 865 * the process must not use the blocking form of this call unless it first “thread shifts” 866 * out of the handler and into its own thread context. Likewise, while it is safe to call 867 * the non-blocking form of the API from the event handler, the process must not block 868 * in the handler while waiting for the callback function to be called. 869 * 870 * Calling this function causes the group “leader” to be notified that the process has 871 * either accepted or declined the request. The blocking form of the API will return 872 * once the group has been completely constructed or the group’s construction has failed 873 * (as determined by the leader) – likewise, the callback function of the non-blocking 874 * form will be executed upon the same conditions. 875 * 876 * Failure of the leader at any time will cause a PMIX_GROUP_LEADER_FAILED event 877 * to be delivered to all participants so they can optionally declare a new leader. 
878 * A new leader is identified by providing the PMIX_GROUP_LEADER attribute in 879 * the results array in the return of the event handler. Only one process is 880 * allowed to return that attribute, declaring itself as the new leader. Results 881 * of the leader selection will be communicated to all participants via a 882 * PMIX_GROUP_LEADER_SELECTED event identifying the new leader. If no leader 883 * was selected, then the status code provided in the event handler will provide 884 * an error value so the participants can take appropriate action. 885 * 886 * Any participant that returns PMIX_GROUP_CONSTRUCT_ABORT from the leader failed 887 * event handler will cause all participants to receive an event notifying them 888 * of that status. Similarly, the leader may elect to abort the procedure 889 * by either returning PMIX_GROUP_CONSTRUCT_ABORT from the handler assigned 890 * to the PMIX_GROUP_INVITE_ACCEPTED or PMIX_GROUP_INVITE_DECLINED codes, or 891 * by generating an event for the abort code. Abort events will be sent to 892 * all invited participants. 893 */ 894 PMIX_EXPORT pmix_status_t PMIx_Group_join(const char grp[], 895 const pmix_proc_t *leader, 896 pmix_group_opt_t opt, 897 const pmix_info_t info[], size_t ninfo, 898 pmix_info_t **results, size_t *nresult); 899 900 PMIX_EXPORT pmix_status_t PMIx_Group_join_nb(const char grp[], 901 const pmix_proc_t *leader, 902 pmix_group_opt_t opt, 903 const pmix_info_t info[], size_t ninfo, 904 pmix_info_cbfunc_t cbfunc, void *cbdata); 905 906 /* Leave a PMIx Group. Calls to PMIx_Group_leave (or its non-blocking form) will cause 907 * a PMIX_GROUP_LEFT event to be generated notifying all members of the group of the 908 * caller’s departure. The function will return (or the non-blocking function will 909 * execute the specified callback function) once the event has been locally generated 910 * and is not indicative of remote receipt. 
All PMIx-based collectives such as 911 * PMIx_Fence in action across the group will automatically be adjusted if the 912 * collective was called with the PMIX_GROUP_FT_COLLECTIVE attribute (default is 913 * false) – otherwise, the standard error return behavior will be provided. 914 * 915 * Critical Note: The PMIx_Group_leave API is intended solely for asynchronous 916 * departures of individual processes from a group as it is not a scalable 917 * operation – i.e., when a process determines it should no longer be a part of a 918 * defined group, but the remainder of the group retains a valid reason to continue 919 * in existence. Developers are advised to use PMIx_Group_destruct (or its 920 * non-blocking form) for all other scenarios as it represents a more scalable 921 * operation. 922 */ 923 PMIX_EXPORT pmix_status_t PMIx_Group_leave(const char grp[], 924 const pmix_info_t info[], size_t ninfo); 925 926 PMIX_EXPORT pmix_status_t PMIx_Group_leave_nb(const char grp[], 927 const pmix_info_t info[], size_t ninfo, 928 pmix_op_cbfunc_t cbfunc, void *cbdata); 929 930 /* Destruct a group identified by the provided group identifier. Both blocking and 931 * non-blocking versions are provided (the callback function for the non-blocking 932 * form will be called once all members of the group have called “destruct”). 933 * Processes may engage in multiple simultaneous group destruct operations as 934 * desired so long as each involves a unique group ID. The info array can be used 935 * to pass user-level directives regarding timeout constraints and other options 936 * available from the PMIx server. 937 * 938 * Some relevant attributes for this operation: 939 * 940 * PMIX_TIMEOUT (int): return an error if the group doesn’t destruct within the 941 * specified number of seconds. 
Targets the scenario where 942 * a process fails to call PMIx_Group_destruct due to hanging 943 * 944 * The destruct API will return an error if any group process fails or terminates 945 * prior to calling PMIx_Group_destruct or its non-blocking version unless the 946 * PMIX_GROUP_NOTIFY_TERMINATION attribute was provided (with a value of true) at 947 * time of group construction. If notification was requested, then a event will 948 * be delivered (using PMIX_GROUP_MEMBER_FAILED) for each process that fails to 949 * call destruct and the destruct tracker updated to account for the lack of 950 * participation. The PMIx_Group_destruct operation will subsequently return 951 * PMIX_SUCCESS when the remaining processes have all called destruct – i.e., the 952 * event will serve in place of return of an error. 953 */ 954 PMIX_EXPORT pmix_status_t PMIx_Group_destruct(const char grp[], 955 const pmix_info_t info[], size_t ninfo); 956 957 PMIX_EXPORT pmix_status_t PMIx_Group_destruct_nb(const char grp[], 958 const pmix_info_t info[], size_t ninfo, 959 pmix_op_cbfunc_t cbfunc, void *cbdata); 960 961 962 #if defined(c_plusplus) || defined(__cplusplus) 963 } 964 #endif 965 966 #endif