/* * Copyright (c) 2013-2019 Intel, Inc. All rights reserved. * Copyright (c) 2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer listed * in this license in the documentation and/or other materials * provided with the distribution. * * - Neither the name of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * The copyright holders provide no reassurances that the source code * provided does not infringe any patent, copyright, or any other * intellectual property rights of third parties. The copyright holders * disclaim any liability to any recipient for claims brought against * recipient by any third party for infringement of that parties * intellectual property rights. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #ifndef PMIx_H #define PMIx_H /* Structure and constant definitions */ #include #if defined(c_plusplus) || defined(__cplusplus) extern "C" { #endif /**** PMIX API ****/ /* Initialize the PMIx client, returning the process identifier assigned * to this client's application in the provided pmix_proc_t struct. * Passing a parameter of _NULL_ for this parameter is allowed if the user * wishes solely to initialize the PMIx system and does not require * return of the identifier at that time. * * When called the PMIx client will check for the required connection * information of the local PMIx server and will establish the connection. * If the information is not found, or the server connection fails, then * an appropriate error constant will be returned. * * If successful, the function will return PMIX_SUCCESS and will fill the * provided structure with the server-assigned namespace and rank of the * process within the application. * * Note that the PMIx client library is referenced counted, and so multiple * calls to PMIx_Init are allowed. Thus, one way to obtain the namespace and * rank of the process is to simply call PMIx_Init with a non-NULL parameter. * * The info array is used to pass user requests pertaining to the init * and subsequent operations. Pass a _NULL_ value for the array pointer * is supported if no directives are desired. */ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, pmix_info_t info[], size_t ninfo); /* Finalize the PMIx client, closing the connection to the local server. * An error code will be returned if, for some reason, the connection * cannot be closed. * * The info array is used to pass user requests regarding the finalize * operation. This can include: * * (a) PMIX_EMBED_BARRIER - By default, PMIx_Finalize does not include an * internal barrier operation. This attribute directs PMIx_Finalize to * execute a barrier as part of the finalize operation. */ PMIX_EXPORT pmix_status_t PMIx_Finalize(const pmix_info_t info[], size_t ninfo); /* Returns _true_ if the PMIx client has been successfully initialized, * returns _false_ otherwise. Note that the function only reports the * internal state of the PMIx client - it does not verify an active * connection with the server, nor that the server is functional. */ PMIX_EXPORT int PMIx_Initialized(void); /* Request that the provided array of procs be aborted, returning the * provided _status_ and printing the provided message. A _NULL_ * for the proc array indicates that all processes in the caller's * nspace are to be aborted. * * The response to this request is somewhat dependent on the specific resource * manager and its configuration (e.g., some resource managers will * not abort the application if the provided _status_ is zero unless * specifically configured to do so), and thus lies outside the control * of PMIx itself. However, the client will inform the RM of * the request that the application be aborted, regardless of the * value of the provided _status_. * * Passing a _NULL_ msg parameter is allowed. Note that race conditions * caused by multiple processes calling PMIx_Abort are left to the * server implementation to resolve with regard to which status is * returned and what messages (if any) are printed. */ PMIX_EXPORT pmix_status_t PMIx_Abort(int status, const char msg[], pmix_proc_t procs[], size_t nprocs); /* Push a value into the client's namespace. The client library will cache * the information locally until _PMIx_Commit_ is called. The provided scope * value is passed to the local PMIx server, which will distribute the data * as directed. */ PMIX_EXPORT pmix_status_t PMIx_Put(pmix_scope_t scope, const pmix_key_t key, pmix_value_t *val); /* Push all previously _PMIx_Put_ values to the local PMIx server. * This is an asynchronous operation - the library will immediately * return to the caller while the data is transmitted to the local * server in the background */ PMIX_EXPORT pmix_status_t PMIx_Commit(void); /* Execute a blocking barrier across the processes identified in the * specified array. Passing a _NULL_ pointer as the _procs_ parameter * indicates that the barrier is to span all processes in the client's * namespace. Each provided pmix_proc_t struct can pass PMIX_RANK_WILDCARD to * indicate that all processes in the given namespace are * participating. * * The info array is used to pass user requests regarding the fence * operation. This can include: * * (a) PMIX_COLLECT_DATA - a boolean indicating whether or not the barrier * operation is to return the _put_ data from all participating processes. * A value of _false_ indicates that the callback is just used as a release * and no data is to be returned at that time. A value of _true_ indicates * that all _put_ data is to be collected by the barrier. Returned data is * cached at the server to reduce memory footprint, and can be retrieved * as needed by calls to PMIx_Get(nb). * * Note that for scalability reasons, the default behavior for PMIx_Fence * is to _not_ collect the data. * * (b) PMIX_COLLECTIVE_ALGO - a comma-delimited string indicating the algos * to be used for executing the barrier, in priority order. * * (c) PMIX_COLLECTIVE_ALGO_REQD - instructs the host RM that it should return * an error if none of the specified algos are available. Otherwise, the RM * is to use one of the algos if possible, but is otherwise free to use any * of its available methods to execute the operation. * * (d) PMIX_TIMEOUT - maximum time for the fence to execute before declaring * an error. By default, the RM shall terminate the operation and notify participants * if one or more of the indicated procs fails during the fence. However, * the timeout parameter can help avoid "hangs" due to programming errors * that prevent one or more procs from reaching the "fence". */ PMIX_EXPORT pmix_status_t PMIx_Fence(const pmix_proc_t procs[], size_t nprocs, const pmix_info_t info[], size_t ninfo); /* Non-blocking version of PMIx_Fence. Note that the function will return * an error if a _NULL_ callback function is given. */ PMIX_EXPORT pmix_status_t PMIx_Fence_nb(const pmix_proc_t procs[], size_t nprocs, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); /* Retrieve information for the specified _key_ as published by the process * identified in the given pmix_proc_t, returning a pointer to the value in the * given address. * * This is a blocking operation - the caller will block until * the specified data has been _PMIx_Put_ by the specified rank. The caller is * responsible for freeing all memory associated with the returned value when * no longer required. * * The info array is used to pass user requests regarding the get * operation. This can include: * * (a) PMIX_TIMEOUT - maximum time for the get to execute before declaring * an error. The timeout parameter can help avoid "hangs" due to programming * errors that prevent the target proc from ever exposing its data. */ PMIX_EXPORT pmix_status_t PMIx_Get(const pmix_proc_t *proc, const char key[], const pmix_info_t info[], size_t ninfo, pmix_value_t **val); /* A non-blocking operation version of PMIx_Get - the callback function will * be executed once the specified data has been _PMIx_Put_ * by the identified process and retrieved by the local server. The info * array is used as described above for the blocking form of this call. */ PMIX_EXPORT pmix_status_t PMIx_Get_nb(const pmix_proc_t *proc, const pmix_key_t key, const pmix_info_t info[], size_t ninfo, pmix_value_cbfunc_t cbfunc, void *cbdata); /* Publish the data in the info array for lookup. By default, * the data will be published into the PMIX_SESSION range and * with PMIX_PERSIST_APP persistence. Changes to those values, * and any additional directives, can be included in the pmix_info_t * array. * * Note that the keys must be unique within the specified * data range or else an error will be returned (first published * wins). Attempts to access the data by procs outside of * the provided data range will be rejected. * * The persistence parameter instructs the server as to how long * the data is to be retained. * * The blocking form will block until the server confirms that the * data has been posted and is available. The non-blocking form will * return immediately, executing the callback when the server confirms * availability of the data. */ PMIX_EXPORT pmix_status_t PMIx_Publish(const pmix_info_t info[], size_t ninfo); PMIX_EXPORT pmix_status_t PMIx_Publish_nb(const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); /* Lookup information published by this or another process. By default, * the search will be conducted across the PMIX_SESSION range. Changes * to the range, and any additional directives, can be provided * in the pmix_info_t array. Note that the search is also constrained * to only data published by the current user ID - i.e., the search * will not return data published by an application being executed * by another user. There currently is no option to override this * behavior - such an option may become available later via an * appropriate pmix_info_t directive. * * The "data" parameter consists of an array of pmix_pdata_t struct with the * keys specifying the requested information. Data will be returned * for each key in the associated info struct - any key that cannot * be found will return with a data type of "PMIX_UNDEF". The function * will return SUCCESS if _any_ values can be found, so the caller * must check each data element to ensure it was returned. * * The proc field in each pmix_pdata_t struct will contain the * nspace/rank of the process that published the data. * * Note: although this is a blocking function, it will _not_ wait * by default for the requested data to be published. Instead, it * will block for the time required by the server to lookup its current * data and return any found items. Thus, the caller is responsible for * ensuring that data is published prior to executing a lookup, or * for retrying until the requested data is found * * Optionally, the info array can be used to modify this behavior * by including: * * (a) PMIX_WAIT - wait for the requested data to be published. The * server is to wait until all data has become available. * * (b) PMIX_TIMEOUT - max time to wait for data to become available. * */ PMIX_EXPORT pmix_status_t PMIx_Lookup(pmix_pdata_t data[], size_t ndata, const pmix_info_t info[], size_t ninfo); /* Non-blocking form of the _PMIx_Lookup_ function. Data for * the provided NULL-terminated keys array will be returned * in the provided callback function. As above, the default * behavior is to _not_ wait for data to be published. The * info keys can be used to modify the behavior as previously * described */ PMIX_EXPORT pmix_status_t PMIx_Lookup_nb(char **keys, const pmix_info_t info[], size_t ninfo, pmix_lookup_cbfunc_t cbfunc, void *cbdata); /* Unpublish data posted by this process using the given keys. * The function will block until the data has been removed by * the server. A value of _NULL_ for the keys parameter instructs * the server to remove _all_ data published by this process. * * By default, the range is assumed to be PMIX_SESSION. Changes * to the range, and any additional directives, can be provided * in the pmix_info_t array */ PMIX_EXPORT pmix_status_t PMIx_Unpublish(char **keys, const pmix_info_t info[], size_t ninfo); /* Non-blocking form of the _PMIx_Unpublish_ function. The * callback function will be executed once the server confirms * removal of the specified data. */ PMIX_EXPORT pmix_status_t PMIx_Unpublish_nb(char **keys, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); /* Spawn a new job. The assigned namespace of the spawned applications * is returned in the nspace parameter - a _NULL_ value in that * location indicates that the caller doesn't wish to have the * namespace returned. The nspace array must be at least of size * PMIX_MAX_NSLEN+1. Behavior of individual resource managers * may differ, but it is expected that failure of any application * process to start will result in termination/cleanup of _all_ * processes in the newly spawned job and return of an error * code to the caller. * * By default, the spawned processes will be PMIx "connected" to * the parent process upon successful launch (see PMIx_Connect * description for details). Note that this only means that the * parent process (a) will be given a copy of the new job's * information so it can query job-level info without * incurring any communication penalties, and (b) will receive * notification of errors from process in the child job. * * Job-level directives can be specified in the job_info array. This * can include: * * (a) PMIX_NON_PMI - processes in the spawned job will * not be calling PMIx_Init * * (b) PMIX_TIMEOUT - declare the spawn as having failed if the launched * procs do not call PMIx_Init within the specified time * * (c) PMIX_NOTIFY_COMPLETION - notify the parent process when the * child job terminates, either normally or with error */ PMIX_EXPORT pmix_status_t PMIx_Spawn(const pmix_info_t job_info[], size_t ninfo, const pmix_app_t apps[], size_t napps, pmix_nspace_t nspace); /* Non-blocking form of the _PMIx_Spawn_ function. The callback * will be executed upon launch of the specified applications, * or upon failure to launch any of them. */ PMIX_EXPORT pmix_status_t PMIx_Spawn_nb(const pmix_info_t job_info[], size_t ninfo, const pmix_app_t apps[], size_t napps, pmix_spawn_cbfunc_t cbfunc, void *cbdata); /* Record the specified processes as "connected". Both blocking and non-blocking * versions are provided. This means that the resource manager should treat the * failure of any process in the specified group as a reportable event, and take * appropriate action. Note that different resource managers may respond to * failures in different manners. * * The callback function is to be called once all participating processes have * called connect. The server is required to return any job-level info for the * connecting processes that might not already have - i.e., if the connect * request involves procs from different nspaces, then each proc shall receive * the job-level info from those nspaces other than their own. * * Note: a process can only engage in _one_ connect operation involving the identical * set of processes at a time. However, a process _can_ be simultaneously engaged * in multiple connect operations, each involving a different set of processes * * As in the case of the fence operation, the info array can be used to pass * user-level directives regarding the algorithm to be used for the collective * operation involved in the "connect", timeout constraints, and other options * available from the host RM */ PMIX_EXPORT pmix_status_t PMIx_Connect(const pmix_proc_t procs[], size_t nprocs, const pmix_info_t info[], size_t ninfo); PMIX_EXPORT pmix_status_t PMIx_Connect_nb(const pmix_proc_t procs[], size_t nprocs, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); /* Disconnect a previously connected set of processes. An error will be returned * if the specified set of procs was not previously "connected". As above, a process * may be involved in multiple simultaneous disconnect operations. However, a process * is not allowed to reconnect to a set of procs that has not fully completed * disconnect - i.e., you have to fully disconnect before you can reconnect to the * _same_ group of processes. The info array is used as above. */ PMIX_EXPORT pmix_status_t PMIx_Disconnect(const pmix_proc_t procs[], size_t nprocs, const pmix_info_t info[], size_t ninfo); PMIX_EXPORT pmix_status_t PMIx_Disconnect_nb(const pmix_proc_t ranges[], size_t nprocs, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); /* Given a node name, return an array of processes within the specified nspace * on that node. If the nspace is NULL, then all processes on the node will * be returned. If the specified node does not currently host any processes, * then the returned array will be NULL, and nprocs=0. The caller is responsible * for releasing the array when done with it - the PMIX_PROC_FREE macro is * provided for this purpose. */ PMIX_EXPORT pmix_status_t PMIx_Resolve_peers(const char *nodename, const pmix_nspace_t nspace, pmix_proc_t **procs, size_t *nprocs); /* Given an nspace, return the list of nodes hosting processes within * that nspace. The returned string will contain a comma-delimited list * of nodenames. The caller is responsible for releasing the string * when done with it */ PMIX_EXPORT pmix_status_t PMIx_Resolve_nodes(const pmix_nspace_t nspace, char **nodelist); /* Query information about the system in general - can include * a list of active nspaces, network topology, etc. Also can be * used to query node-specific info such as the list of peers * executing on a given node. We assume that the host RM will * exercise appropriate access control on the information. * * The following return status codes are provided in the callback: * * PMIX_SUCCESS - all data has been returned * PMIX_ERR_NOT_FOUND - none of the requested data was available * PMIX_ERR_PARTIAL_SUCCESS - some of the data has been returned * PMIX_ERR_NOT_SUPPORTED - the host RM does not support this function */ PMIX_EXPORT pmix_status_t PMIx_Query_info(pmix_query_t queries[], size_t nqueries, pmix_info_t **results, size_t *nresults); PMIX_EXPORT pmix_status_t PMIx_Query_info_nb(pmix_query_t queries[], size_t nqueries, pmix_info_cbfunc_t cbfunc, void *cbdata); /* Log data to a central data service/store, subject to the * services offered by the host resource manager. The data to * be logged is provided in the data array. The (optional) directives * can be used to request specific storage options and direct * the choice of storage option. * * The callback function will be executed when the log operation * has been completed. The data array must be maintained until * the callback is provided */ PMIX_EXPORT pmix_status_t PMIx_Log(const pmix_info_t data[], size_t ndata, const pmix_info_t directives[], size_t ndirs); PMIX_EXPORT pmix_status_t PMIx_Log_nb(const pmix_info_t data[], size_t ndata, const pmix_info_t directives[], size_t ndirs, pmix_op_cbfunc_t cbfunc, void *cbdata); /* Request an allocation operation from the host resource manager. * Several broad categories are envisioned, including the ability to: * * - request allocation of additional resources, including memory, * bandwidth, and compute. This should be accomplished in a * non-blocking manner so that the application can continue to * progress while waiting for resources to become available. Note * that the new allocation will be disjoint from (i.e., not * affiliated with) the allocation of the requestor - thus the * termination of one allocation will not impact the other. * * - extend the reservation on currently allocated resources, subject * to scheduling availability and priorities. This includes extending * the time limit on current resources, and/or requesting additional * resources be allocated to the requesting job. Any additional * allocated resources will be considered as part of the current * allocation, and thus will be released at the same time. * * - release currently allocated resources that are no longer required. * This is intended to support partial release of resources since all * resources are normally released upon termination of the job. The * identified use-cases include resource variations across discrete steps * of a workflow, as well as applications that spawn sub-jobs and/or * dynamically grow/shrink over time * * - "lend" resources back to the scheduler with an expectation of getting * them back at some later time in the job. This can be a proactive * operation (e.g., to save on computing costs when resources are * temporarily not required), or in response to scheduler requests in * lieue of preemption. A corresponding ability to "reacquire" resources * previously released is included. */ PMIX_EXPORT pmix_status_t PMIx_Allocation_request(pmix_alloc_directive_t directive, pmix_info_t *info, size_t ninfo, pmix_info_t **results, size_t *nresults); PMIX_EXPORT pmix_status_t PMIx_Allocation_request_nb(pmix_alloc_directive_t directive, pmix_info_t *info, size_t ninfo, pmix_info_cbfunc_t cbfunc, void *cbdata); /* Request a job control action. The targets array identifies the * processes to which the requested job control action is to be applied. * A NULL value can be used to indicate all processes in the caller's * nspace. The use of PMIX_RANK_WILDARD can also be used to indicate * that all processes in the given nspace are to be included. * * The directives are provided as pmix_info_t structs in the directives * array. The callback function provides a status to indicate whether or * not the request was granted, and to provide some information as to * the reason for any denial in the pmix_info_cbfunc_t array of pmix_info_t * structures. If non-NULL, then the specified release_fn must be called * when the callback function completes - this will be used to release * any provided pmix_info_t array. */ PMIX_EXPORT pmix_status_t PMIx_Job_control(const pmix_proc_t targets[], size_t ntargets, const pmix_info_t directives[], size_t ndirs, pmix_info_t **results, size_t *nresults); PMIX_EXPORT pmix_status_t PMIx_Job_control_nb(const pmix_proc_t targets[], size_t ntargets, const pmix_info_t directives[], size_t ndirs, pmix_info_cbfunc_t cbfunc, void *cbdata); /* Request that something be monitored - e.g., that the server monitor * this process for periodic heartbeats as an indication that the process * has not become "wedged". When a monitor detects the specified alarm * condition, it will generate an event notification using the provided * error code and passing along any available relevant information. It is * up to the caller to register a corresponding event handler. * * Params: * * monitor: attribute indicating the type of monitor being requested - e.g., * PMIX_MONITOR_FILE to indicate that the requestor is asking that * a file be monitored. * * error: the status code to be used when generating an event notification * alerting that the monitor has been triggered. The range of the * notification defaults to PMIX_RANGE_NAMESPACE - this can be * changed by providing a PMIX_RANGE directive * * directives: characterize the monitoring request (e.g., monitor file size) * and frequency of checking to be done * * cbfunc: provides a status to indicate whether or not the request was granted, * and to provide some information as to the reason for any denial in * the pmix_info_cbfunc_t array of pmix_info_t structures. * * Note: a process can send a heartbeat to the server using the PMIx_Heartbeat * macro provided below*/ PMIX_EXPORT pmix_status_t PMIx_Process_monitor(const pmix_info_t *monitor, pmix_status_t error, const pmix_info_t directives[], size_t ndirs, pmix_info_t **results, size_t *nresults); PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pmix_status_t error, const pmix_info_t directives[], size_t ndirs, pmix_info_cbfunc_t cbfunc, void *cbdata); /* define a special macro to simplify sending of a heartbeat */ #define PMIx_Heartbeat() \ do { \ pmix_info_t _in; \ PMIX_INFO_CONSTRUCT(&_in); \ PMIX_INFO_LOAD(&_in, PMIX_SEND_HEARTBEAT, NULL, PMIX_POINTER); \ PMIx_Process_monitor_nb(&_in, PMIX_SUCCESS, NULL, 0, NULL, NULL); \ PMIX_INFO_DESTRUCT(&_in); \ } while(0) /* Request a credential from the PMIx server/SMS. * Input values include: * * info - an array of pmix_info_t structures containing any directives the * caller may wish to pass. Typical usage might include: * PMIX_TIMEOUT - how long to wait (in seconds) for a credential * before timing out and returning an error * PMIX_CRED_TYPE - a prioritized, comma-delimited list of desired * credential types for use in environments where * multiple authentication mechanisms may be * available * * ninfo - number of elements in the info array * * cbfunc - the pmix_credential_cbfunc_t function to be called upon completion * of the request * * cbdata - pointer to an object to be returned when cbfunc is called * * Returned values: * PMIX_SUCCESS - indicates that the request has been successfully communicated to * the local PMIx server. The response will be coming in the provided * callback function. * * Any other value indicates an appropriate error condition. The callback function * will _not_ be called in such cases. */ PMIX_EXPORT pmix_status_t PMIx_Get_credential(const pmix_info_t info[], size_t ninfo, pmix_byte_object_t *credential); PMIX_EXPORT pmix_status_t PMIx_Get_credential_nb(const pmix_info_t info[], size_t ninfo, pmix_credential_cbfunc_t cbfunc, void *cbdata); /* Request validation of a credential by the PMIx server/SMS * Input values include: * * cred - pointer to a pmix_byte_object_t containing the credential * * info - an array of pmix_info_t structures containing any directives the * caller may wish to pass. Typical usage might include: * PMIX_TIMEOUT - how long to wait (in seconds) for validation * before timing out and returning an error * PMIX_USERID - the expected effective userid of the credential * to be validated * PMIX_GROUPID - the expected effective group id of the credential * to be validated * * ninfo - number of elements in the info array * * cbfunc - the pmix_validation_cbfunc_t function to be called upon completion * of the request * * cbdata - pointer to an object to be returned when cbfunc is called * * Returned values: * PMIX_SUCCESS - indicates that the request has been successfully communicated to * the local PMIx server. The response will be coming in the provided * callback function. * * Any other value indicates an appropriate error condition. The callback function * will _not_ be called in such cases. */ PMIX_EXPORT pmix_status_t PMIx_Validate_credential(const pmix_byte_object_t *cred, const pmix_info_t info[], size_t ninfo, pmix_info_t **results, size_t *nresults); PMIX_EXPORT pmix_status_t PMIx_Validate_credential_nb(const pmix_byte_object_t *cred, const pmix_info_t info[], size_t ninfo, pmix_validation_cbfunc_t cbfunc, void *cbdata); /* Define a callback function for delivering forwarded IO to a process * This function will be called whenever data becomes available, or a * specified buffering size and/or time has been met. The function * will be passed the following values: * * iofhdlr - the returned registration number of the handler being invoked. * This is required when deregistering the handler. * * channel - a bitmask identifying the channel the data arrived on * * source - the nspace/rank of the process that generated the data * * payload - pointer to character array containing the data. Note that * multiple strings may be included, and that the array may * _not_ be NULL terminated * * info - an optional array of info provided by the source containing * metadata about the payload. This could include PMIX_IOF_COMPLETE * * ninfo - number of elements in the optional info array */ typedef void (*pmix_iof_cbfunc_t)(size_t iofhdlr, pmix_iof_channel_t channel, pmix_proc_t *source, char *payload, pmix_info_t info[], size_t ninfo); /* Register to receive output forwarded from a remote process. * * procs - array of identifiers for sources whose IO is being * requested. Wildcard rank indicates that all procs * in the specified nspace are included in the request * * nprocs - number of identifiers in the procs array * * directives - optional array of attributes to control the * behavior of the request. For example, this * might include directives on buffering IO * before delivery, and/or directives to include * or exclude any backlogged data * * ndirs - number of elements in the directives array * * channel - bitmask of IO channels included in the request. * NOTE: STDIN is not supported as it will always * be delivered to the stdin file descriptor * * cbfunc - function to be called when relevant IO is received * * regcbfunc - since registration is async, this is the * function to be called when registration is * completed. The function itself will return * a non-success error if the registration cannot * be submitted - in this case, the regcbfunc * will _not_ be called. * If regcbfunc is NULL, then this will be treated * as a BLOCKING call - a positive return value * represents the reference ID for the request, * while negative values indicate the corresponding * error * * cbdata - pointer to object to be returned in regcbfunc */ PMIX_EXPORT pmix_status_t PMIx_IOF_pull(const pmix_proc_t procs[], size_t nprocs, const pmix_info_t directives[], size_t ndirs, pmix_iof_channel_t channel, pmix_iof_cbfunc_t cbfunc, pmix_hdlr_reg_cbfunc_t regcbfunc, void *regcbdata); /* Deregister from output forwarded from a remote process. * * iofhdlr - the registration number returned from the * call to PMIx_IOF_pull * * directives - optional array of attributes to control the * behavior of the request. For example, this * might include directives regarding what to * do with any data currently in the IO buffer * for this process * * cbfunc - function to be called when deregistration has * been completed. Note that any IO to be flushed * may continue to be received after deregistration * has completed. If cbfunc is NULL, then this is * treated as a BLOCKING call and the result of * the operation will be provided in the returned status * * cbdata - pointer to object to be returned in cbfunc */ PMIX_EXPORT pmix_status_t PMIx_IOF_deregister(size_t iofhdlr, const pmix_info_t directives[], size_t ndirs, pmix_op_cbfunc_t cbfunc, void *cbdata); /* Push data collected locally (typically from stdin) to * stdin of target recipients. * * targets - array of process identifiers to which the data is to be delivered. Note * that a WILDCARD rank indicates that all procs in the given nspace are * to receive a copy of the data * * ntargets - number of procs in the targets array * * directives - optional array of attributes to control the * behavior of the request. For example, this * might include directives on buffering IO * before delivery, and/or directives to include * or exclude any backlogged data * * ndirs - number of elements in the directives array * * bo - pointer to a byte object containing the stdin data * * cbfunc - callback function when the data has been forwarded. If * cbfunc is NULL, then this is treated as a BLOCKING call * and the result of the operation will be provided in the * returned status * * cbdata - object to be returned in cbfunc */ PMIX_EXPORT pmix_status_t PMIx_IOF_push(const pmix_proc_t targets[], size_t ntargets, pmix_byte_object_t *bo, const pmix_info_t directives[], size_t ndirs, pmix_op_cbfunc_t cbfunc, void *cbdata); /* Construct a new group composed of the specified processes and identified with * the provided group identifier. Both blocking and non-blocking versions * are provided (the callback function for the non-blocking form will be called * once all specified processes have joined the group). The group identifier is * a user-defined, NULL-terminated character array of length less than or equal * to PMIX_MAX_NSLEN. Only characters accepted by standard string comparison * functions (e.g., strncmp) are supported. * * Processes may engage in multiple simultaneous group construct operations as * desired so long as each is provided with a unique group ID. The info array * can be used to pass user-level directives regarding timeout constraints and * other options available from the PMIx server. * * The construct leader (if PMIX_GROUP_LEADER is provided) or all participants * will receive events (if registered for the PMIX_GROUP_MEMBER_FAILED event) * whenever a process fails or terminates prior to calling * PMIx_Group_construct(_nb) – the events will contain the identifier of the * process that failed to join plus any other information that the resource * manager provided. This provides an opportunity for the leader to react to * the event – e.g., to invite an alternative member to the group or to decide * to proceed with a smaller group. The decision to proceed with a smaller group * is communicated to the PMIx library in the results array at the end of the * event handler. This allows PMIx to properly adjust accounting for procedure * completion. When construct is complete, the participating PMIx servers will * be alerted to any change in participants and each group member will (if * registered) receive a PMIX_GROUP_MEMBERSHIP_UPDATE event updating the group * membership. * * Processes in a group under construction are not allowed to leave the group * until group construction is complete. Upon completion of the construct * procedure, each group member will have access to the job-level information * of all nspaces represented in the group and the contact information for * every group member. * * Failure of the leader at any time will cause a PMIX_GROUP_LEADER_FAILED event * to be delivered to all participants so they can optionally declare a new leader. * A new leader is identified by providing the PMIX_GROUP_LEADER attribute in * the results array in the return of the event handler. Only one process is * allowed to return that attribute, declaring itself as the new leader. Results * of the leader selection will be communicated to all participants via a * PMIX_GROUP_LEADER_SELECTED event identifying the new leader. If no leader * was selected, then the status code provided in the event handler will provide * an error value so the participants can take appropriate action. * * Any participant that returns PMIX_GROUP_CONSTRUCT_ABORT from the leader failed * event handler will cause the construct process to abort. Those processes * engaged in the blocking construct will return from the call with the * PMIX_GROUP_CONSTRUCT_ABORT status. Non-blocking participants will have * their callback function executed with that status. * * Some relevant attributes for this operation: * PMIX_GROUP_LEADER - declare this process to be the leader of the construction * procedure. If a process provides this attribute, then * failure notification for any participating process will * go only to that one process. In the absence of a * declared leader, failure events go to all participants. * PMIX_GROUP_OPTIONAL - participation is optional - do not return an error if * any of the specified processes terminate * without having joined (default=false) * PMIX_GROUP_NOTIFY_TERMINATION - notify remaining members when another member * terminates without first leaving the * group (default=false) * PMIX_GROUP_ASSIGN_CONTEXT_ID - requests that the RM assign a unique context * ID (size_t) to the group. The value is returned * in the PMIX_GROUP_CONSTRUCT_COMPLETE event * PMIX_TIMEOUT - return an error if the group doesn't assemble within the * specified number of seconds. Targets the scenario where a * process fails to call PMIx_Group_connect due to hanging * */ PMIX_EXPORT pmix_status_t PMIx_Group_construct(const char grp[], const pmix_proc_t procs[], size_t nprocs, const pmix_info_t directives[], size_t ndirs, pmix_info_t **results, size_t *nresults); PMIX_EXPORT pmix_status_t PMIx_Group_construct_nb(const char grp[], const pmix_proc_t procs[], size_t nprocs, const pmix_info_t info[], size_t ninfo, pmix_info_cbfunc_t cbfunc, void *cbdata); /* Explicitly invite specified processes to join a group. * * Each invited process will be notified of the invitation via the PMIX_GROUP_INVITED * event. The processes being invited must have registered for the PMIX_GROUP_INVITED * event in order to be notified of the invitation. When ready to respond, each invited * process provides a response using the appropriate form of PMIx_Group_join. This will * notify the inviting process that the invitation was either accepted (via the * PMIX_GROUP_INVITE_ACCEPTED event) or declined (via the PMIX_GROUP_INVITE_DECLINED event). * The inviting process will also receive PMIX_GROUP_MEMBER_FAILED events whenever a * process fails or terminates prior to responding to the invitation. * * Upon accepting the invitation, both the inviting and invited process will receive * access to the job-level information of each other’s nspaces and the contact * information of the other process. * * Some relevant attributes for this operation: * PMIX_GROUP_ASSIGN_CONTEXT_ID - requests that the RM assign a unique context * ID (size_t) to the group. The value is returned * in the PMIX_GROUP_CONSTRUCT_COMPLETE event * PMIX_TIMEOUT (int): return an error if the group doesn’t assemble within the * specified number of seconds. Targets the scenario where a * process fails to call PMIx_Group_connect due to hanging * * The inviting process is automatically considered the leader of the asynchronous * group construction procedure and will receive all failure or termination events * for invited members prior to completion. The inviting process is required to * provide a PMIX_GROUP_CONSTRUCT_COMPLETE event once the group has been fully * assembled – this event will be distributed to all participants along with the * final membership. * * Failure of the leader at any time will cause a PMIX_GROUP_LEADER_FAILED event * to be delivered to all participants so they can optionally declare a new leader. * A new leader is identified by providing the PMIX_GROUP_LEADER attribute in * the results array in the return of the event handler. Only one process is * allowed to return that attribute, declaring itself as the new leader. Results * of the leader selection will be communicated to all participants via a * PMIX_GROUP_LEADER_SELECTED event identifying the new leader. If no leader * was selected, then the status code provided in the event handler will provide * an error value so the participants can take appropriate action. * * Any participant that returns PMIX_GROUP_CONSTRUCT_ABORT from the event * handler will cause all participants to receive an event notifying them * of that status. */ PMIX_EXPORT pmix_status_t PMIx_Group_invite(const char grp[], const pmix_proc_t procs[], size_t nprocs, const pmix_info_t info[], size_t ninfo, pmix_info_t **results, size_t *nresult); PMIX_EXPORT pmix_status_t PMIx_Group_invite_nb(const char grp[], const pmix_proc_t procs[], size_t nprocs, const pmix_info_t info[], size_t ninfo, pmix_info_cbfunc_t cbfunc, void *cbdata); /* Respond to an invitation to join a group that is being asynchronously constructed. * * The process must have registered for the PMIX_GROUP_INVITED event in order to be * notified of the invitation. When ready to respond, the process provides a response * using the appropriate form of PMIx_Group_join. * * Critical Note: Since the process is alerted to the invitation in a PMIx event handler, * the process must not use the blocking form of this call unless it first “thread shifts” * out of the handler and into its own thread context. Likewise, while it is safe to call * the non-blocking form of the API from the event handler, the process must not block * in the handler while waiting for the callback function to be called. * * Calling this function causes the group “leader” to be notified that the process has * either accepted or declined the request. The blocking form of the API will return * once the group has been completely constructed or the group’s construction has failed * (as determined by the leader) – likewise, the callback function of the non-blocking * form will be executed upon the same conditions. * * Failure of the leader at any time will cause a PMIX_GROUP_LEADER_FAILED event * to be delivered to all participants so they can optionally declare a new leader. * A new leader is identified by providing the PMIX_GROUP_LEADER attribute in * the results array in the return of the event handler. Only one process is * allowed to return that attribute, declaring itself as the new leader. Results * of the leader selection will be communicated to all participants via a * PMIX_GROUP_LEADER_SELECTED event identifying the new leader. If no leader * was selected, then the status code provided in the event handler will provide * an error value so the participants can take appropriate action. * * Any participant that returns PMIX_GROUP_CONSTRUCT_ABORT from the leader failed * event handler will cause all participants to receive an event notifying them * of that status. Similarly, the leader may elect to abort the procedure * by either returning PMIX_GROUP_CONSTRUCT_ABORT from the handler assigned * to the PMIX_GROUP_INVITE_ACCEPTED or PMIX_GROUP_INVITE_DECLINED codes, or * by generating an event for the abort code. Abort events will be sent to * all invited participants. */ PMIX_EXPORT pmix_status_t PMIx_Group_join(const char grp[], const pmix_proc_t *leader, pmix_group_opt_t opt, const pmix_info_t info[], size_t ninfo, pmix_info_t **results, size_t *nresult); PMIX_EXPORT pmix_status_t PMIx_Group_join_nb(const char grp[], const pmix_proc_t *leader, pmix_group_opt_t opt, const pmix_info_t info[], size_t ninfo, pmix_info_cbfunc_t cbfunc, void *cbdata); /* Leave a PMIx Group. Calls to PMIx_Group_leave (or its non-blocking form) will cause * a PMIX_GROUP_LEFT event to be generated notifying all members of the group of the * caller’s departure. The function will return (or the non-blocking function will * execute the specified callback function) once the event has been locally generated * and is not indicative of remote receipt. All PMIx-based collectives such as * PMIx_Fence in action across the group will automatically be adjusted if the * collective was called with the PMIX_GROUP_FT_COLLECTIVE attribute (default is * false) – otherwise, the standard error return behavior will be provided. * * Critical Note: The PMIx_Group_leave API is intended solely for asynchronous * departures of individual processes from a group as it is not a scalable * operation – i.e., when a process determines it should no longer be a part of a * defined group, but the remainder of the group retains a valid reason to continue * in existence. Developers are advised to use PMIx_Group_destruct (or its * non-blocking form) for all other scenarios as it represents a more scalable * operation. */ PMIX_EXPORT pmix_status_t PMIx_Group_leave(const char grp[], const pmix_info_t info[], size_t ninfo); PMIX_EXPORT pmix_status_t PMIx_Group_leave_nb(const char grp[], const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); /* Destruct a group identified by the provided group identifier. Both blocking and * non-blocking versions are provided (the callback function for the non-blocking * form will be called once all members of the group have called “destruct”). * Processes may engage in multiple simultaneous group destruct operations as * desired so long as each involves a unique group ID. The info array can be used * to pass user-level directives regarding timeout constraints and other options * available from the PMIx server. * * Some relevant attributes for this operation: * * PMIX_TIMEOUT (int): return an error if the group doesn’t destruct within the * specified number of seconds. Targets the scenario where * a process fails to call PMIx_Group_destruct due to hanging * * The destruct API will return an error if any group process fails or terminates * prior to calling PMIx_Group_destruct or its non-blocking version unless the * PMIX_GROUP_NOTIFY_TERMINATION attribute was provided (with a value of true) at * time of group construction. If notification was requested, then a event will * be delivered (using PMIX_GROUP_MEMBER_FAILED) for each process that fails to * call destruct and the destruct tracker updated to account for the lack of * participation. The PMIx_Group_destruct operation will subsequently return * PMIX_SUCCESS when the remaining processes have all called destruct – i.e., the * event will serve in place of return of an error. */ PMIX_EXPORT pmix_status_t PMIx_Group_destruct(const char grp[], const pmix_info_t info[], size_t ninfo); PMIX_EXPORT pmix_status_t PMIx_Group_destruct_nb(const char grp[], const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); #if defined(c_plusplus) || defined(__cplusplus) } #endif #endif