1
1
Gilles Gouaillardet 1c4a3598d0 pmix/pmix4x: refresh to the latest open PMIx master
refresh to openpmix/openpmix@ea3b29b1a4

Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
2019-10-01 14:27:22 +09:00

986 строки
52 KiB
C
Исходник Ответственный История

Этот файл содержит неоднозначные символы Юникода

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
* Copyright (c) 2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer listed
* in this license in the documentation and/or other materials
* provided with the distribution.
*
* - Neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* The copyright holders provide no reassurances that the source code
* provided does not infringe any patent, copyright, or any other
* intellectual property rights of third parties. The copyright holders
* disclaim any liability to any recipient for claims brought against
* recipient by any third party for infringement of that parties
* intellectual property rights.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef PMIx_H
#define PMIx_H
/* Structure and constant definitions */
#include <pmix_common.h>
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**** PMIX API ****/
/* Initialize the PMIx client, returning the process identifier assigned
* to this client's application in the provided pmix_proc_t struct.
* Passing a value of _NULL_ for this parameter is allowed if the user
* wishes solely to initialize the PMIx system and does not require
* return of the identifier at that time.
*
* When called the PMIx client will check for the required connection
* information of the local PMIx server and will establish the connection.
* If the information is not found, or the server connection fails, then
* an appropriate error constant will be returned.
*
* If successful, the function will return PMIX_SUCCESS and will fill the
* provided structure with the server-assigned namespace and rank of the
* process within the application.
*
* Note that the PMIx client library is reference counted, and so multiple
* calls to PMIx_Init are allowed. Thus, one way to obtain the namespace and
* rank of the process is to simply call PMIx_Init with a non-NULL parameter.
*
* The info array is used to pass user requests pertaining to the init
* and subsequent operations. Passing a _NULL_ value for the array pointer
* is supported if no directives are desired.
*/
PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc,
pmix_info_t info[], size_t ninfo);
/* Finalize the PMIx client, closing the connection to the local server.
* An error code will be returned if, for some reason, the connection
* cannot be closed.
*
* The info array is used to pass user requests regarding the finalize
* operation. This can include:
*
* (a) PMIX_EMBED_BARRIER - By default, PMIx_Finalize does not include an
* internal barrier operation. This attribute directs PMIx_Finalize to
* execute a barrier as part of the finalize operation.
*/
PMIX_EXPORT pmix_status_t PMIx_Finalize(const pmix_info_t info[], size_t ninfo);
/* Returns _true_ if the PMIx client has been successfully initialized,
* returns _false_ otherwise. Note that the function only reports the
* internal state of the PMIx client - it does not verify an active
* connection with the server, nor that the server is functional. */
PMIX_EXPORT int PMIx_Initialized(void);
/* Request that the provided array of procs be aborted, returning the
* provided _status_ and printing the provided message. A _NULL_
* for the proc array indicates that all processes in the caller's
* nspace are to be aborted.
*
* The response to this request is somewhat dependent on the specific resource
* manager and its configuration (e.g., some resource managers will
* not abort the application if the provided _status_ is zero unless
* specifically configured to do so), and thus lies outside the control
* of PMIx itself. However, the client will inform the RM of
* the request that the application be aborted, regardless of the
* value of the provided _status_.
*
* Passing a _NULL_ msg parameter is allowed. Note that race conditions
* caused by multiple processes calling PMIx_Abort are left to the
* server implementation to resolve with regard to which status is
* returned and what messages (if any) are printed. */
PMIX_EXPORT pmix_status_t PMIx_Abort(int status, const char msg[],
pmix_proc_t procs[], size_t nprocs);
/* Push a value into the client's namespace. The client library will cache
* the information locally until _PMIx_Commit_ is called. The provided scope
* value is passed to the local PMIx server, which will distribute the data
* as directed. */
PMIX_EXPORT pmix_status_t PMIx_Put(pmix_scope_t scope, const pmix_key_t key, pmix_value_t *val);
/* Push all previously _PMIx_Put_ values to the local PMIx server.
* This is an asynchronous operation - the library will immediately
* return to the caller while the data is transmitted to the local
* server in the background */
PMIX_EXPORT pmix_status_t PMIx_Commit(void);
/* Execute a blocking barrier across the processes identified in the
* specified array. Passing a _NULL_ pointer as the _procs_ parameter
* indicates that the barrier is to span all processes in the client's
* namespace. Each provided pmix_proc_t struct can pass PMIX_RANK_WILDCARD to
* indicate that all processes in the given namespace are
* participating.
*
* The info array is used to pass user requests regarding the fence
* operation. This can include:
*
* (a) PMIX_COLLECT_DATA - a boolean indicating whether or not the barrier
* operation is to return the _put_ data from all participating processes.
* A value of _false_ indicates that the callback is just used as a release
* and no data is to be returned at that time. A value of _true_ indicates
* that all _put_ data is to be collected by the barrier. Returned data is
* cached at the server to reduce memory footprint, and can be retrieved
* as needed by calls to PMIx_Get(nb).
*
* Note that for scalability reasons, the default behavior for PMIx_Fence
* is to _not_ collect the data.
*
* (b) PMIX_COLLECTIVE_ALGO - a comma-delimited string indicating the algos
* to be used for executing the barrier, in priority order.
*
* (c) PMIX_COLLECTIVE_ALGO_REQD - instructs the host RM that it should return
* an error if none of the specified algos are available. Otherwise, the RM
* is to use one of the algos if possible, but is otherwise free to use any
* of its available methods to execute the operation.
*
* (d) PMIX_TIMEOUT - maximum time for the fence to execute before declaring
* an error. By default, the RM shall terminate the operation and notify participants
* if one or more of the indicated procs fails during the fence. However,
* the timeout parameter can help avoid "hangs" due to programming errors
* that prevent one or more procs from reaching the "fence".
*/
PMIX_EXPORT pmix_status_t PMIx_Fence(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo);
/* Non-blocking version of PMIx_Fence. Note that the function will return
* an error if a _NULL_ callback function is given. */
PMIX_EXPORT pmix_status_t PMIx_Fence_nb(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
/* Retrieve information for the specified _key_ as published by the process
* identified in the given pmix_proc_t, returning a pointer to the value in the
* given address.
*
* This is a blocking operation - the caller will block until
* the specified data has been _PMIx_Put_ by the specified rank. The caller is
* responsible for freeing all memory associated with the returned value when
* no longer required.
*
* The info array is used to pass user requests regarding the get
* operation. This can include:
*
* (a) PMIX_TIMEOUT - maximum time for the get to execute before declaring
* an error. The timeout parameter can help avoid "hangs" due to programming
* errors that prevent the target proc from ever exposing its data.
*/
PMIX_EXPORT pmix_status_t PMIx_Get(const pmix_proc_t *proc, const char key[],
const pmix_info_t info[], size_t ninfo,
pmix_value_t **val);
/* A non-blocking operation version of PMIx_Get - the callback function will
* be executed once the specified data has been _PMIx_Put_
* by the identified process and retrieved by the local server. The info
* array is used as described above for the blocking form of this call. */
PMIX_EXPORT pmix_status_t PMIx_Get_nb(const pmix_proc_t *proc, const pmix_key_t key,
const pmix_info_t info[], size_t ninfo,
pmix_value_cbfunc_t cbfunc, void *cbdata);
/* Publish the data in the info array for lookup. By default,
* the data will be published into the PMIX_SESSION range and
* with PMIX_PERSIST_APP persistence. Changes to those values,
* and any additional directives, can be included in the pmix_info_t
* array.
*
* Note that the keys must be unique within the specified
* data range or else an error will be returned (first published
* wins). Attempts to access the data by procs outside of
* the provided data range will be rejected.
*
* The persistence parameter instructs the server as to how long
* the data is to be retained.
*
* The blocking form will block until the server confirms that the
* data has been posted and is available. The non-blocking form will
* return immediately, executing the callback when the server confirms
* availability of the data.
*/
PMIX_EXPORT pmix_status_t PMIx_Publish(const pmix_info_t info[], size_t ninfo);
PMIX_EXPORT pmix_status_t PMIx_Publish_nb(const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
/* Lookup information published by this or another process. By default,
* the search will be conducted across the PMIX_SESSION range. Changes
* to the range, and any additional directives, can be provided
* in the pmix_info_t array. Note that the search is also constrained
* to only data published by the current user ID - i.e., the search
* will not return data published by an application being executed
* by another user. There currently is no option to override this
* behavior - such an option may become available later via an
* appropriate pmix_info_t directive.
*
* The "data" parameter consists of an array of pmix_pdata_t structs with the
* keys specifying the requested information. Data will be returned
* for each key in the associated info struct - any key that cannot
* be found will return with a data type of "PMIX_UNDEF". The function
* will return PMIX_SUCCESS if _any_ values can be found, so the caller
* must check each data element to ensure it was returned.
*
* The proc field in each pmix_pdata_t struct will contain the
* nspace/rank of the process that published the data.
*
* Note: although this is a blocking function, it will _not_ wait
* by default for the requested data to be published. Instead, it
* will block for the time required by the server to lookup its current
* data and return any found items. Thus, the caller is responsible for
* ensuring that data is published prior to executing a lookup, or
* for retrying until the requested data is found.
*
* Optionally, the info array can be used to modify this behavior
* by including:
*
* (a) PMIX_WAIT - wait for the requested data to be published. The
* server is to wait until all data has become available.
*
* (b) PMIX_TIMEOUT - max time to wait for data to become available.
*
*/
PMIX_EXPORT pmix_status_t PMIx_Lookup(pmix_pdata_t data[], size_t ndata,
const pmix_info_t info[], size_t ninfo);
/* Non-blocking form of the _PMIx_Lookup_ function. Data for
* the provided NULL-terminated keys array will be returned
* in the provided callback function. As above, the default
* behavior is to _not_ wait for data to be published. The
* info keys can be used to modify the behavior as previously
* described */
PMIX_EXPORT pmix_status_t PMIx_Lookup_nb(char **keys, const pmix_info_t info[], size_t ninfo,
pmix_lookup_cbfunc_t cbfunc, void *cbdata);
/* Unpublish data posted by this process using the given keys.
* The function will block until the data has been removed by
* the server. A value of _NULL_ for the keys parameter instructs
* the server to remove _all_ data published by this process.
*
* By default, the range is assumed to be PMIX_SESSION. Changes
* to the range, and any additional directives, can be provided
* in the pmix_info_t array */
PMIX_EXPORT pmix_status_t PMIx_Unpublish(char **keys,
const pmix_info_t info[], size_t ninfo);
/* Non-blocking form of the _PMIx_Unpublish_ function. The
* callback function will be executed once the server confirms
* removal of the specified data. */
PMIX_EXPORT pmix_status_t PMIx_Unpublish_nb(char **keys,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
/* Spawn a new job. The assigned namespace of the spawned applications
* is returned in the nspace parameter - a _NULL_ value in that
* location indicates that the caller doesn't wish to have the
* namespace returned. The nspace array must be at least of size
* PMIX_MAX_NSLEN+1. Behavior of individual resource managers
* may differ, but it is expected that failure of any application
* process to start will result in termination/cleanup of _all_
* processes in the newly spawned job and return of an error
* code to the caller.
*
* By default, the spawned processes will be PMIx "connected" to
* the parent process upon successful launch (see PMIx_Connect
* description for details). Note that this only means that the
* parent process (a) will be given a copy of the new job's
* information so it can query job-level info without
* incurring any communication penalties, and (b) will receive
* notification of errors from processes in the child job.
*
* Job-level directives can be specified in the job_info array. This
* can include:
*
* (a) PMIX_NON_PMI - processes in the spawned job will
* not be calling PMIx_Init
*
* (b) PMIX_TIMEOUT - declare the spawn as having failed if the launched
* procs do not call PMIx_Init within the specified time
*
* (c) PMIX_NOTIFY_COMPLETION - notify the parent process when the
* child job terminates, either normally or with error
*/
PMIX_EXPORT pmix_status_t PMIx_Spawn(const pmix_info_t job_info[], size_t ninfo,
const pmix_app_t apps[], size_t napps,
pmix_nspace_t nspace);
/* Non-blocking form of the _PMIx_Spawn_ function. The callback
* will be executed upon launch of the specified applications,
* or upon failure to launch any of them. */
PMIX_EXPORT pmix_status_t PMIx_Spawn_nb(const pmix_info_t job_info[], size_t ninfo,
const pmix_app_t apps[], size_t napps,
pmix_spawn_cbfunc_t cbfunc, void *cbdata);
/* Record the specified processes as "connected". Both blocking and non-blocking
* versions are provided. This means that the resource manager should treat the
* failure of any process in the specified group as a reportable event, and take
* appropriate action. Note that different resource managers may respond to
* failures in different manners.
*
* The callback function is to be called once all participating processes have
* called connect. The server is required to return any job-level info for the
* connecting processes that they might not already have - i.e., if the connect
* request involves procs from different nspaces, then each proc shall receive
* the job-level info from those nspaces other than their own.
*
* Note: a process can only engage in _one_ connect operation involving the identical
* set of processes at a time. However, a process _can_ be simultaneously engaged
* in multiple connect operations, each involving a different set of processes
*
* As in the case of the fence operation, the info array can be used to pass
* user-level directives regarding the algorithm to be used for the collective
* operation involved in the "connect", timeout constraints, and other options
* available from the host RM */
PMIX_EXPORT pmix_status_t PMIx_Connect(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo);
PMIX_EXPORT pmix_status_t PMIx_Connect_nb(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
/* Disconnect a previously connected set of processes. An error will be returned
* if the specified set of procs was not previously "connected". As above, a process
* may be involved in multiple simultaneous disconnect operations. However, a process
* is not allowed to reconnect to a set of procs that has not fully completed
* disconnect - i.e., you have to fully disconnect before you can reconnect to the
* _same_ group of processes. The info array is used as above. */
PMIX_EXPORT pmix_status_t PMIx_Disconnect(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo);
/* Non-blocking form of PMIx_Disconnect. Takes the same procs/info arrays
 * as the blocking call, plus a pmix_op_cbfunc_t callback and the cbdata
 * pointer supplied by the caller. */
PMIX_EXPORT pmix_status_t PMIx_Disconnect_nb(const pmix_proc_t procs[], size_t nprocs,
                                             const pmix_info_t info[], size_t ninfo,
                                             pmix_op_cbfunc_t cbfunc, void *cbdata);
/* Given a node name, return an array of processes within the specified nspace
* on that node. If the nspace is NULL, then all processes on the node will
* be returned. If the specified node does not currently host any processes,
* then the returned array will be NULL, and nprocs=0. The caller is responsible
* for releasing the array when done with it - the PMIX_PROC_FREE macro is
* provided for this purpose.
*/
PMIX_EXPORT pmix_status_t PMIx_Resolve_peers(const char *nodename,
const pmix_nspace_t nspace,
pmix_proc_t **procs, size_t *nprocs);
/* Given an nspace, return the list of nodes hosting processes within
* that nspace. The returned string will contain a comma-delimited list
* of nodenames. The caller is responsible for releasing the string
* when done with it */
PMIX_EXPORT pmix_status_t PMIx_Resolve_nodes(const pmix_nspace_t nspace, char **nodelist);
/* Query information about the system in general - can include
* a list of active nspaces, network topology, etc. Also can be
* used to query node-specific info such as the list of peers
* executing on a given node. We assume that the host RM will
* exercise appropriate access control on the information.
*
* The following return status codes are provided in the callback:
*
* PMIX_SUCCESS - all data has been returned
* PMIX_ERR_NOT_FOUND - none of the requested data was available
* PMIX_ERR_PARTIAL_SUCCESS - some of the data has been returned
* PMIX_ERR_NOT_SUPPORTED - the host RM does not support this function
*/
PMIX_EXPORT pmix_status_t PMIx_Query_info(pmix_query_t queries[], size_t nqueries,
pmix_info_t **results, size_t *nresults);
PMIX_EXPORT pmix_status_t PMIx_Query_info_nb(pmix_query_t queries[], size_t nqueries,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Log data to a central data service/store, subject to the
* services offered by the host resource manager. The data to
* be logged is provided in the data array. The (optional) directives
* can be used to request specific storage options and direct
* the choice of storage option.
*
* The callback function will be executed when the log operation
* has been completed. The data array must be maintained until
* the callback is provided
*/
PMIX_EXPORT pmix_status_t PMIx_Log(const pmix_info_t data[], size_t ndata,
const pmix_info_t directives[], size_t ndirs);
PMIX_EXPORT pmix_status_t PMIx_Log_nb(const pmix_info_t data[], size_t ndata,
const pmix_info_t directives[], size_t ndirs,
pmix_op_cbfunc_t cbfunc, void *cbdata);
/* Request an allocation operation from the host resource manager.
* Several broad categories are envisioned, including the ability to:
*
* - request allocation of additional resources, including memory,
* bandwidth, and compute. This should be accomplished in a
* non-blocking manner so that the application can continue to
* progress while waiting for resources to become available. Note
* that the new allocation will be disjoint from (i.e., not
* affiliated with) the allocation of the requestor - thus the
* termination of one allocation will not impact the other.
*
* - extend the reservation on currently allocated resources, subject
* to scheduling availability and priorities. This includes extending
* the time limit on current resources, and/or requesting additional
* resources be allocated to the requesting job. Any additional
* allocated resources will be considered as part of the current
* allocation, and thus will be released at the same time.
*
* - release currently allocated resources that are no longer required.
* This is intended to support partial release of resources since all
* resources are normally released upon termination of the job. The
* identified use-cases include resource variations across discrete steps
* of a workflow, as well as applications that spawn sub-jobs and/or
* dynamically grow/shrink over time
*
* - "lend" resources back to the scheduler with an expectation of getting
* them back at some later time in the job. This can be a proactive
* operation (e.g., to save on computing costs when resources are
* temporarily not required), or in response to scheduler requests in
* lieu of preemption. A corresponding ability to "reacquire" resources
* previously released is included.
*/
PMIX_EXPORT pmix_status_t PMIx_Allocation_request(pmix_alloc_directive_t directive,
pmix_info_t *info, size_t ninfo,
pmix_info_t **results, size_t *nresults);
PMIX_EXPORT pmix_status_t PMIx_Allocation_request_nb(pmix_alloc_directive_t directive,
pmix_info_t *info, size_t ninfo,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Request a job control action. The targets array identifies the
* processes to which the requested job control action is to be applied.
* A NULL value can be used to indicate all processes in the caller's
* nspace. PMIX_RANK_WILDCARD can also be used to indicate
* that all processes in the given nspace are to be included.
*
* The directives are provided as pmix_info_t structs in the directives
* array. The callback function provides a status to indicate whether or
* not the request was granted, and to provide some information as to
* the reason for any denial in the pmix_info_cbfunc_t array of pmix_info_t
* structures. If non-NULL, then the specified release_fn must be called
* when the callback function completes - this will be used to release
* any provided pmix_info_t array.
*/
PMIX_EXPORT pmix_status_t PMIx_Job_control(const pmix_proc_t targets[], size_t ntargets,
const pmix_info_t directives[], size_t ndirs,
pmix_info_t **results, size_t *nresults);
PMIX_EXPORT pmix_status_t PMIx_Job_control_nb(const pmix_proc_t targets[], size_t ntargets,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Request that something be monitored - e.g., that the server monitor
* this process for periodic heartbeats as an indication that the process
* has not become "wedged". When a monitor detects the specified alarm
* condition, it will generate an event notification using the provided
* error code and passing along any available relevant information. It is
* up to the caller to register a corresponding event handler.
*
* Params:
*
* monitor: attribute indicating the type of monitor being requested - e.g.,
* PMIX_MONITOR_FILE to indicate that the requestor is asking that
* a file be monitored.
*
* error: the status code to be used when generating an event notification
* alerting that the monitor has been triggered. The range of the
* notification defaults to PMIX_RANGE_NAMESPACE - this can be
* changed by providing a PMIX_RANGE directive
*
* directives: characterize the monitoring request (e.g., monitor file size)
* and frequency of checking to be done
*
* cbfunc: provides a status to indicate whether or not the request was granted,
* and to provide some information as to the reason for any denial in
* the pmix_info_cbfunc_t array of pmix_info_t structures.
*
* Note: a process can send a heartbeat to the server using the PMIx_Heartbeat
* macro provided below*/
PMIX_EXPORT pmix_status_t PMIx_Process_monitor(const pmix_info_t *monitor, pmix_status_t error,
const pmix_info_t directives[], size_t ndirs,
pmix_info_t **results, size_t *nresults);
PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pmix_status_t error,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Convenience macro for sending a heartbeat to the server: it builds a
 * temporary pmix_info_t keyed with PMIX_SEND_HEARTBEAT, hands it to
 * PMIx_Process_monitor_nb with no directives and no callback, and then
 * destructs the temporary. The status returned by that call is discarded. */
#define PMIx_Heartbeat()                                                            \
    do {                                                                            \
        pmix_info_t _hb_info;                                                       \
        PMIX_INFO_CONSTRUCT(&_hb_info);                                             \
        PMIX_INFO_LOAD(&_hb_info, PMIX_SEND_HEARTBEAT, NULL, PMIX_POINTER);         \
        PMIx_Process_monitor_nb(&_hb_info, PMIX_SUCCESS, NULL, 0, NULL, NULL);      \
        PMIX_INFO_DESTRUCT(&_hb_info);                                              \
    } while(0)
/* Request a credential from the PMIx server/SMS.
* Input values include:
*
* info - an array of pmix_info_t structures containing any directives the
* caller may wish to pass. Typical usage might include:
* PMIX_TIMEOUT - how long to wait (in seconds) for a credential
* before timing out and returning an error
* PMIX_CRED_TYPE - a prioritized, comma-delimited list of desired
* credential types for use in environments where
* multiple authentication mechanisms may be
* available
*
* ninfo - number of elements in the info array
*
* cbfunc - the pmix_credential_cbfunc_t function to be called upon completion
* of the request
*
* cbdata - pointer to an object to be returned when cbfunc is called
*
* Returned values:
* PMIX_SUCCESS - indicates that the request has been successfully communicated to
* the local PMIx server. The response will be coming in the provided
* callback function.
*
* Any other value indicates an appropriate error condition. The callback function
* will _not_ be called in such cases.
*/
/* Blocking form: takes no callback function.
 * NOTE(review): presumably the credential is delivered through the
 * caller-provided pmix_byte_object_t - confirm against the implementation. */
PMIX_EXPORT pmix_status_t PMIx_Get_credential(const pmix_info_t info[], size_t ninfo,
pmix_byte_object_t *credential);
/* Non-blocking form: takes the cbfunc/cbdata pair described in the
 * preceding comment. */
PMIX_EXPORT pmix_status_t PMIx_Get_credential_nb(const pmix_info_t info[], size_t ninfo,
pmix_credential_cbfunc_t cbfunc, void *cbdata);
/* Request validation of a credential by the PMIx server/SMS
* Input values include:
*
* cred - pointer to a pmix_byte_object_t containing the credential
*
* info - an array of pmix_info_t structures containing any directives the
* caller may wish to pass. Typical usage might include:
* PMIX_TIMEOUT - how long to wait (in seconds) for validation
* before timing out and returning an error
* PMIX_USERID - the expected effective userid of the credential
* to be validated
* PMIX_GROUPID - the expected effective group id of the credential
* to be validated
*
* ninfo - number of elements in the info array
*
* cbfunc - the pmix_validation_cbfunc_t function to be called upon completion
* of the request
*
* cbdata - pointer to an object to be returned when cbfunc is called
*
* Returned values:
* PMIX_SUCCESS - indicates that the request has been successfully communicated to
* the local PMIx server. The response will be coming in the provided
* callback function.
*
* Any other value indicates an appropriate error condition. The callback function
* will _not_ be called in such cases.
*/
/* Blocking form: takes no callback function.
 * NOTE(review): presumably any information produced by the validation is
 * handed back through results/nresults - confirm against the
 * implementation, including who releases the returned array. */
PMIX_EXPORT pmix_status_t PMIx_Validate_credential(const pmix_byte_object_t *cred,
const pmix_info_t info[], size_t ninfo,
pmix_info_t **results, size_t *nresults);
/* Non-blocking form: takes the cbfunc/cbdata pair described in the
 * preceding comment. */
PMIX_EXPORT pmix_status_t PMIx_Validate_credential_nb(const pmix_byte_object_t *cred,
const pmix_info_t info[], size_t ninfo,
pmix_validation_cbfunc_t cbfunc, void *cbdata);
/* Define a callback function for delivering forwarded IO to a process
* This function will be called whenever data becomes available, or a
* specified buffering size and/or time has been met. The function
* will be passed the following values:
*
* iofhdlr - the returned registration number of the handler being invoked.
* This is required when deregistering the handler.
*
* channel - a bitmask identifying the channel the data arrived on
*
* source - the nspace/rank of the process that generated the data
*
* payload - pointer to character array containing the data. Note that
* multiple strings may be included, and that the array may
* _not_ be NULL terminated
*
* info - an optional array of info provided by the source containing
* metadata about the payload. This could include PMIX_IOF_COMPLETE
*
* ninfo - number of elements in the optional info array
*/
/* NOTE(review): the payload is described as possibly not NULL-terminated,
 * yet no length argument accompanies it in this signature - confirm how
 * the receiver is expected to determine the size of the payload. */
typedef void (*pmix_iof_cbfunc_t)(size_t iofhdlr, pmix_iof_channel_t channel,
pmix_proc_t *source, char *payload,
pmix_info_t info[], size_t ninfo);
/* Register to receive output forwarded from a remote process.
*
* procs - array of identifiers for sources whose IO is being
* requested. Wildcard rank indicates that all procs
* in the specified nspace are included in the request
*
* nprocs - number of identifiers in the procs array
*
* directives - optional array of attributes to control the
* behavior of the request. For example, this
* might include directives on buffering IO
* before delivery, and/or directives to include
* or exclude any backlogged data
*
* ndirs - number of elements in the directives array
*
* channel - bitmask of IO channels included in the request.
* NOTE: STDIN is not supported as it will always
* be delivered to the stdin file descriptor
*
* cbfunc - function to be called when relevant IO is received
*
* regcbfunc - since registration is async, this is the
* function to be called when registration is
* completed. The function itself will return
* a non-success error if the registration cannot
* be submitted - in this case, the regcbfunc
* will _not_ be called.
* If regcbfunc is NULL, then this will be treated
* as a BLOCKING call - a positive return value
* represents the reference ID for the request,
* while negative values indicate the corresponding
* error
*
* regcbdata - pointer to object to be returned in regcbfunc
*/
PMIX_EXPORT pmix_status_t PMIx_IOF_pull(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t directives[], size_t ndirs,
pmix_iof_channel_t channel, pmix_iof_cbfunc_t cbfunc,
pmix_hdlr_reg_cbfunc_t regcbfunc, void *regcbdata);
/* Deregister from output forwarded from a remote process.
*
* iofhdlr - the registration number returned from the
* call to PMIx_IOF_pull
*
* directives - optional array of attributes to control the
* behavior of the request. For example, this
* might include directives regarding what to
* do with any data currently in the IO buffer
* for this process
*
* ndirs - number of elements in the directives array
*
* cbfunc - function to be called when deregistration has
* been completed. Note that any IO to be flushed
* may continue to be received after deregistration
* has completed. If cbfunc is NULL, then this is
* treated as a BLOCKING call and the result of
* the operation will be provided in the returned status
*
* cbdata - pointer to object to be returned in cbfunc
*/
PMIX_EXPORT pmix_status_t PMIx_IOF_deregister(size_t iofhdlr,
const pmix_info_t directives[], size_t ndirs,
pmix_op_cbfunc_t cbfunc, void *cbdata);
/* Push data collected locally (typically from stdin) to
* stdin of target recipients.
*
* Parameters are listed in the order in which they appear in the signature.
*
* targets - array of process identifiers to which the data is to be delivered. Note
* that a WILDCARD rank indicates that all procs in the given nspace are
* to receive a copy of the data
*
* ntargets - number of procs in the targets array
*
* bo - pointer to a byte object containing the stdin data
*
* directives - optional array of attributes to control the
* behavior of the request. For example, this
* might include directives on buffering IO
* before delivery, and/or directives to include
* or exclude any backlogged data
*
* ndirs - number of elements in the directives array
*
* cbfunc - callback function when the data has been forwarded. If
* cbfunc is NULL, then this is treated as a BLOCKING call
* and the result of the operation will be provided in the
* returned status
*
* cbdata - object to be returned in cbfunc
*/
PMIX_EXPORT pmix_status_t PMIx_IOF_push(const pmix_proc_t targets[], size_t ntargets,
pmix_byte_object_t *bo,
const pmix_info_t directives[], size_t ndirs,
pmix_op_cbfunc_t cbfunc, void *cbdata);
/* Construct a new group composed of the specified processes and identified with
* the provided group identifier. Both blocking and non-blocking versions
* are provided (the callback function for the non-blocking form will be called
* once all specified processes have joined the group). The group identifier is
* a user-defined, NULL-terminated character array of length less than or equal
* to PMIX_MAX_NSLEN. Only characters accepted by standard string comparison
* functions (e.g., strncmp) are supported.
*
* Processes may engage in multiple simultaneous group construct operations as
* desired so long as each is provided with a unique group ID. The info array
* can be used to pass user-level directives regarding timeout constraints and
* other options available from the PMIx server.
*
* Naming note: the attribute array referred to above as the "info array" is
* the directives/ndirs pair in the blocking prototype and the info/ninfo pair
* in the non-blocking prototype.
*
* The construct leader (if PMIX_GROUP_LEADER is provided) or all participants
* will receive events (if registered for the PMIX_GROUP_MEMBER_FAILED event)
* whenever a process fails or terminates prior to calling
* PMIx_Group_construct(_nb) - the events will contain the identifier of the
* process that failed to join plus any other information that the resource
* manager provided. This provides an opportunity for the leader to react to
* the event - e.g., to invite an alternative member to the group or to decide
* to proceed with a smaller group. The decision to proceed with a smaller group
* is communicated to the PMIx library in the results array at the end of the
* event handler. This allows PMIx to properly adjust accounting for procedure
* completion. When construct is complete, the participating PMIx servers will
* be alerted to any change in participants and each group member will (if
* registered) receive a PMIX_GROUP_MEMBERSHIP_UPDATE event updating the group
* membership.
*
* Processes in a group under construction are not allowed to leave the group
* until group construction is complete. Upon completion of the construct
* procedure, each group member will have access to the job-level information
* of all nspaces represented in the group and the contact information for
* every group member.
*
* Failure of the leader at any time will cause a PMIX_GROUP_LEADER_FAILED event
* to be delivered to all participants so they can optionally declare a new leader.
* A new leader is identified by providing the PMIX_GROUP_LEADER attribute in
* the results array in the return of the event handler. Only one process is
* allowed to return that attribute, declaring itself as the new leader. Results
* of the leader selection will be communicated to all participants via a
* PMIX_GROUP_LEADER_SELECTED event identifying the new leader. If no leader
* was selected, then the status code provided in the event handler will provide
* an error value so the participants can take appropriate action.
*
* Any participant that returns PMIX_GROUP_CONSTRUCT_ABORT from the leader failed
* event handler will cause the construct process to abort. Those processes
* engaged in the blocking construct will return from the call with the
* PMIX_GROUP_CONSTRUCT_ABORT status. Non-blocking participants will have
* their callback function executed with that status.
*
* Some relevant attributes for this operation:
* PMIX_GROUP_LEADER - declare this process to be the leader of the construction
* procedure. If a process provides this attribute, then
* failure notification for any participating process will
* go only to that one process. In the absence of a
* declared leader, failure events go to all participants.
* PMIX_GROUP_OPTIONAL - participation is optional - do not return an error if
* any of the specified processes terminate
* without having joined (default=false)
* PMIX_GROUP_NOTIFY_TERMINATION - notify remaining members when another member
* terminates without first leaving the
* group (default=false)
* PMIX_GROUP_ASSIGN_CONTEXT_ID - requests that the RM assign a unique context
* ID (size_t) to the group. The value is returned
* in the PMIX_GROUP_CONSTRUCT_COMPLETE event
* PMIX_TIMEOUT - return an error if the group doesn't assemble within the
* specified number of seconds. Targets the scenario where a
* process fails to call PMIx_Group_construct due to hanging
*
*/
PMIX_EXPORT pmix_status_t PMIx_Group_construct(const char grp[],
const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t directives[], size_t ndirs,
pmix_info_t **results, size_t *nresults);
PMIX_EXPORT pmix_status_t PMIx_Group_construct_nb(const char grp[],
const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Explicitly invite specified processes to join a group.
*
* Each invited process will be notified of the invitation via the PMIX_GROUP_INVITED
* event. The processes being invited must have registered for the PMIX_GROUP_INVITED
* event in order to be notified of the invitation. When ready to respond, each invited
* process provides a response using the appropriate form of PMIx_Group_join. This will
* notify the inviting process that the invitation was either accepted (via the
* PMIX_GROUP_INVITE_ACCEPTED event) or declined (via the PMIX_GROUP_INVITE_DECLINED event).
* The inviting process will also receive PMIX_GROUP_MEMBER_FAILED events whenever a
* process fails or terminates prior to responding to the invitation.
*
* Upon accepting the invitation, both the inviting and invited process will receive
* access to the job-level information of each other's nspaces and the contact
* information of the other process.
*
* Some relevant attributes for this operation:
* PMIX_GROUP_ASSIGN_CONTEXT_ID - requests that the RM assign a unique context
* ID (size_t) to the group. The value is returned
* in the PMIX_GROUP_CONSTRUCT_COMPLETE event
* PMIX_TIMEOUT (int): return an error if the group doesn't assemble within the
* specified number of seconds. Targets the scenario where a
* process fails to respond (i.e., never calls PMIx_Group_join)
* due to hanging
*
* The inviting process is automatically considered the leader of the asynchronous
* group construction procedure and will receive all failure or termination events
* for invited members prior to completion. The inviting process is required to
* provide a PMIX_GROUP_CONSTRUCT_COMPLETE event once the group has been fully
* assembled - this event will be distributed to all participants along with the
* final membership.
*
* Failure of the leader at any time will cause a PMIX_GROUP_LEADER_FAILED event
* to be delivered to all participants so they can optionally declare a new leader.
* A new leader is identified by providing the PMIX_GROUP_LEADER attribute in
* the results array in the return of the event handler. Only one process is
* allowed to return that attribute, declaring itself as the new leader. Results
* of the leader selection will be communicated to all participants via a
* PMIX_GROUP_LEADER_SELECTED event identifying the new leader. If no leader
* was selected, then the status code provided in the event handler will provide
* an error value so the participants can take appropriate action.
*
* Any participant that returns PMIX_GROUP_CONSTRUCT_ABORT from the event
* handler will cause all participants to receive an event notifying them
* of that status.
*/
PMIX_EXPORT pmix_status_t PMIx_Group_invite(const char grp[],
const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_info_t **results, size_t *nresult);
PMIX_EXPORT pmix_status_t PMIx_Group_invite_nb(const char grp[],
const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Respond to an invitation to join a group that is being asynchronously constructed.
*
* The process must have registered for the PMIX_GROUP_INVITED event in order to be
* notified of the invitation. When ready to respond, the process provides a response
* using the appropriate form of PMIx_Group_join.
*
* Critical Note: Since the process is alerted to the invitation in a PMIx event handler,
* the process must not use the blocking form of this call unless it first "thread shifts"
* out of the handler and into its own thread context. Likewise, while it is safe to call
* the non-blocking form of the API from the event handler, the process must not block
* in the handler while waiting for the callback function to be called.
*
* Calling this function causes the group "leader" to be notified that the process has
* either accepted or declined the request. The blocking form of the API will return
* once the group has been completely constructed or the group's construction has failed
* (as determined by the leader) - likewise, the callback function of the non-blocking
* form will be executed upon the same conditions.
*
* Failure of the leader at any time will cause a PMIX_GROUP_LEADER_FAILED event
* to be delivered to all participants so they can optionally declare a new leader.
* A new leader is identified by providing the PMIX_GROUP_LEADER attribute in
* the results array in the return of the event handler. Only one process is
* allowed to return that attribute, declaring itself as the new leader. Results
* of the leader selection will be communicated to all participants via a
* PMIX_GROUP_LEADER_SELECTED event identifying the new leader. If no leader
* was selected, then the status code provided in the event handler will provide
* an error value so the participants can take appropriate action.
*
* Any participant that returns PMIX_GROUP_CONSTRUCT_ABORT from the leader failed
* event handler will cause all participants to receive an event notifying them
* of that status. Similarly, the leader may elect to abort the procedure
* by either returning PMIX_GROUP_CONSTRUCT_ABORT from the handler assigned
* to the PMIX_GROUP_INVITE_ACCEPTED or PMIX_GROUP_INVITE_DECLINED codes, or
* by generating an event for the abort code. Abort events will be sent to
* all invited participants.
*/
PMIX_EXPORT pmix_status_t PMIx_Group_join(const char grp[],
const pmix_proc_t *leader,
pmix_group_opt_t opt,
const pmix_info_t info[], size_t ninfo,
pmix_info_t **results, size_t *nresult);
PMIX_EXPORT pmix_status_t PMIx_Group_join_nb(const char grp[],
const pmix_proc_t *leader,
pmix_group_opt_t opt,
const pmix_info_t info[], size_t ninfo,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Leave a PMIx Group. Calls to PMIx_Group_leave (or its non-blocking form) will cause
* a PMIX_GROUP_LEFT event to be generated notifying all members of the group of the
* caller's departure. The function will return (or the non-blocking function will
* execute the specified callback function) once the event has been locally generated
* and is not indicative of remote receipt. All PMIx-based collectives such as
* PMIx_Fence in action across the group will automatically be adjusted if the
* collective was called with the PMIX_GROUP_FT_COLLECTIVE attribute (default is
* false) - otherwise, the standard error return behavior will be provided.
*
* Critical Note: The PMIx_Group_leave API is intended solely for asynchronous
* departures of individual processes from a group as it is not a scalable
* operation - i.e., when a process determines it should no longer be a part of a
* defined group, but the remainder of the group retains a valid reason to continue
* in existence. Developers are advised to use PMIx_Group_destruct (or its
* non-blocking form) for all other scenarios as it represents a more scalable
* operation.
*/
PMIX_EXPORT pmix_status_t PMIx_Group_leave(const char grp[],
const pmix_info_t info[], size_t ninfo);
PMIX_EXPORT pmix_status_t PMIx_Group_leave_nb(const char grp[],
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
/* Destruct a group identified by the provided group identifier. Both blocking and
* non-blocking versions are provided (the callback function for the non-blocking
* form will be called once all members of the group have called "destruct").
* Processes may engage in multiple simultaneous group destruct operations as
* desired so long as each involves a unique group ID. The info array can be used
* to pass user-level directives regarding timeout constraints and other options
* available from the PMIx server.
*
* Some relevant attributes for this operation:
*
* PMIX_TIMEOUT (int): return an error if the group doesn't destruct within the
* specified number of seconds. Targets the scenario where
* a process fails to call PMIx_Group_destruct due to hanging
*
* The destruct API will return an error if any group process fails or terminates
* prior to calling PMIx_Group_destruct or its non-blocking version unless the
* PMIX_GROUP_NOTIFY_TERMINATION attribute was provided (with a value of true) at
* time of group construction. If notification was requested, then an event will
* be delivered (using PMIX_GROUP_MEMBER_FAILED) for each process that fails to
* call destruct and the destruct tracker updated to account for the lack of
* participation. The PMIx_Group_destruct operation will subsequently return
* PMIX_SUCCESS when the remaining processes have all called destruct - i.e., the
* event will serve in place of return of an error.
*/
PMIX_EXPORT pmix_status_t PMIx_Group_destruct(const char grp[],
const pmix_info_t info[], size_t ninfo);
PMIX_EXPORT pmix_status_t PMIx_Group_destruct_nb(const char grp[],
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif