/* Listing metadata: openmpi/opal/mca/pmix/pmix_server.h - 273 lines, 15 KiB, C */

/*
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OPAL_PMIX_SERVER_H
#define OPAL_PMIX_SERVER_H
#include "opal_config.h"
#include "opal/types.h"
#include "opal/mca/pmix/pmix_types.h"
BEGIN_C_DECLS
/**** SERVER FUNCTION-SHIPPED APIs ****/
/* NOTE: for performance purposes, the host server is required to
* return as quickly as possible from all functions. Execution of
* the function is thus to be done asynchronously so as to allow
* the server support library to handle multiple client requests
* as quickly and scalably as possible.
*
* ALL data passed to the host server functions is "owned" by the
* server support library and MUST NOT be free'd. Data returned
* by the host server via callback function is owned by the host
* server, which is free to release it upon return from the callback */
/* Host-server hook: tells the host server that a client has connected
 * to the PMIx server.
 *
 *   proc           - process name passed with the notification
 *                    (presumably the connecting client - confirm)
 *   server_object  - opaque pointer handed over together with proc; its
 *                    meaning is not defined in this header
 *   cbfunc, cbdata - operation callback plus caller data (presumably
 *                    passed back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_client_connected_fn_t)(
    opal_process_name_t *proc,
    void *server_object,
    opal_pmix_op_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: tells the host server that a client called
 * pmix.finalize.
 *
 * The client stays blocked until the host server executes cbfunc; only
 * then can the server support library release the client.
 *
 *   proc           - process name passed with the notification
 *                    (presumably the finalizing client - confirm)
 *   server_object  - opaque pointer handed over together with proc; its
 *                    meaning is not defined in this header
 *   cbfunc, cbdata - callback that unblocks the client, plus caller data
 *                    (presumably passed back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_client_finalized_fn_t)(
    opal_process_name_t *proc,
    void *server_object,
    opal_pmix_op_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: a local client called pmix.abort.
 *
 * The client stays blocked until the host server executes cbfunc; only
 * then can the server support library release the client.
 *
 *   proc           - process name passed with the request (presumably
 *                    the client that called abort - confirm)
 *   server_object  - opaque pointer handed over together with proc; its
 *                    meaning is not defined in this header
 *   status, msg    - status value and message text accompanying the
 *                    abort (presumably supplied by the client - confirm)
 *   procs_to_abort - list of the processes that are to be terminated;
 *                    NULL means every proc in the client's nspace
 *   cbfunc, cbdata - callback that unblocks the client, plus caller data
 *                    (presumably passed back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_abort_fn_t)(
    opal_process_name_t *proc,
    void *server_object,
    int status,
    const char msg[],
    opal_list_t *procs_to_abort,
    opal_pmix_op_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: execute a fence on behalf of local clients.
 *
 * Invoked after at least one client called pmix.fence or pmix.fence_nb.
 * In both cases the host server is called through this non-blocking
 * function, and only once all participating local procs have
 * contributed. Every process in procs is required to take part in the
 * fence[_nb] operation. cbfunc is to be executed once each daemon that
 * hosts at least one participant has called the host server's fencenb
 * function.
 *
 *   procs          - list of the processes required to participate
 *   info           - list of opal_value_t with user directives on how
 *                    the fence is to be executed (e.g., timeout limits)
 *   data, ndata    - data to be collectively shared with all host
 *                    servers involved in the fence and returned in the
 *                    modex cbfunc; a NULL data value means the local
 *                    procs had nothing to contribute (ndata is
 *                    presumably the size of data - confirm)
 *   cbfunc, cbdata - modex callback plus caller data (presumably passed
 *                    back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_fencenb_fn_t)(
    opal_list_t *procs,
    opal_list_t *info,
    char *data,
    size_t ndata,
    opal_pmix_modex_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: direct modex request.
 *
 * The PMIx server uses this to ask its local host to contact the PMIx
 * server on the remote node hosting the specified proc, and to obtain
 * and return a direct modex blob for that proc.
 *
 *   proc           - the process whose modex blob is wanted
 *   info           - list of opal_value_t with user directives on how
 *                    the operation is to be executed (e.g., timeout
 *                    limits)
 *   cbfunc, cbdata - modex callback plus caller data (presumably the
 *                    blob is delivered through cbfunc - confirm)
 */
typedef int (*opal_pmix_server_dmodex_req_fn_t)(
    opal_process_name_t *proc,
    opal_list_t *info,
    opal_pmix_modex_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: publish data per the PMIx API specification.
 *
 * cbfunc is to be executed when the operation completes.
 *
 * Scope: the host server need not guarantee support for the requested
 * scope, i.e. it does not have to return an error when the data store
 * lacks scope-based isolation.
 *
 * Errors the server must return:
 *   (a) the key is duplicative within the storage scope;
 *   (b) the server does not allow published info to be overwritten by
 *       the original publisher. Whether info-key-based flags may modify
 *       this behavior is left to the discretion of the host server.
 *
 * The persist flag indicates how long the server should retain the data.
 * The nspace/rank of the publishing process is also provided and is
 * expected to be returned on any subsequent lookup request.
 *
 * NOTE(review): scope and persist are not separate parameters of this
 * signature; presumably they travel inside info - confirm.
 *
 *   proc           - presumably the publishing process - confirm
 *   info           - list passed with the request (presumably the data
 *                    to publish plus directives - confirm)
 *   cbfunc, cbdata - completion callback plus caller data (presumably
 *                    passed back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_publish_fn_t)(
    opal_process_name_t *proc,
    opal_list_t *info,
    opal_pmix_op_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: look up published data.
 *
 * The host server receives a NULL-terminated array of string keys along
 * with the scope within which the data is expected to have been
 * published. It need not guarantee support for all PMIx-defined scopes,
 * but should only search data stores within the specified scope within
 * the context of the corresponding "publish" API.
 *
 * The wait flag indicates whether the server should hold off until all
 * data has become available before executing the callback, or should
 * call back with whatever data is immediately available.
 *
 * NOTE(review): scope and the wait flag are not separate parameters of
 * this signature; presumably they travel inside info - confirm.
 *
 *   proc           - process name passed with the request (presumably
 *                    the requesting client - confirm)
 *   keys           - NULL-terminated array of string keys to look up
 *   info           - list of opal_value_t with user directives on how
 *                    the operation is to be executed (e.g., timeout
 *                    limits, whether the lookup should wait until data
 *                    appears)
 *   cbfunc, cbdata - lookup callback plus caller data (presumably passed
 *                    back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_lookup_fn_t)(
    opal_process_name_t *proc,
    char **keys,
    opal_list_t *info,
    opal_pmix_lookup_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: delete data from the data store.
 *
 * The host server receives a NULL-terminated array of string keys along
 * with the scope within which the data is expected to have been
 * published. cbfunc is to be executed once the delete procedure has
 * completed.
 *
 *   proc           - process name passed with the request (presumably
 *                    the requesting client - confirm)
 *   keys           - NULL-terminated array of string keys to delete
 *   info           - list passed with the request (presumably carries
 *                    the scope and other directives - confirm)
 *   cbfunc, cbdata - completion callback plus caller data (presumably
 *                    passed back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_unpublish_fn_t)(
    opal_process_name_t *proc,
    char **keys,
    opal_list_t *info,
    opal_pmix_op_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: spawn a set of applications/processes as per the
 * PMIx API.
 *
 * The applications are not required to be MPI or any other programming
 * model, so the host server cannot make assumptions about the support
 * they need. cbfunc is to be executed once all processes have been
 * started. A failure to start any application or process in the request
 * shall cause all applications and processes in the request to be
 * terminated and an error to be returned to the originating caller.
 *
 *   requestor      - process on whose behalf the spawn is requested
 *                    (inferred from the parameter name - confirm)
 *   job_info       - list passed with the request (presumably job-level
 *                    directives - confirm)
 *   apps           - list passed with the request (presumably the
 *                    applications to start - confirm)
 *   cbfunc, cbdata - spawn callback plus caller data (presumably passed
 *                    back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_spawn_fn_t)(
    opal_process_name_t *requestor,
    opal_list_t *job_info,
    opal_list_t *apps,
    opal_pmix_spawn_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: record the specified processes as "connected".
 *
 * Being connected means the resource manager should treat the failure
 * of any process in the group as a reportable event and take
 * appropriate action. cbfunc is to be called once all participating
 * processes have called connect.
 *
 * A process can engage in only *one* connect operation at a time for an
 * identical set of procs, but it *can* take part in several connect
 * operations simultaneously as long as each involves a different set of
 * procs.
 *
 *   procs          - list of the processes to be connected
 *   info           - list of opal_value_t with user directives on how
 *                    the operation is to be executed (e.g., timeout
 *                    limits)
 *   cbfunc, cbdata - completion callback plus caller data (presumably
 *                    passed back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_connect_fn_t)(
    opal_list_t *procs,
    opal_list_t *info,
    opal_pmix_op_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: disconnect a previously connected set of processes.
 *
 * An error should be returned if the specified set of procs was not
 * previously "connected". As with connect, a process may be involved in
 * multiple simultaneous disconnect operations. A process may not,
 * however, reconnect to a set that has not fully completed its
 * disconnect: the disconnect must finish before reconnecting to the
 * same group of processes.
 *
 *   procs          - list of the processes to be disconnected
 *   info           - list of opal_value_t with user directives on how
 *                    the operation is to be executed (e.g., timeout
 *                    limits)
 *   cbfunc, cbdata - completion callback plus caller data (presumably
 *                    passed back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_disconnect_fn_t)(
    opal_list_t *procs,
    opal_list_t *info,
    opal_pmix_op_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: register to receive notifications for the specified
 * events.
 *
 * The resource manager may have access to events beyond process
 * failure. When a client application asks to be notified of such
 * events, the request is passed to the PMIx server, which in turn shall
 * pass it on to the resource manager.
 *
 *   info           - list of opal_value_t providing the OPAL error codes
 *                    that correspond to the desired events
 *   cbfunc, cbdata - operation callback plus caller data (presumably
 *                    passed back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_register_events_fn_t)(
    opal_list_t *info,
    opal_pmix_op_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: deregister from the specified events.
 *
 *   info           - list of opal_value_t providing the OPAL error codes
 *                    that correspond to the desired events
 *   cbfunc, cbdata - operation callback plus caller data (presumably
 *                    passed back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_deregister_events_fn_t)(
    opal_list_t *info,
    opal_pmix_op_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: notify the specified processes of an event.
 *
 * The event was generated either by the PMIx server itself or by one of
 * its local clients. The RTE is requested to pass the notification to
 * each PMIx server that hosts one or more of the specified processes.
 *
 * NOTE(review): the signature has no dedicated parameter naming the
 * target processes; presumably they are conveyed through info - confirm.
 *
 *   code           - event code (presumably an OPAL status/error code -
 *                    confirm)
 *   source         - presumably the process that generated the event -
 *                    confirm
 *   info           - list passed with the notification; contents are not
 *                    described in this header
 *   cbfunc, cbdata - operation callback plus caller data (presumably
 *                    passed back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_notify_fn_t)(
    int code,
    opal_process_name_t *source,
    opal_list_t *info,
    opal_pmix_op_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: query the RTE for information.
 *
 *   requestor      - process on whose behalf the query is made (inferred
 *                    from the parameter name - confirm)
 *   queries        - list composed of opal_pmix_query_t items
 *   cbfunc, cbdata - info callback plus caller data (presumably the
 *                    answer is delivered through cbfunc - confirm)
 */
typedef int (*opal_pmix_server_query_fn_t)(
    opal_process_name_t *requestor,
    opal_list_t *queries,
    opal_pmix_info_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: register that a tool has connected to the server
 * and request that the tool be assigned a jobid for further
 * interactions.
 *
 * Unlike most hooks in this header, this one returns void.
 *
 *   info           - optional list of opal_value_t carrying qualifiers
 *                    for the connection request:
 *                      (a) OPAL_PMIX_USERID     - effective userid of
 *                                                 the tool
 *                      (b) OPAL_PMIX_GRPID      - effective groupid of
 *                                                 the tool
 *                      (c) OPAL_PMIX_FWD_STDOUT - forward any stdout to
 *                                                 this tool
 *                      (d) OPAL_PMIX_FWD_STDERR - forward any stderr to
 *                                                 this tool
 *                      (e) OPAL_PMIX_FWD_STDIN  - forward stdin from
 *                                                 this tool to any
 *                                                 processes spawned on
 *                                                 its behalf
 *   cbfunc, cbdata - tool-connection callback plus caller data
 *                    (presumably the assigned jobid is reported through
 *                    cbfunc - confirm)
 */
typedef void (*opal_pmix_server_tool_connection_fn_t)(
    opal_list_t *info,
    opal_pmix_tool_connection_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: log data on behalf of the client.
 *
 * Unlike most hooks in this header, this one returns void.
 *
 *   requestor      - presumably the client the data is logged for -
 *                    confirm
 *   info           - list passed with the request (presumably the data
 *                    to log - confirm)
 *   directives     - list passed with the request (presumably controls
 *                    how the data is logged - confirm)
 *   cbfunc, cbdata - operation callback plus caller data (presumably
 *                    passed back to cbfunc - confirm)
 */
typedef void (*opal_pmix_server_log_fn_t)(
    opal_process_name_t *requestor,
    opal_list_t *info,
    opal_list_t *directives,
    opal_pmix_op_cbfunc_t cbfunc,
    void *cbdata);
/* Callback invoked for an incoming connection request from a local
 * client.
 *
 *   incoming_sd - socket descriptor of the incoming connection
 *                 (inferred from the parameter name - confirm)
 */
typedef void (*opal_pmix_connection_cbfunc_t)(
    int incoming_sd);
/* Host-server hook: register a socket that the host server can monitor
 * for connection requests, harvest them, and then call our internal
 * callback function for further processing.
 *
 * A listener thread is essential to efficiently harvesting connection
 * requests from large numbers of local clients, such as occur when
 * running on large SMPs. The host server listener is required to call
 * accept on the incoming connection request and then pass the resulting
 * socket to the provided cbfunc.
 *
 * A NULL for this function will cause the internal PMIx server to spawn
 * its own listener thread.
 *
 *   listening_sd - the socket to be monitored for connection requests
 *   cbfunc       - callback that is given each accepted socket
 */
typedef int (*opal_pmix_server_listener_fn_t)(
    int listening_sd,
    opal_pmix_connection_cbfunc_t cbfunc);
/* Host-server hook: request allocation modifications on behalf of a
 * client.
 *
 *   client         - the client the request is made for
 *   directive      - allocation directive (presumably selects the kind
 *                    of modification - confirm)
 *   data           - list passed with the request; contents are not
 *                    described in this header
 *   cbfunc, cbdata - info callback plus caller data (presumably passed
 *                    back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_alloc_fn_t)(
    const opal_process_name_t *client,
    opal_pmix_alloc_directive_t directive,
    opal_list_t *data,
    opal_pmix_info_cbfunc_t cbfunc,
    void *cbdata);
/* Host-server hook: execute a job control action on behalf of a client.
 *
 *   requestor      - the client the action is executed for
 *   targets        - list passed with the request (presumably the
 *                    processes the action applies to - confirm)
 *   directives     - list passed with the request (presumably describes
 *                    the action to take - confirm)
 *   cbfunc, cbdata - info callback plus caller data (presumably passed
 *                    back to cbfunc - confirm)
 */
typedef int (*opal_pmix_server_job_control_fn_t)(
    const opal_process_name_t *requestor,
    opal_list_t *targets,
    opal_list_t *directives,
    opal_pmix_info_cbfunc_t cbfunc,
    void *cbdata);
/* we do not provide a monitoring capability */
/* Table of host-server entry points: one function pointer for each of
 * the server function types declared in this header. Member order is
 * part of the structure layout and must not be changed. */
typedef struct opal_pmix_server_module_1_0_0_t {
    /* client lifecycle */
    opal_pmix_server_client_connected_fn_t   client_connected;   /* a client connected */
    opal_pmix_server_client_finalized_fn_t   client_finalized;   /* a client called pmix.finalize */
    opal_pmix_server_abort_fn_t              abort;              /* a local client called pmix.abort */

    /* data exchange */
    opal_pmix_server_fencenb_fn_t            fence_nb;           /* fence / fence_nb */
    opal_pmix_server_dmodex_req_fn_t         direct_modex;       /* direct modex request */

    /* publish / lookup */
    opal_pmix_server_publish_fn_t            publish;            /* publish data */
    opal_pmix_server_lookup_fn_t             lookup;             /* look up published data */
    opal_pmix_server_unpublish_fn_t          unpublish;          /* delete data from the data store */

    /* process management */
    opal_pmix_server_spawn_fn_t              spawn;              /* spawn applications/processes */
    opal_pmix_server_connect_fn_t            connect;            /* record procs as "connected" */
    opal_pmix_server_disconnect_fn_t         disconnect;         /* disconnect previously connected procs */

    /* events */
    opal_pmix_server_register_events_fn_t    register_events;    /* register for event notifications */
    opal_pmix_server_deregister_events_fn_t  deregister_events;  /* deregister from events */
    opal_pmix_server_notify_fn_t             notify_event;       /* pass on an event notification */

    /* queries, tools and logging */
    opal_pmix_server_query_fn_t              query;              /* query the RTE for information */
    opal_pmix_server_tool_connection_fn_t    tool_connected;     /* a tool connected to the server */
    opal_pmix_server_log_fn_t                log;                /* log data on behalf of a client */

    /* connection handling; NULL makes the internal PMIx server spawn
     * its own listener thread */
    opal_pmix_server_listener_fn_t           listener;

    /* allocation and job control */
    opal_pmix_server_alloc_fn_t              allocate;           /* allocation modifications */
    opal_pmix_server_job_control_fn_t        job_control;        /* job control actions */
} opal_pmix_server_module_t;
END_C_DECLS
#endif