/*
 * Listing metadata captured together with this file (not part of the
 * header itself):
 *   openmpi/opal/mca/pmix/pmix.h - 464 lines, 22 KiB, C
 */

/*
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OPAL_PMIX_H
#define OPAL_PMIX_H
#include "opal_config.h"
#include "opal/types.h"
#include "opal/mca/mca.h"
#include "opal/mca/event/event.h"
#include "opal/dss/dss.h"
#include "opal/runtime/opal.h"
#include "opal/mca/dstore/dstore.h"
#include "opal/dss/dss.h"
#include "opal/util/error.h"
#include "opal/util/proc.h"
BEGIN_C_DECLS
/* Maximum sizes used by the PMIx framework: one for values, and one each
 * for the key and value buffers of an INFO entry */
#define PMIX_MAX_VALLEN   1024
#define PMIX_MAX_INFO_KEY 255
#define PMIX_MAX_INFO_VAL 1024
/**
 * INFO object corresponding to the MPI_Info structure: one key/value
 * pair held in fixed-size character buffers. The object starts with an
 * opal_list_item_t and is declared as an OPAL class.
 */
typedef struct {
    opal_list_item_t super;          /* list-item base of the object */
    char key[PMIX_MAX_INFO_KEY];     /* info key */
    char value[PMIX_MAX_INFO_VAL];   /* info value */
} pmix_info_t;
OBJ_CLASS_DECLARATION(pmix_info_t);
/* Scope attached to data "put" via PMIx. The scope states which
 * requesting processes the data is meant for:
 *
 * PMIX_INTERNAL - the data is used internally only
 * PMIX_LOCAL    - the data is intended only for other application
 *                 processes on the same node. Data marked in this way
 *                 will not be included in data packages sent to remote
 *                 requestors
 * PMIX_REMOTE   - the data is intended solely for application processes
 *                 on remote nodes. Data marked in this way will not be
 *                 shared with other processes on the same node
 * PMIX_GLOBAL   - the data is to be shared with all other requesting
 *                 processes, regardless of location
 */
typedef uint8_t opal_pmix_scope_t;
#define PMIX_SCOPE_T     OPAL_UINT8
#define PMIX_SCOPE_UNDEF 0
#define PMIX_INTERNAL    1   /* data used internally only */
#define PMIX_LOCAL       2   /* share to procs also on this node */
#define PMIX_REMOTE      3   /* share with procs not on this node */
#define PMIX_GLOBAL      4   /* share with all procs (local + remote) */
/**
 * Callback function for non-blocking operations.
 *
 * status - integer status handed to the callback
 * kv     - opal_value_t handed to the callback
 * cbdata - opaque pointer; presumably the cbdata given when the
 *          operation was started (confirm against the active component)
 */
typedef void (*opal_pmix_cbfunc_t)(int status,
                                   opal_value_t *kv,
                                   void *cbdata);
/* Flags stating where a modex value pushed into the PMIx server comes
 * from: an element that requires a synchronous (i.e., blocking) modex
 * operation, or an element that is ready to support async modex
 * operations */
#define PMIX_SYNC_REQD true    /* blocking modex required */
#define PMIX_ASYNC_RDY false   /* async modex supported */
/* A set of "standard" PMIx attributes that can be queried.
 * Implementations (and users) are free to extend the set as desired, so
 * the get_attr functions need to be capable of handling the "not found"
 * condition. These are attributes of the system and the job - i.e.,
 * values provided by the resource manager - as opposed to values the
 * application (or underlying MPI library) might choose to expose.
 *
 * The trailing comment on each key gives the type of the value
 * associated with it. */
#define PMIX_ATTR_UNDEF     NULL

#define PMIX_CPUSET         "pmix.cpuset"     /* (char*) hwloc bitmap applied to proc upon launch */
#define PMIX_CREDENTIAL     "pmix.cred"       /* (opal_byte_object*) security credential assigned to proc */
#define PMIX_HOSTNAME       "pmix.hname"      /* (char*) name of the host this proc is on */

/* scratch directory locations for use by applications */
#define PMIX_TMPDIR         "pmix.tmpdir"     /* (char*) top-level tmp dir assigned to session */

/* information about relative ranks as assigned */
#define PMIX_JOBID          "pmix.jobid"      /* (char*) jobid assigned by scheduler */
#define PMIX_APPNUM         "pmix.appnum"     /* (uint32_t) app number within the job */
#define PMIX_RANK           "pmix.rank"       /* (uint32_t) process rank within the job */
#define PMIX_GLOBAL_RANK    "pmix.grank"      /* (uint32_t) rank spanning across all jobs in this session */
#define PMIX_APP_RANK       "pmix.apprank"    /* (uint32_t) rank within this app */
#define PMIX_NPROC_OFFSET   "pmix.offset"     /* (uint32_t) starting global rank of this job */
#define PMIX_LOCAL_RANK     "pmix.lrank"      /* (uint16_t) rank on this node within this job */
#define PMIX_NODE_RANK      "pmix.nrank"      /* (uint16_t) rank on this node spanning all jobs */
#define PMIX_LOCALLDR       "pmix.lldr"       /* (uint64_t) opal_identifier of lowest rank on this node within this job */
#define PMIX_APPLDR         "pmix.aldr"       /* (uint32_t) lowest rank in this app within this job */
#define PMIX_NODE_ID        "pmix.nodeid"     /* (uint32_t) vpid of daemon hosting specified proc */

/* proc location-related info */
#define PMIX_PROC_MAP       "pmix.map"        /* (byte_object) packed map of proc locations within this job */
#define PMIX_LOCAL_PEERS    "pmix.lpeers"     /* (char*) comma-delimited string of ranks on this node within this job */
#define PMIX_LOCAL_CPUSETS  "pmix.lcpus"      /* (byte_object) packed names and cpusets of local peers */

/* size info */
#define PMIX_UNIV_SIZE      "pmix.univ.size"  /* (uint32_t) #procs in this namespace */
#define PMIX_JOB_SIZE       "pmix.job.size"   /* (uint32_t) #procs in this job */
#define PMIX_LOCAL_SIZE     "pmix.local.size" /* (uint32_t) #procs in this job on this node */
#define PMIX_NODE_SIZE      "pmix.node.size"  /* (uint32_t) #procs across all jobs on this node */
#define PMIX_MAX_PROCS      "pmix.max.size"   /* (uint32_t) max #procs for this job */

/* topology info */
#define PMIX_NET_TOPO       "pmix.ntopo"      /* (byte_object) network topology */
#define PMIX_LOCAL_TOPO     "pmix.ltopo"      /* (hwloc topo) local node topology */
/**
 * Simplified macro for sending one typed value via modex to other
 * processes. The macro takes six arguments:
 *
 * r  - lvalue that receives the integer return status from the modex op
 * f  - whether this modex requires sync (PMIX_SYNC_REQD) or is async
 *      ready (PMIX_ASYNC_RDY)
 * sc - the PMIX scope of the data
 * s  - the key to tag the data being posted
 * d  - pointer to the data object being posted
 * t  - the type of the data
 *
 * Passing PMIX_SYNC_REQD sets opal_pmix_use_collective to true. A
 * failure of either the value load or the put is logged through
 * OPAL_ERROR_LOG and left in r.
 *
 * NOTE: r is evaluated more than once, and the expansion itself ends
 * with a semicolon, so the macro cannot be the unbraced body of an "if"
 * that has an "else" branch.
 */
#define OPAL_MODEX_SEND_VALUE(r, f, sc, s, d, t)                         \
    do {                                                                 \
        opal_value_t kv;                                                 \
        if (PMIX_SYNC_REQD == (f)) {                                     \
            opal_pmix_use_collective = true;                             \
        }                                                                \
        OBJ_CONSTRUCT(&kv, opal_value_t);                                \
        kv.key = (s);                                                    \
        (r) = opal_value_load(&kv, (d), (t));                            \
        if (OPAL_SUCCESS == (r)) {                                       \
            /* only a successfully loaded value is put */                \
            (r) = opal_pmix.put((sc), &kv);                              \
        }                                                                \
        if (OPAL_SUCCESS != (r)) {                                       \
            OPAL_ERROR_LOG((r));                                         \
        }                                                                \
        /* do not destruct the keyval as we don't own */                 \
        /* the data - the caller will take care of the */                \
        /* key and value storage, and the kv itself has none */          \
    } while(0);
/**
 * Simplified macro for sending a block of bytes, tagged with a string
 * key, via modex to other processes. The macro takes six arguments:
 *
 * r  - lvalue that receives the integer return status from the modex op
 * f  - whether this modex requires sync (PMIX_SYNC_REQD) or is async
 *      ready (PMIX_ASYNC_RDY)
 * sc - the PMIX scope of the data
 * s  - the key to tag the data being posted
 * d  - the data object being posted
 * sz - the number of bytes in the data object
 *
 * The data is posted as an OPAL_BYTE_OBJECT. Passing PMIX_SYNC_REQD sets
 * opal_pmix_use_collective to true. A failed put is logged through
 * OPAL_ERROR_LOG and its status is left in r.
 *
 * NOTE: r is evaluated more than once, and the expansion itself ends
 * with a semicolon, so the macro cannot be the unbraced body of an "if"
 * that has an "else" branch.
 */
#define OPAL_MODEX_SEND_STRING(r, f, sc, s, d, sz)                       \
    do {                                                                 \
        opal_value_t kv;                                                 \
        if (PMIX_SYNC_REQD == (f)) {                                     \
            opal_pmix_use_collective = true;                             \
        }                                                                \
        OBJ_CONSTRUCT(&kv, opal_value_t);                                \
        /* the kv only points at the caller's key and bytes */           \
        kv.key = (s);                                                    \
        kv.type = OPAL_BYTE_OBJECT;                                      \
        kv.data.bo.bytes = (uint8_t*)(d);                                \
        kv.data.bo.size = (sz);                                          \
        (r) = opal_pmix.put((sc), &kv);                                  \
        if (OPAL_SUCCESS != (r)) {                                       \
            OPAL_ERROR_LOG((r));                                         \
        }                                                                \
        /* protect the caller's key and data: clear both */              \
        /* pointers before the kv is destructed */                       \
        kv.key = NULL;                                                   \
        kv.data.bo.bytes = NULL;                                         \
        OBJ_DESTRUCT(&kv);                                               \
    } while(0);
/**
 * Simplified macro for sending data via modex to other processes, keyed
 * by the MCA component that posts it. The macro takes six arguments:
 *
 * r  - lvalue that receives the integer return status from the modex op
 * f  - whether this modex requires sync (PMIX_SYNC_REQD) or is async
 *      ready (PMIX_ASYNC_RDY)
 * sc - the PMIX scope of the data
 * s  - the MCA component that is posting the data
 * d  - the data object being posted
 * sz - the number of bytes in the data object
 *
 * The key is obtained from mca_base_component_to_string() and freed
 * again after the send. If no key string could be obtained, nothing is
 * sent: OPAL_ERR_OUT_OF_RESOURCE is logged and stored in r, mirroring
 * what OPAL_MODEX_RECV does for the same condition. The collective flag
 * is still set in that case when PMIX_SYNC_REQD was passed.
 *
 * NOTE: f is evaluated twice (here and inside OPAL_MODEX_SEND_STRING),
 * and the expansion itself ends with a semicolon, so the macro cannot be
 * the unbraced body of an "if" that has an "else" branch.
 */
#define OPAL_MODEX_SEND(r, f, sc, s, d, sz)                              \
    do {                                                                 \
        char *key;                                                       \
        if (PMIX_SYNC_REQD == (f)) {                                     \
            opal_pmix_use_collective = true;                             \
        }                                                                \
        key = mca_base_component_to_string((s));                         \
        if (NULL == key) {                                               \
            /* never hand a NULL key to the put operation */             \
            OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);                    \
            (r) = OPAL_ERR_OUT_OF_RESOURCE;                              \
        } else {                                                         \
            OPAL_MODEX_SEND_STRING((r), (f), (sc), key, (d), (sz));      \
            free(key);                                                   \
        }                                                                \
    } while(0);
/**
 * Simplified macro for retrieving one typed modex value from another
 * process:
 *
 * r - lvalue that receives the integer return status from the modex
 *     op (int)
 * s - string key (char*)
 * p - pointer to the opal_proc_t of the proc that posted
 *     the data (opal_proc_t*)
 * d - pointer to a location wherein the data object
 *     is to be returned
 * t - the expected data type
 *
 * On a successful get the value is unloaded into *d, r takes the status
 * of the unload, and the retrieved opal_value_t is released. If the get
 * fails, *d is set to NULL and r holds the get status. If the get
 * reports success but hands back no value, *d is set to NULL and r is
 * left as returned by the get - the same treatment that
 * OPAL_MODEX_RECV_STRING gives this case - instead of passing a NULL
 * value on to opal_value_unload() and OBJ_RELEASE().
 *
 * NOTE: the expansion itself ends with a semicolon, so the macro cannot
 * be the unbraced body of an "if" that has an "else" branch.
 */
#define OPAL_MODEX_RECV_VALUE(r, s, p, d, t)                             \
    do {                                                                 \
        opal_value_t *kv = NULL;                                         \
        if (OPAL_SUCCESS == ((r) = opal_pmix.get(&(p)->proc_name,        \
                                                 (s), &kv)) &&           \
            NULL != kv) {                                                \
            (r) = opal_value_unload(kv, (void**)(d), (t));               \
            OBJ_RELEASE(kv);                                             \
        } else {                                                         \
            /* get failed, or succeeded without a value */               \
            *(d) = NULL;                                                 \
        }                                                                \
    } while(0);
/**
 * Simplified macro for retrieving a block of modex bytes, tagged with a
 * string key, from another process:
 *
 * r  - lvalue that receives the integer return status from the modex
 *      op (int)
 * s  - string key (char*)
 * p  - pointer to the opal_proc_t of the proc that posted
 *      the data (opal_proc_t*)
 * d  - pointer to a location wherein the data object
 *      is to be returned (char**)
 * sz - pointer to a location wherein the number of bytes
 *      in the data object can be returned (size_t*)
 *
 * When the get succeeds and yields a value, the byte pointer and size
 * are copied out of it, the value's own byte pointer is cleared, and
 * the value is released. In every other case *d is set to NULL and *sz
 * to 0; r always holds the status returned by the get.
 *
 * NOTE: r is evaluated more than once, and the expansion itself ends
 * with a semicolon, so the macro cannot be the unbraced body of an "if"
 * that has an "else" branch.
 */
#define OPAL_MODEX_RECV_STRING(r, s, p, d, sz)                           \
    do {                                                                 \
        opal_value_t *kv;                                                \
        (r) = opal_pmix.get(&(p)->proc_name, (s), &kv);                  \
        if (OPAL_SUCCESS != (r) || NULL == kv) {                         \
            *(d) = NULL;                                                 \
            *(sz) = 0;                                                   \
        } else {                                                         \
            *(d) = kv->data.bo.bytes;                                    \
            *(sz) = kv->data.bo.size;                                    \
            kv->data.bo.bytes = NULL; /* protect the data */             \
            OBJ_RELEASE(kv);                                             \
        }                                                                \
    } while(0);
/**
 * Simplified macro for retrieving modex data from another process,
 * keyed by the MCA component that posted it:
 *
 * r  - lvalue that receives the integer return status from the modex
 *      op (int)
 * s  - the MCA component that posted the data (mca_base_component_t*)
 * p  - pointer to the opal_proc_t of the proc that posted
 *      the data (opal_proc_t*)
 * d  - pointer to a location wherein the data object
 *      is to be returned (char**)
 * sz - pointer to a location wherein the number of bytes
 *      in the data object can be returned (size_t*)
 *
 * The key is obtained from mca_base_component_to_string() and freed
 * again after the retrieval. If no key string could be obtained,
 * OPAL_ERR_OUT_OF_RESOURCE is logged and stored in r, and neither *d nor
 * *sz is written.
 *
 * NOTE: the expansion itself ends with a semicolon, so the macro cannot
 * be the unbraced body of an "if" that has an "else" branch.
 */
#define OPAL_MODEX_RECV(r, s, p, d, sz)                                  \
    do {                                                                 \
        char *key = mca_base_component_to_string((s));                   \
        if (NULL != key) {                                               \
            OPAL_MODEX_RECV_STRING((r), key, (p), (d), (sz));            \
            free(key);                                                   \
        } else {                                                         \
            OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);                    \
            (r) = OPAL_ERR_OUT_OF_RESOURCE;                              \
        }                                                                \
    } while(0);
/**
 * Simplified macro for calling the fence function that takes into
 * account directives and availability of non-blocking operations:
 *
 * p  - the procs argument handed to the fence call
 * s  - the nprocs argument handed to the fence call
 * cf - callback function, used only by the non-blocking fence
 * cd - callback data, used only by the non-blocking fence
 *
 * The non-blocking fence is used only when opal_pmix_use_collective is
 * false and the module provides fence_nb; otherwise the blocking fence
 * is called. The return value of whichever function is called is
 * discarded.
 *
 * NOTE: the expansion itself ends with a semicolon, so the macro cannot
 * be the unbraced body of an "if" that has an "else" branch.
 */
#define OPAL_FENCE(p, s, cf, cd)                                         \
    do {                                                                 \
        if (!opal_pmix_use_collective && NULL != opal_pmix.fence_nb) {   \
            opal_pmix.fence_nb((p), (s), (cf), (cd));                    \
        } else {                                                         \
            opal_pmix.fence((p), (s));                                   \
        }                                                                \
    } while(0);
/**
 * Callback handler for errors.
 *
 * error - integer error code handed to the handler
 */
typedef void (*opal_pmix_errhandler_fn_t)(int error);
/**** DEFINE THE PUBLIC APIs ****
**** NOTE THAT WE DO NOT HAVE A 1:1 MAPPING OF APIs ****
**** HERE TO THOSE CURRENTLY DEFINED BY PMI AS WE ****
**** DON'T USE SOME OF THOSE FUNCTIONS AND THIS ISN'T ****
**** A GENERAL LIBRARY ****/
/***** APIs CURRENTLY USED IN THE OMPI/ORTE CODE BASE ****/
/* NOTE: calls to these APIs must be thread-protected as there
* currently is NO internal thread safety. */
/** Init - takes no arguments and returns an int status */
typedef int (*opal_pmix_base_module_init_fn_t)(void);

/** Finalize - takes no arguments and returns an int status */
typedef int (*opal_pmix_base_module_fini_fn_t)(void);

/** Initialized - takes no arguments and returns a bool */
typedef bool (*opal_pmix_base_module_initialized_fn_t)(void);

/** Abort - takes an integer flag and a message string, returns an int status */
typedef int (*opal_pmix_base_module_abort_fn_t)(int flag,
                                                const char msg[]);
/**
 * Fence - this call is required to commit any data "put" to the system
 * since the last call to "fence", prior to (or as part of) executing
 * the barrier. Serves both PMI2 and PMI1 "barrier" purposes.
 *
 * procs  - process names handed to the fence (presumably the
 *          participants - confirm against the active component)
 * nprocs - count that accompanies procs
 */
typedef int (*opal_pmix_base_module_fence_fn_t)(opal_process_name_t *procs,
                                                size_t nprocs);

/**
 * Fence_nb - not included in the current PMI standard. This is the
 * non-blocking version of the standard "fence" call. All subsequent
 * "get" calls will block pending completion of this operation.
 * Non-blocking "get" calls will still complete as data becomes
 * available.
 *
 * procs, nprocs - as for the blocking fence
 * cbfunc        - callback function supplied by the caller
 * cbdata        - caller data that accompanies cbfunc
 */
typedef int (*opal_pmix_base_module_fence_nb_fn_t)(opal_process_name_t *procs,
                                                   size_t nprocs,
                                                   opal_pmix_cbfunc_t cbfunc,
                                                   void *cbdata);
/**
 * Put - this API has been modified from the current PMI standard to
 * reflect the proposed PMIx extensions.
 *
 * scope - the scope of the data being put
 * kv    - the key/value being put
 */
typedef int (*opal_pmix_base_module_put_fn_t)(opal_pmix_scope_t scope,
                                              opal_value_t *kv);

/**
 * Get - this API has been modified from the current PMI standard to
 * reflect the proposed PMIx extensions, and to include the process
 * identifier so that the PMI key can be formed within the active
 * component instead of sprinkling that code all over the code base.
 *
 * id  - identifier of the process whose data is requested
 * key - the key of the requested data
 * kv  - location through which the retrieved opal_value_t is handed back
 */
typedef int (*opal_pmix_base_module_get_fn_t)(const opal_process_name_t *id,
                                              const char *key,
                                              opal_value_t **kv);

/**
 * Get_nb - not included in the current PMI standard. This is the
 * non-blocking version of the standard "get" call. The retrieved value
 * will be provided as an opal_value_t object in the callback. The
 * process identifier is included so that the PMI key can be formed
 * within the active component instead of sprinkling that code all over
 * the code base.
 *
 * id     - identifier of the process whose data is requested
 * key    - the key of the requested data
 * cbfunc - callback that receives the retrieved value
 * cbdata - caller data that accompanies cbfunc
 */
typedef void (*opal_pmix_base_module_get_nb_fn_t)(const opal_process_name_t *id,
                                                  const char *key,
                                                  opal_pmix_cbfunc_t cbfunc,
                                                  void *cbdata);
/**
 * Publish
 *
 * service_name - name of the service
 * info         - a list of pmix_info_t objects
 * port         - port string associated with the service
 */
typedef int (*opal_pmix_base_module_publish_fn_t)(const char service_name[],
                                                  opal_list_t *info,
                                                  const char port[]);

/**
 * Lookup
 *
 * service_name - name of the service
 * info         - a list of pmix_info_t objects
 * port         - writable character buffer (presumably receives the
 *                port that was found - confirm against the active
 *                component)
 * portLen      - length that accompanies the port buffer
 */
typedef int (*opal_pmix_base_module_lookup_fn_t)(const char service_name[],
                                                 opal_list_t *info,
                                                 char port[],
                                                 int portLen);

/**
 * Unpublish
 *
 * service_name - name of the service
 * info         - a list of pmix_info_t objects
 */
typedef int (*opal_pmix_base_module_unpublish_fn_t)(const char service_name[],
                                                    opal_list_t *info);
/**
 * Get attribute - query the server for the specified attribute,
 * returning it through kv. The function will return "true" if the
 * attribute is found, and "false" if not. Attributes are provided by
 * the PMIx server, so there is no corresponding "put" function.
 *
 * attr - the attribute being queried
 * kv   - location through which the attribute's opal_value_t is returned
 */
typedef bool (*opal_pmix_base_module_get_attr_fn_t)(const char *attr,
                                                    opal_value_t **kv);

/**
 * Get attribute (non-blocking) - query the server for the specified
 * attribute. Attributes are provided by the PMIx server, so there is no
 * corresponding "put" function. The call will be executed as
 * non-blocking, returning immediately, with data resulting from the
 * call returned in the callback function. A returned NULL opal_value_t*
 * indicates that the attribute was not found. The returned pointer is
 * "owned" by the PMIx module and must not be released by the callback
 * function.
 *
 * attr   - the attribute being queried
 * cbfunc - callback that receives the result
 * cbdata - caller data that accompanies cbfunc
 */
typedef int (*opal_pmix_base_module_get_attr_nb_fn_t)(const char *attr,
                                                      opal_pmix_cbfunc_t cbfunc,
                                                      void *cbdata);
/**** APIs NOT CURRENTLY USED IN THE OMPI/ORTE CODE BASE, BUT THAT ****
 **** MAY BE IMPLEMENTED IN THE NEAR FUTURE. COMPONENTS ARE FREE TO ****
 **** JUST HAVE THEM RETURN "OPAL_ERR_NOT_IMPLEMENTED"              ****/

/**
 * PMI2_Job_Spawn
 *
 * The parameter list follows the PMI2 call named above; the meaning of
 * the individual parameters is not spelled out in this header, so
 * consult the PMI2 definition of Job_Spawn before implementing it.
 */
typedef int (*opal_pmix_base_module_spawn_fn_t)(int count,
                                                const char * cmds[],
                                                int argcs[],
                                                const char ** argvs[],
                                                const int maxprocs[],
                                                opal_list_t *info_keyval_vector,
                                                opal_list_t *preput_keyval_vector,
                                                char jobId[],
                                                int jobIdSize,
                                                int errors[]);
/** PMI2_Job_Connect - takes a job id string and returns an int status */
typedef int (*opal_pmix_base_module_job_connect_fn_t)(const char jobId[]);

/** PMI2_Job_Disconnect - takes a job id string and returns an int status */
typedef int (*opal_pmix_base_module_job_disconnect_fn_t)(const char jobId[]);
/**
 * Register an errhandler to report loss of connection to the server.
 *
 * errhandler - the handler function to register
 */
typedef void (*opal_pmix_base_module_register_fn_t)(opal_pmix_errhandler_fn_t errhandler);

/** Deregister the errhandler */
typedef void (*opal_pmix_base_module_deregister_fn_t)(void);
/**
 * The standard public API data structure: one function pointer per
 * PMIx operation declared in this header.
 */
typedef struct {
    /* currently used APIs */
    opal_pmix_base_module_init_fn_t             init;
    opal_pmix_base_module_fini_fn_t             finalize;
    opal_pmix_base_module_initialized_fn_t      initialized;
    opal_pmix_base_module_abort_fn_t            abort;
    opal_pmix_base_module_fence_fn_t            fence;
    opal_pmix_base_module_fence_nb_fn_t         fence_nb;
    opal_pmix_base_module_put_fn_t              put;
    opal_pmix_base_module_get_fn_t              get;
    opal_pmix_base_module_get_nb_fn_t           get_nb;
    opal_pmix_base_module_publish_fn_t          publish;
    opal_pmix_base_module_lookup_fn_t           lookup;
    opal_pmix_base_module_unpublish_fn_t        unpublish;
    opal_pmix_base_module_get_attr_fn_t         get_attr;
    opal_pmix_base_module_get_attr_nb_fn_t      get_attr_nb;
    /* currently unused APIs */
    opal_pmix_base_module_spawn_fn_t            spawn;
    opal_pmix_base_module_job_connect_fn_t      job_connect;
    opal_pmix_base_module_job_disconnect_fn_t   job_disconnect;
    /* register the errhandler */
    opal_pmix_base_module_register_fn_t         register_errhandler;
    opal_pmix_base_module_deregister_fn_t       deregister_errhandler;
} opal_pmix_base_module_t;
/**
 * Component structure for the pmix framework: the MCA base component
 * and base component data, followed by an integer priority.
 */
typedef struct {
    mca_base_component_t      base_version;  /* MCA base component */
    mca_base_component_data_t base_data;     /* MCA base component data */
    int                       priority;      /* priority of this component */
} opal_pmix_base_component_t;
/*
 * Macro for use in components that are of type pmix. It expands to the
 * MCA base version initializer followed by the framework name "pmix"
 * and the three integers 2, 0, 0.
 */
#define OPAL_PMIX_BASE_VERSION_2_0_0                                     \
    MCA_BASE_VERSION_2_0_0,                                              \
    "pmix",                                                              \
    2, 0, 0
/* Global module structure: the function-pointer table
 * (opal_pmix_base_module_t) through which the modex and fence macros in
 * this header call put, get, fence and fence_nb */
OPAL_DECLSPEC extern opal_pmix_base_module_t opal_pmix; /* holds base function pointers */
/* flag to indicate collective vs direct fence operations: the
 * OPAL_MODEX_SEND* macros set it to true when PMIX_SYNC_REQD is passed,
 * and OPAL_FENCE calls the blocking fence whenever it is true */
OPAL_DECLSPEC extern bool opal_pmix_use_collective;
END_C_DECLS
#endif