/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #ifndef OPAL_PMIX_H #define OPAL_PMIX_H #include "opal_config.h" #include "opal/types.h" #include "opal/mca/mca.h" #include "opal/mca/event/event.h" #include "opal/dss/dss.h" #include "opal/runtime/opal.h" #include "opal/mca/dstore/dstore.h" #include "opal/util/error.h" #include "opal/util/proc.h" BEGIN_C_DECLS /* define some maximum sizes */ #define PMIX_MAX_VALLEN 1024 #define PMIX_MAX_INFO_KEY 255 #define PMIX_MAX_INFO_VAL 1024 /* define an INFO object corresponding to * the MPI_Info structure */ typedef struct { opal_list_item_t super; char key[PMIX_MAX_INFO_KEY]; char value[PMIX_MAX_INFO_VAL]; } pmix_info_t; OBJ_CLASS_DECLARATION(pmix_info_t); /* define a scope for data "put" by PMI per the following: * * PMI_LOCAL - the data is intended only for other application * processes on the same node. Data marked in this way * will not be included in data packages sent to remote requestors * PMI_REMOTE - the data is intended solely for applications processes on * remote nodes. Data marked in this way will not be shared with * other processes on the same node * PMI_GLOBAL - the data is to be shared with all other requesting processes, * regardless of location */ typedef uint8_t opal_pmix_scope_t; #define PMIX_SCOPE_T OPAL_UINT8 #define PMIX_SCOPE_UNDEF 0 #define PMIX_INTERNAL 1 // data used internally only #define PMIX_LOCAL 2 // share to procs also on this node #define PMIX_REMOTE 3 // share with procs not on this node #define PMIX_GLOBAL 4 // share with all procs (local + remote) /* callback function for non-blocking operations */ typedef void (*opal_pmix_cbfunc_t)(int status, opal_value_t *kv, void *cbdata); /* flags to indicate if the modex value being pushed into * the PMIx server comes from an element that is ready to * support async modex operations, or from one that requires * synchronous modex (i.e., blocking modex operation) */ #define PMIX_SYNC_REQD true #define PMIX_ASYNC_RDY false /* define a set of "standard" PMIx attributes that can * be queried. Implementations (and users) are free to extend as * desired, so the get_attr functions need to be capable * of handling the "not found" condition. Note that these * are attributes of the system and the job as opposed to * values the application (or underlying MPI library) * might choose to expose - i.e., they are values provided * by the resource manager as opposed to the application */ #define PMIX_ATTR_UNDEF NULL #define PMIX_CPUSET "pmix.cpuset" // (char*) hwloc bitmap applied to proc upon launch #define PMIX_CREDENTIAL "pmix.cred" // (opal_byte_object*) security credential assigned to proc #define PMIX_HOSTNAME "pmix.hname" // (char*) name of the host this proc is on /* scratch directory locations for use by applications */ #define PMIX_TMPDIR "pmix.tmpdir" // (char*) top-level tmp dir assigned to session /* information about relative ranks as assigned */ #define PMIX_JOBID "pmix.jobid" // (char*) jobid assigned by scheduler #define PMIX_APPNUM "pmix.appnum" // (uint32_t) app number within the job #define PMIX_RANK "pmix.rank" // (uint32_t) process rank within the job #define PMIX_GLOBAL_RANK "pmix.grank" // (uint32_t) rank spanning across all jobs in this session #define PMIX_APP_RANK "pmix.apprank" // (uint32_t) rank within this app #define PMIX_NPROC_OFFSET "pmix.offset" // (uint32_t) starting global rank of this job #define PMIX_LOCAL_RANK "pmix.lrank" // (uint16_t) rank on this node within this job #define PMIX_NODE_RANK "pmix.nrank" // (uint16_t) rank on this node spanning all jobs #define PMIX_LOCALLDR "pmix.lldr" // (uint64_t) opal_identifier of lowest rank on this node within this job #define PMIX_APPLDR "pmix.aldr" // (uint32_t) lowest rank in this app within this job #define PMIX_NODE_ID "pmix.nodeid" // (uint32_t) vpid of daemon hosting specified proc /* proc location-related info */ #define PMIX_PROC_MAP "pmix.map" // (byte_object) packed map of proc locations within this job #define PMIX_LOCAL_PEERS "pmix.lpeers" // (char*) comma-delimited string of ranks on this node within this job #define PMIX_LOCAL_CPUSETS "pmix.lcpus" // (byte_object) packed names and cpusets of local peers /* size info */ #define PMIX_UNIV_SIZE "pmix.univ.size" // (uint32_t) #procs in this namespace #define PMIX_JOB_SIZE "pmix.job.size" // (uint32_t) #procs in this job #define PMIX_LOCAL_SIZE "pmix.local.size" // (uint32_t) #procs in this job on this node #define PMIX_NODE_SIZE "pmix.node.size" // (uint32_t) #procs across all jobs on this node #define PMIX_MAX_PROCS "pmix.max.size" // (uint32_t) max #procs for this job /* topology info */ #define PMIX_NET_TOPO "pmix.ntopo" // (byte_object) network topology #define PMIX_LOCAL_TOPO "pmix.ltopo" // (hwloc topo) local node topology /** * Provide a simplified macro for sending data via modex * to other processes. The macro requires four arguments: * * r - the integer return status from the modex op * f - whether this modex requires sync or is async ready * sc - the PMIX scope of the data * s - the key to tag the data being posted * d - pointer to the data object being posted * t - the type of the data */ #define OPAL_MODEX_SEND_VALUE(r, f, sc, s, d, t) \ do { \ opal_value_t kv; \ if (PMIX_SYNC_REQD == (f)) { \ opal_pmix_use_collective = true; \ } \ OBJ_CONSTRUCT(&kv, opal_value_t); \ kv.key = (s); \ if (OPAL_SUCCESS != ((r) = opal_value_load(&kv, (d), (t)))) { \ OPAL_ERROR_LOG((r)); \ } else { \ if (OPAL_SUCCESS != ((r) = opal_pmix.put(sc, &kv))) { \ OPAL_ERROR_LOG((r)); \ } \ } \ /* do not destruct the keyval as we don't own */ \ /* the data - the caller will take care of the */ \ /* key and value storage, and the kv itself has none */ \ } while(0); /** * Provide a simplified macro for sending data via modex * to other processes. The macro requires four arguments: * * r - the integer return status from the modex op * f - whether this modex requires sync or is async ready * sc - the PMIX scope of the data * s - the key to tag the data being posted * d - the data object being posted * sz - the number of bytes in the data object */ #define OPAL_MODEX_SEND_STRING(r, f, sc, s, d, sz) \ do { \ opal_value_t kv; \ if (PMIX_SYNC_REQD == (f)) { \ opal_pmix_use_collective = true; \ } \ OBJ_CONSTRUCT(&kv, opal_value_t); \ kv.key = (s); \ kv.type = OPAL_BYTE_OBJECT; \ kv.data.bo.bytes = (uint8_t*)(d); \ kv.data.bo.size = (sz); \ if (OPAL_SUCCESS != ((r) = opal_pmix.put(sc, &kv))) { \ OPAL_ERROR_LOG((r)); \ } \ kv.data.bo.bytes = NULL; /* protect the data */ \ kv.key = NULL; /* protect the key */ \ OBJ_DESTRUCT(&kv); \ } while(0); /** * Provide a simplified macro for sending data via modex * to other processes. The macro requires four arguments: * * r - the integer return status from the modex op * f - whether this modex requires sync or is async ready * sc - the PMIX scope of the data * s - the MCA component that is posting the data * d - the data object being posted * sz - the number of bytes in the data object */ #define OPAL_MODEX_SEND(r, f, sc, s, d, sz) \ do { \ char *key; \ if (PMIX_SYNC_REQD == (f)) { \ opal_pmix_use_collective = true; \ } \ key = mca_base_component_to_string((s)); \ OPAL_MODEX_SEND_STRING((r), (f), (sc), key, (d), (sz)); \ free(key); \ } while(0); /** * Provide a simplified macro for retrieving modex data * from another process: * * r - the integer return status from the modex op (int) * s - string key (char*) * p - pointer to the opal_proc_t of the proc that posted * the data (opal_proc_t*) * d - pointer to a location wherein the data object * is to be returned * t - the expected data type */ #define OPAL_MODEX_RECV_VALUE(r, s, p, d, t) \ do { \ opal_value_t *kv; \ if (OPAL_SUCCESS != ((r) = opal_pmix.get(&(p)->proc_name, \ (s), &kv))) { \ *(d) = NULL; \ } else { \ (r) = opal_value_unload(kv, (void**)(d), (t)); \ OBJ_RELEASE(kv); \ } \ } while(0); /** * Provide a simplified macro for retrieving modex data * from another process: * * r - the integer return status from the modex op (int) * s - string key (char*) * p - pointer to the opal_proc_t of the proc that posted * the data (opal_proc_t*) * d - pointer to a location wherein the data object * it to be returned (char**) * sz - pointer to a location wherein the number of bytes * in the data object can be returned (size_t) */ #define OPAL_MODEX_RECV_STRING(r, s, p, d, sz) \ do { \ opal_value_t *kv; \ if (OPAL_SUCCESS == ((r) = opal_pmix.get(&(p)->proc_name, \ (s), &kv)) && \ NULL != kv) { \ *(d) = kv->data.bo.bytes; \ *(sz) = kv->data.bo.size; \ kv->data.bo.bytes = NULL; /* protect the data */ \ OBJ_RELEASE(kv); \ } else { \ *(d) = NULL; \ *(sz) = 0; \ } \ } while(0); /** * Provide a simplified macro for retrieving modex data * from another process: * * r - the integer return status from the modex op (int) * s - the MCA component that posted the data (mca_base_component_t*) * p - pointer to the opal_proc_t of the proc that posted * the data (opal_proc_t*) * d - pointer to a location wherein the data object * it to be returned (char**) * sz - pointer to a location wherein the number of bytes * in the data object can be returned (size_t) */ #define OPAL_MODEX_RECV(r, s, p, d, sz) \ do { \ char *key; \ key = mca_base_component_to_string((s)); \ if (NULL == key) { \ OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE); \ (r) = OPAL_ERR_OUT_OF_RESOURCE; \ } else { \ OPAL_MODEX_RECV_STRING((r), key, (p), (d), (sz)); \ free(key); \ } \ } while(0); /** * Provide a simplified macro for calling the fence function * that takes into account directives and availability of * non-blocking operations */ #define OPAL_FENCE(p, s, cf, cd) \ opal_pmix.fence((p), (s)); /**** DEFINE THE PUBLIC API'S **** **** NOTE THAT WE DO NOT HAVE A 1:1 MAPPING OF APIs **** **** HERE TO THOSE CURRENTLY DEFINED BY PMI AS WE **** **** DON'T USE SOME OF THOSE FUNCTIONS AND THIS ISN'T **** **** A GENERAL LIBRARY ****/ /***** APIs CURRENTLY USED IN THE OMPI/ORTE CODE BASE ****/ /* NOTE: calls to these APIs must be thread-protected as there * currently is NO internal thread safety. */ /* Init */ typedef int (*opal_pmix_base_module_init_fn_t)(void); /* Finalize */ typedef int (*opal_pmix_base_module_fini_fn_t)(void); /* Initialized */ typedef bool (*opal_pmix_base_module_initialized_fn_t)(void); /* Abort */ typedef int (*opal_pmix_base_module_abort_fn_t)(int flag, const char msg[]); /* Fence - note that this call is required to commit any * data "put" to the system since the last call to "fence" * prior to (or as part of) executing the barrier. Serves both PMI2 * and PMI1 "barrier" purposes */ typedef int (*opal_pmix_base_module_fence_fn_t)(opal_process_name_t *procs, size_t nprocs); /* Fence_nb - not included in the current PMI standard. This is a non-blocking * version of the standard "fence" call. All subsequent "get" calls will block * pending completion of this operation. Non-blocking "get" calls will still * complete as data becomes available */ typedef int (*opal_pmix_base_module_fence_nb_fn_t)(opal_process_name_t *procs, size_t nprocs, opal_pmix_cbfunc_t cbfunc, void *cbdata); /* Put - note that this API has been modified from the current PMI standard to * reflect the proposed PMIx extensions. */ typedef int (*opal_pmix_base_module_put_fn_t)(opal_pmix_scope_t scope, opal_value_t *kv); /* Get - note that this API has been modified from the current PMI standard to * reflect the proposed PMIx extensions, and to include the process identifier so * we can form the PMI key within the active component instead of sprinkling that * code all over the code base. */ typedef int (*opal_pmix_base_module_get_fn_t)(const opal_process_name_t *id, const char *key, opal_value_t **kv); /* Get_nb - not included in the current PMI standard. This is a non-blocking * version of the standard "get" call. Retrieved value will be provided as * opal_value_t object in the callback. We include the process identifier so * we can form the PMI key within the active component instead of sprinkling that * code all over the code base. */ typedef void (*opal_pmix_base_module_get_nb_fn_t)(const opal_process_name_t *id, const char *key, opal_pmix_cbfunc_t cbfunc, void *cbdata); /* Publish - the "info" parameter * consists of a list of pmix_info_t objects */ typedef int (*opal_pmix_base_module_publish_fn_t)(const char service_name[], opal_list_t *info, const char port[]); /* Lookup - the "info" parameter * consists of a list of pmix_info_t objects */ typedef int (*opal_pmix_base_module_lookup_fn_t)(const char service_name[], opal_list_t *info, char port[], int portLen); /* Unpublish - the "info" parameter * consists of a list of pmix_info_t objects */ typedef int (*opal_pmix_base_module_unpublish_fn_t)(const char service_name[], opal_list_t *info); /* Get attribute * Query the server for the specified attribute, returning it in the * provided opal_value_t. The function will return "true" if the attribute * is found, and "false" if not. * Attributes are provided by the PMIx server, so there is no corresponding * "put" function. */ typedef bool (*opal_pmix_base_module_get_attr_fn_t)(const char *attr, opal_value_t **kv); /* Get attribute (non-blocking) * Query the server for the specified attribute.. * Attributes are provided by the PMIx server, so there is no corresponding "put" * function. The call will be executed as non-blocking, returning immediately, * with data resulting from the call returned in the callback function. A returned * NULL opal_value_t* indicates that the attribute was not found. The returned * pointer is "owned" by the PMIx module and must not be released by the * callback function */ typedef int (*opal_pmix_base_module_get_attr_nb_fn_t)(const char *attr, opal_pmix_cbfunc_t cbfunc, void *cbdata); /**** APIs NOT CURRENTLY USED IN THE OMPI/ORTE CODE BASE, BUT THAT **** **** MAY BE IMPLEMENTED IN THE NEAR FUTURE. COMPONENTS ARE FREE TO **** **** JUST HAVE THEM RETURN "OPAL_ERR_NOT_IMPLEMENTED" ****/ /* PMI2_Job_Spawn */ typedef int (*opal_pmix_base_module_spawn_fn_t)(int count, const char * cmds[], int argcs[], const char ** argvs[], const int maxprocs[], opal_list_t *info_keyval_vector, opal_list_t *preput_keyval_vector, char jobId[], int jobIdSize, int errors[]); /* PMI2_Job_Connect */ typedef int (*opal_pmix_base_module_job_connect_fn_t)(const char jobId[]); /* PMI2_Job_Disconnect */ typedef int (*opal_pmix_base_module_job_disconnect_fn_t)(const char jobId[]); /* * the standard public API data structure */ typedef struct { /* currently used APIs */ opal_pmix_base_module_init_fn_t init; opal_pmix_base_module_fini_fn_t finalize; opal_pmix_base_module_initialized_fn_t initialized; opal_pmix_base_module_abort_fn_t abort; opal_pmix_base_module_fence_fn_t fence; opal_pmix_base_module_fence_nb_fn_t fence_nb; opal_pmix_base_module_put_fn_t put; opal_pmix_base_module_get_fn_t get; opal_pmix_base_module_get_nb_fn_t get_nb; opal_pmix_base_module_publish_fn_t publish; opal_pmix_base_module_lookup_fn_t lookup; opal_pmix_base_module_unpublish_fn_t unpublish; opal_pmix_base_module_get_attr_fn_t get_attr; opal_pmix_base_module_get_attr_nb_fn_t get_attr_nb; /* currently unused APIs */ opal_pmix_base_module_spawn_fn_t spawn; opal_pmix_base_module_job_connect_fn_t job_connect; opal_pmix_base_module_job_disconnect_fn_t job_disconnect; } opal_pmix_base_module_t; typedef struct { mca_base_component_t base_version; mca_base_component_data_t base_data; int priority; } opal_pmix_base_component_t; /* * Macro for use in components that are of type pmix */ #define OPAL_PMIX_BASE_VERSION_2_0_0 \ OPAL_MCA_BASE_VERSION_2_1_0("pmix", 2, 0, 0) /* Global structure for accessing store functions */ OPAL_DECLSPEC extern opal_pmix_base_module_t opal_pmix; /* holds base function pointers */ /* flag to indicate collective vs direct fence operations */ OPAL_DECLSPEC extern bool opal_pmix_use_collective; END_C_DECLS #endif