Per the RFC issued here:
http://www.open-mpi.org/community/lists/devel/2014/05/14827.php
Refactor PMI support.
This commit was SVN r31907.
Этот коммит содержится в:
родитель
a1485569b9
Коммит
1107f9099e
@ -12,10 +12,6 @@
|
||||
#include "ompi_config.h"
|
||||
#include "ompi/constants.h"
|
||||
|
||||
#include <pmi.h>
|
||||
#if WANT_PMI2_SUPPORT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
|
||||
@ -37,40 +33,21 @@ static int init(void)
|
||||
*/
|
||||
static int publish ( const char *service_name, ompi_info_t *info, const char *port_name )
|
||||
{
|
||||
int rc;
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
if (PMI_SUCCESS != (rc = PMI2_Nameserv_publish(service_name, NULL, port_name))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI2_Nameserv_publish");
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
#else
|
||||
if (PMI_SUCCESS != (rc = PMI_Publish_name(service_name, port_name))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Publish_name");
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
#endif
|
||||
return OMPI_SUCCESS;
|
||||
return mca_common_pmi_publish(service_name,port_name);
|
||||
}
|
||||
|
||||
static char* lookup ( const char *service_name, ompi_info_t *info )
|
||||
{
|
||||
char *port=NULL;
|
||||
int rc;
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
port = (char*)malloc(1024*sizeof(char)); /* arbitrary size */
|
||||
if (PMI_SUCCESS != (rc = PMI2_Nameserv_lookup(service_name, NULL, port, 1024))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI2_Nameserv_lookup");
|
||||
free(port);
|
||||
return NULL;
|
||||
int rc = mca_common_pmi_lookup(service_name, &port);
|
||||
/* in error case port will be set to NULL
|
||||
* this is what our callers expect to see
|
||||
* In future maybe som error handling need?
|
||||
*/
|
||||
if( rc != OPAL_SUCCESS ){
|
||||
// improove error processing
|
||||
return port; // NULL ?
|
||||
}
|
||||
#else
|
||||
if (PMI_SUCCESS != (rc = PMI_Lookup_name(service_name, port))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Lookup_name");
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
return port;
|
||||
}
|
||||
|
||||
@ -78,20 +55,7 @@ static char* lookup ( const char *service_name, ompi_info_t *info )
|
||||
* delete the entry */
|
||||
static int unpublish ( const char *service_name, ompi_info_t *info )
|
||||
{
|
||||
int rc;
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
if (PMI_SUCCESS != (rc = PMI2_Nameserv_unpublish(service_name, NULL))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI2_Nameserv_unpublish");
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
#else
|
||||
if (PMI_SUCCESS != (rc = PMI_Unpublish_name(service_name))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI2_Nameserv_unpublish");
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
#endif
|
||||
return OMPI_SUCCESS;;
|
||||
return mca_common_pmi_unpublish( service_name );
|
||||
}
|
||||
|
||||
|
||||
|
@ -12,6 +12,7 @@
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "opal/runtime/opal_params.h"
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
|
||||
#include "ompi/constants.h"
|
||||
@ -73,7 +74,9 @@ static int pubsub_pmi_component_query(mca_base_module_t **module, int *priority)
|
||||
/* if we are indirectly launched via orted, the
|
||||
* selection will have been turned "off" for us
|
||||
*/
|
||||
if (mca_common_pmi_init ()) {
|
||||
int rc = mca_common_pmi_init (opal_pmi_version);
|
||||
|
||||
if ( OPAL_SUCCESS == rc ) {
|
||||
*priority = my_priority;
|
||||
*module = (mca_base_module_t *)&ompi_pubsub_pmi_module;
|
||||
return OMPI_SUCCESS;
|
||||
|
@ -12,10 +12,7 @@
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <pmi.h>
|
||||
#if WANT_PMI2_SUPPORT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
|
||||
#include "opal/threads/tsd.h"
|
||||
#include "ompi/constants.h"
|
||||
@ -35,30 +32,19 @@ coll_construct(ompi_rte_collective_t *coll)
|
||||
OBJ_CLASS_INSTANCE(ompi_rte_collective_t, opal_object_t, coll_construct, NULL);
|
||||
|
||||
|
||||
int
|
||||
ompi_rte_modex(ompi_rte_collective_t *coll)
|
||||
int ompi_rte_modex(ompi_rte_collective_t *coll)
|
||||
{
|
||||
int len, ret;
|
||||
char *kvs;
|
||||
|
||||
ret = PMI_KVS_Get_name_length_max(&len);
|
||||
if (PMI_SUCCESS != ret) return OMPI_ERROR;
|
||||
|
||||
len = mca_common_pmi_kvslen();
|
||||
kvs = malloc(len);
|
||||
if (NULL == kvs) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
ret = PMI_KVS_Get_my_name(kvs, len);
|
||||
if (PMI_SUCCESS != ret) return OMPI_ERROR;
|
||||
|
||||
ret = PMI_KVS_Commit(kvs);
|
||||
if (PMI_SUCCESS != ret) return OMPI_ERROR;
|
||||
|
||||
ret = PMI_Barrier();
|
||||
if (PMI_SUCCESS != ret) return OMPI_ERROR;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
mca_common_pmi_kvsname(kvs, len);
|
||||
return mca_common_pmi_commit(kvs);
|
||||
}
|
||||
|
||||
|
||||
@ -67,10 +53,10 @@ ompi_rte_barrier(ompi_rte_collective_t *coll)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = PMI_Barrier();
|
||||
if (PMI_SUCCESS != ret) return OMPI_ERROR;
|
||||
ret = mca_common_pmi_barrier();
|
||||
if (OPAL_SUCCESS != ret)
|
||||
return OMPI_ERROR;
|
||||
|
||||
coll->active = false;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -14,10 +14,8 @@
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <pmi.h>
|
||||
#if WANT_PMI2_SUPPORT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
#include "opal/runtime/opal_params.h"
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
@ -70,30 +68,14 @@ ompi_rte_init(int *argc, char ***argv)
|
||||
char *node_info;
|
||||
hwloc_obj_t root;
|
||||
hwloc_cpuset_t boundset, rootset;
|
||||
char *tmp_str;
|
||||
char *tmp_str, *error;
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
{
|
||||
int spawned, appnum;
|
||||
|
||||
if (PMI2_Initialized ()) return OMPI_SUCCESS;
|
||||
if (PMI_SUCCESS != PMI2_Init(&spawned, &size, &rank, &appnum)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
// Initialize PMI
|
||||
int rc = mca_common_pmi_init (opal_pmi_version);
|
||||
|
||||
if ( OPAL_SUCCESS != rc ) {
|
||||
return rc;
|
||||
}
|
||||
#else
|
||||
{
|
||||
PMI_BOOL initialized;
|
||||
|
||||
if (PMI_SUCCESS != PMI_Initialized(&initialized)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
if (PMI_TRUE != initialized && PMI_SUCCESS != PMI_Init(&initialized)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* be kind, set line buffering */
|
||||
setvbuf(stdout, NULL, _IONBF, 0);
|
||||
@ -103,19 +85,27 @@ ompi_rte_init(int *argc, char ***argv)
|
||||
return ret;
|
||||
}
|
||||
|
||||
PMI_Get_appnum(&tmp);
|
||||
// Setup job name
|
||||
tmp = mca_common_pmi_appnum();
|
||||
ompi_rte_my_process_name.jobid = tmp;
|
||||
PMI_Get_rank(&rank);
|
||||
ompi_rte_my_process_name.vpid = rank;
|
||||
|
||||
ompi_process_info.app_num = ompi_rte_my_process_name.jobid;
|
||||
ompi_process_info.pid = getpid();
|
||||
PMI_Get_size(&size);
|
||||
|
||||
// Setup rank information
|
||||
rank = mca_common_pmi_rank();
|
||||
ompi_rte_my_process_name.vpid = rank;
|
||||
|
||||
// Setup process groups size
|
||||
size = mca_common_pmi_size();
|
||||
ompi_process_info.num_procs = size;
|
||||
PMI_Get_clique_size(&tmp);
|
||||
node_ranks = malloc(tmp * sizeof(int));
|
||||
if (NULL == node_ranks) return OMPI_ERROR;
|
||||
PMI_Get_clique_ranks(node_ranks, tmp);
|
||||
|
||||
|
||||
rc = mca_common_pmi_local_info(rank, &node_ranks, &tmp, &error);
|
||||
if( OPAL_SUCCESS != rc ){
|
||||
// FIX ME: maybe we somehow should use error message to
|
||||
// help user understand the reason of failure?
|
||||
return rc;
|
||||
}
|
||||
ompi_process_info.num_local_peers = tmp;
|
||||
for (i = 0 ; i < ompi_process_info.num_local_peers ; ++i) {
|
||||
if (rank == node_ranks[i]) {
|
||||
@ -164,8 +154,7 @@ ompi_rte_init(int *argc, char ***argv)
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
|
||||
/* Fill in things the attributes want to know... */
|
||||
ret = PMI_Get_universe_size(&tmp);
|
||||
if (OMPI_SUCCESS != ret) return OMPI_ERROR;
|
||||
tmp = mca_common_pmi_universe();
|
||||
asprintf(&tmp_str, "%d", tmp);
|
||||
setenv("OMPI_UNIVERSE_SIZE", tmp_str, 1);
|
||||
free(tmp_str);
|
||||
@ -195,6 +184,7 @@ ompi_rte_finalize(void)
|
||||
{
|
||||
ompi_rte_pmi_db_fini();
|
||||
ompi_rte_pmi_name_fini();
|
||||
mca_common_pmi_finalize();
|
||||
opal_finalize();
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -12,10 +12,7 @@
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <pmi.h>
|
||||
#if WANT_PMI2_SUPPORT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
@ -66,22 +63,12 @@ OBJ_CLASS_INSTANCE(local_data_t,
|
||||
*/
|
||||
static int kvs_put(const char *key, const char *value)
|
||||
{
|
||||
#if WANT_PMI2_SUPPORT
|
||||
return PMI2_KVS_Put(key, value);
|
||||
#else
|
||||
return PMI_KVS_Put(pmi_kvs_name, key, value);
|
||||
#endif
|
||||
return mca_common_pmi_put(pmi_kvs_name, key, value);
|
||||
}
|
||||
|
||||
static int kvs_get(const char *key, char *value, int valuelen)
|
||||
{
|
||||
#if WANT_PMI2_SUPPORT
|
||||
int len;
|
||||
|
||||
return PMI2_KVS_Get(pmi_kvs_name, PMI2_ID_NULL, key, value, valuelen, &len);
|
||||
#else
|
||||
return PMI_KVS_Get(pmi_kvs_name, key, value, valuelen);
|
||||
#endif
|
||||
return mca_common_pmi_get(pmi_kvs_name, key, value, valuelen);
|
||||
}
|
||||
|
||||
|
||||
@ -89,45 +76,19 @@ static int setup_pmi(void)
|
||||
{
|
||||
int max_length, rc;
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
pmi_vallen_max = PMI2_MAX_VALLEN;
|
||||
#else
|
||||
rc = PMI_KVS_Get_value_length_max(&pmi_vallen_max);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
#endif
|
||||
pmi_vallen_max = mca_common_pmi_vallen();
|
||||
max_length = mca_common_pmi_kvslen();
|
||||
pmi_keylen_max = mca_common_pmi_keylen();
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
/* TODO -- is this ok */
|
||||
max_length = 1024;
|
||||
#else
|
||||
if (PMI_SUCCESS != (rc = PMI_KVS_Get_name_length_max(&max_length))) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
#endif
|
||||
pmi_kvs_name = (char*)malloc(max_length);
|
||||
if (NULL == pmi_kvs_name) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
rc = PMI2_Job_GetId(pmi_kvs_name, max_length);
|
||||
#else
|
||||
rc = PMI_KVS_Get_my_name(pmi_kvs_name,max_length);
|
||||
#endif
|
||||
if (PMI_SUCCESS != rc) {
|
||||
return OMPI_ERROR;
|
||||
rc = mca_common_pmi_kvsname(pmi_kvs_name, max_length);
|
||||
if( OPAL_SUCCESS != rc ){
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
pmi_keylen_max = PMI2_MAX_KEYLEN;
|
||||
#else
|
||||
if (PMI_SUCCESS != (rc = PMI_KVS_Get_key_length_max(&pmi_keylen_max))) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
#endif
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -259,7 +220,7 @@ static char* fetch_string(const char *key)
|
||||
tmp_val = (char*)malloc(pmi_vallen_max * sizeof(char));
|
||||
|
||||
/* the first section of the string has the original key, so fetch it */
|
||||
if (PMI_SUCCESS != kvs_get(key, tmp_val, pmi_vallen_max)) {
|
||||
if (OPAL_SUCCESS != kvs_get(key, tmp_val, pmi_vallen_max)) {
|
||||
OMPI_ERROR_LOG(OMPI_ERR_NOT_FOUND);
|
||||
free(tmp_val);
|
||||
return NULL;
|
||||
@ -285,7 +246,7 @@ static char* fetch_string(const char *key)
|
||||
/* create the key */
|
||||
asprintf(&tmpkey, "%s:%d", key, i);
|
||||
/* fetch it */
|
||||
if (PMI_SUCCESS != kvs_get(tmpkey, tmp_val, pmi_vallen_max)) {
|
||||
if (OPAL_SUCCESS != kvs_get(tmpkey, tmp_val, pmi_vallen_max)) {
|
||||
OMPI_ERROR_LOG(OMPI_ERR_NOT_FOUND);
|
||||
free(tmp_val);
|
||||
free(tmpkey);
|
||||
@ -441,11 +402,11 @@ ompi_rte_db_store(const ompi_process_name_t *proc,
|
||||
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
|
||||
pmikey, pmidata);
|
||||
|
||||
if (PMI_SUCCESS != (rc = kvs_put(pmikey, pmidata))) {
|
||||
if ( OPAL_SUCCESS != (rc = kvs_put(pmikey, pmidata))) {
|
||||
free(pmidata);
|
||||
free(pmikey);
|
||||
opal_argv_free(strdata);
|
||||
return OMPI_ERROR;
|
||||
return rc;
|
||||
}
|
||||
free(pmidata);
|
||||
/* for each remaining segment, augment the key with the index */
|
||||
@ -456,10 +417,10 @@ ompi_rte_db_store(const ompi_process_name_t *proc,
|
||||
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
|
||||
pmikey, strdata[i]);
|
||||
|
||||
if (PMI_SUCCESS != (rc = kvs_put(tmpkey, strdata[i]))) {
|
||||
if (OPAL_SUCCESS != (rc = kvs_put(tmpkey, strdata[i]))) {
|
||||
free(pmikey);
|
||||
opal_argv_free(strdata);
|
||||
return OMPI_ERROR;
|
||||
return rc;
|
||||
}
|
||||
free(tmpkey);
|
||||
}
|
||||
@ -518,8 +479,8 @@ ompi_rte_db_store(const ompi_process_name_t *proc,
|
||||
pmikey, pmidata);
|
||||
|
||||
rc = kvs_put(pmikey, pmidata);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
return OMPI_ERROR;
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
return rc;
|
||||
}
|
||||
free(pmidata);
|
||||
free(pmikey);
|
||||
@ -629,7 +590,7 @@ ompi_rte_db_fetch(const struct ompi_proc_t *pptr,
|
||||
OMPI_ERROR_LOG(OMPI_ERR_BAD_PARAM);
|
||||
return OMPI_ERR_BAD_PARAM;
|
||||
}
|
||||
if (PMI_SUCCESS != kvs_get(pmikey, tmp_val, pmi_vallen_max)) {
|
||||
if (OPAL_SUCCESS != kvs_get(pmikey, tmp_val, pmi_vallen_max)) {
|
||||
OMPI_ERROR_LOG(OMPI_ERR_NOT_FOUND);
|
||||
free(pmikey);
|
||||
return OMPI_ERR_NOT_FOUND;
|
||||
|
@ -13,10 +13,7 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#include <pmi.h>
|
||||
#if WANT_PMI2_SUPPORT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/output.h"
|
||||
@ -40,14 +37,14 @@ ompi_rte_abort(int error_code, char *fmt, ...)
|
||||
|
||||
va_end(ap);
|
||||
|
||||
PMI_Abort(error_code, msg);
|
||||
mca_common_pmi_abort(error_code, msg);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs, int status)
|
||||
{
|
||||
PMI_Abort(status, "");
|
||||
mca_common_pmi_abort(status, "N/A");
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -12,10 +12,7 @@
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <pmi.h>
|
||||
#if WANT_PMI2_SUPPORT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/threads/tsd.h"
|
||||
|
@ -24,6 +24,8 @@
|
||||
|
||||
AM_CPPFLAGS = $(common_pmi_CPPFLAGS)
|
||||
|
||||
dist_opaldata_DATA = help-common-pmi.txt
|
||||
|
||||
# control whether building an installed library or a convenience
|
||||
# (noinst) library
|
||||
if MCA_BUILD_opal_common_pmi_DSO
|
||||
@ -37,5 +39,9 @@ endif
|
||||
lib_LTLIBRARIES = $(component_install)
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_common_pmi_la_SOURCES = common_pmi.h common_pmi.c
|
||||
if WANT_PMI2_SUPPORT
|
||||
libmca_common_pmi_la_SOURCES += pmi2_pmap_parser.c
|
||||
endif
|
||||
|
||||
libmca_common_pmi_la_LDFLAGS = $(common_pmi_LDFLAGS) -version-info $(libmca_opal_common_pmi_so_version)
|
||||
libmca_common_pmi_la_LIBADD = $(common_pmi_LIBS)
|
||||
|
@ -18,6 +18,9 @@
|
||||
#include "opal/types.h"
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/show_help.h"
|
||||
|
||||
#include "pmi2_pmap_parser.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <pmi.h>
|
||||
@ -27,52 +30,214 @@
|
||||
|
||||
#include "common_pmi.h"
|
||||
|
||||
// usage accounting
|
||||
static int mca_common_pmi_init_count = 0;
|
||||
static int mca_common_pmi_init_size = 0;
|
||||
static int mca_common_pmi_init_rank = 0;
|
||||
|
||||
bool mca_common_pmi_init (void) {
|
||||
if (0 < mca_common_pmi_init_count++) {
|
||||
return true;
|
||||
}
|
||||
// per-launch selection between PMI versions
|
||||
static int mca_common_pmi_version = 0;
|
||||
|
||||
// PMI constant values:
|
||||
static int pmi_kvslen_max = 0;
|
||||
static int pmi_keylen_max = 0;
|
||||
static int pmi_vallen_max = 0;
|
||||
|
||||
// Job environment description
|
||||
static int pmi_size = 0;
|
||||
static int pmi_rank = 0;
|
||||
static int pmi_appnum = 0;
|
||||
static int pmi_usize = 0;
|
||||
static char *pmi_kvs_name = NULL;
|
||||
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
{
|
||||
int spawned, size, rank, appnum;
|
||||
static int mca_initialize_pmi_v2(void)
|
||||
{
|
||||
int spawned, size, rank, appnum;
|
||||
int rc, ret = OPAL_ERROR;
|
||||
|
||||
opal_output(0, "INIT PMI");
|
||||
|
||||
/* if we can't startup PMI, we can't be used */
|
||||
if (PMI2_Initialized ()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (PMI_SUCCESS == PMI2_Init(&spawned, &size, &rank, &appnum)) {
|
||||
mca_common_pmi_init_size = size;
|
||||
mca_common_pmi_init_rank = rank;
|
||||
mca_common_pmi_init_count--;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
/* deal with a Slurm bug by first checking if we were
|
||||
* even launched by a PMI server before attempting
|
||||
* to use PMI */
|
||||
if (NULL == getenv("PMI_FD")) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
#else
|
||||
{
|
||||
PMI_BOOL initialized;
|
||||
|
||||
if (PMI_SUCCESS != PMI_Initialized(&initialized)) {
|
||||
mca_common_pmi_init_count--;
|
||||
return false;
|
||||
/* if we can't startup PMI, we can't be used */
|
||||
if ( PMI2_Initialized () ) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
size = -1;
|
||||
rank = -1;
|
||||
appnum = -1;
|
||||
if (PMI2_SUCCESS != (rc = PMI2_Init(&spawned, &size, &rank, &appnum))) {
|
||||
opal_show_help("help-common-pmi.txt", "pmi2-init-failed", true, rc);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
if( size < 0 || rank < 0 ){
|
||||
opal_output(0, "SIZE %d RANK %d", size, rank);
|
||||
opal_show_help("help-common-pmi.txt", "pmi2-init-returned-bad-values", true);
|
||||
goto err_exit;
|
||||
}
|
||||
|
||||
|
||||
pmi_size = size;
|
||||
pmi_rank = rank;
|
||||
pmi_appnum = appnum;
|
||||
|
||||
pmi_vallen_max = PMI2_MAX_VALLEN;
|
||||
pmi_kvslen_max = PMI2_MAX_VALLEN; // FIX ME: What to put here for versatility?
|
||||
pmi_keylen_max = PMI2_MAX_KEYLEN;
|
||||
|
||||
|
||||
char buf[16];
|
||||
int found;
|
||||
|
||||
rc = PMI2_Info_GetJobAttr("universeSize", buf, 16, &found);
|
||||
if( PMI2_SUCCESS != rc ) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Get_universe_size");
|
||||
goto err_exit;
|
||||
}
|
||||
pmi_usize = atoi(buf);
|
||||
|
||||
pmi_kvs_name = (char*)malloc(pmi_kvslen_max);
|
||||
if( pmi_kvs_name == NULL ){
|
||||
PMI2_Finalize();
|
||||
ret = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
goto err_exit;
|
||||
}
|
||||
rc = PMI2_Job_GetId(pmi_kvs_name, pmi_kvslen_max);
|
||||
if( PMI2_SUCCESS != rc ) {
|
||||
OPAL_PMI_ERROR(rc, "PMI2_Job_GetId");
|
||||
goto err_exit;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
err_exit:
|
||||
PMI2_Finalize();
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int mca_initialize_pmi_v1(void)
|
||||
{
|
||||
PMI_BOOL initialized;
|
||||
int spawned;
|
||||
int rc, ret = OPAL_ERROR;
|
||||
|
||||
/* deal with a Slurm bug by first checking if we were
|
||||
* even launched by a PMI server before attempting
|
||||
* to use PMI */
|
||||
if (NULL == getenv("PMI_FD")) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
if (PMI_SUCCESS != (rc = PMI_Initialized(&initialized))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Initialized");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
if( PMI_TRUE != initialized && PMI_SUCCESS != (rc = PMI_Init(&spawned)) ) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Init");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
// Initialize space demands
|
||||
rc = PMI_KVS_Get_value_length_max(&pmi_vallen_max);
|
||||
if( PMI_SUCCESS != rc ) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_KVS_Get_value_length_max");
|
||||
goto err_exit;
|
||||
}
|
||||
|
||||
rc = PMI_KVS_Get_name_length_max(&pmi_kvslen_max);
|
||||
if (PMI_SUCCESS != rc ) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_KVS_Get_name_length_max");
|
||||
goto err_exit;
|
||||
}
|
||||
|
||||
rc = PMI_KVS_Get_key_length_max(&pmi_keylen_max);
|
||||
if( PMI_SUCCESS != rc ) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_KVS_Get_key_length_max");
|
||||
goto err_exit;
|
||||
}
|
||||
|
||||
// Initialize job environment information
|
||||
rc = PMI_Get_rank(&pmi_rank);
|
||||
if( PMI_SUCCESS != rc ) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Get_rank");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
rc = PMI_Get_universe_size(&pmi_usize);
|
||||
if( PMI_SUCCESS != rc ) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Get_universe_size");
|
||||
goto err_exit;
|
||||
}
|
||||
|
||||
rc = PMI_Get_size(&pmi_size);
|
||||
if( PMI_SUCCESS != rc ) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Get_size");
|
||||
goto err_exit;
|
||||
}
|
||||
|
||||
rc = PMI_Get_appnum(&pmi_appnum);
|
||||
if( PMI_SUCCESS != rc ) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Get_appnum");
|
||||
goto err_exit;
|
||||
}
|
||||
|
||||
pmi_kvs_name = (char*)malloc(pmi_kvslen_max);
|
||||
if( pmi_kvs_name == NULL ){
|
||||
ret = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
goto err_exit;
|
||||
}
|
||||
|
||||
rc = PMI_KVS_Get_my_name(pmi_kvs_name,pmi_kvslen_max);
|
||||
if( PMI_SUCCESS != rc ) {
|
||||
OPAL_PMI_ERROR(rc, "PMI2_Job_GetId");
|
||||
goto err_exit;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
|
||||
err_exit:
|
||||
PMI_Finalize();
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int mca_common_pmi_init (int preferred_version) {
|
||||
int rc = OPAL_SUCCESS;
|
||||
if (0 < mca_common_pmi_init_count++) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
// Decide what version of PMI we want
|
||||
#if WANT_PMI2_SUPPORT
|
||||
{
|
||||
bool auto_select = !(preferred_version >= 1 && preferred_version <= 2);
|
||||
if( auto_select ){
|
||||
// choose PMIv2
|
||||
mca_common_pmi_version = 2;
|
||||
}else{
|
||||
mca_common_pmi_version = preferred_version;
|
||||
}
|
||||
|
||||
if (PMI_TRUE != initialized && PMI_SUCCESS != PMI_Init(&initialized)) {
|
||||
mca_common_pmi_init_count--;
|
||||
return false;
|
||||
if( mca_common_pmi_version == 2 ){
|
||||
rc = mca_initialize_pmi_v2();
|
||||
if( !auto_select || rc == OPAL_SUCCESS ){
|
||||
// If we want exactly PMIv2 or we succeed
|
||||
if( rc != OPAL_SUCCESS ){
|
||||
mca_common_pmi_init_count--;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return true;
|
||||
mca_common_pmi_version = 1;
|
||||
if( OPAL_SUCCESS != (rc = mca_initialize_pmi_v1()) ){
|
||||
mca_common_pmi_init_count--;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
void mca_common_pmi_finalize (void) {
|
||||
@ -82,10 +247,14 @@ void mca_common_pmi_finalize (void) {
|
||||
|
||||
if (0 == --mca_common_pmi_init_count) {
|
||||
#if WANT_PMI2_SUPPORT
|
||||
PMI2_Finalize ();
|
||||
#else
|
||||
PMI_Finalize ();
|
||||
if( mca_common_pmi_version == 2){
|
||||
PMI2_Finalize ();
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
PMI_Finalize ();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -120,33 +289,339 @@ char* opal_errmgr_base_pmi_error(int pmi_err)
|
||||
}
|
||||
|
||||
|
||||
bool mca_common_pmi_rank(int *rank) {
|
||||
|
||||
#if !WANT_PMI2_SUPPORT
|
||||
{
|
||||
int ret;
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_rank(&mca_common_pmi_init_rank))) {
|
||||
OPAL_PMI_ERROR(ret, "PMI_Get_rank");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
*rank = mca_common_pmi_init_rank;
|
||||
return true;
|
||||
int mca_common_pmi_rank()
|
||||
{
|
||||
return pmi_rank;
|
||||
}
|
||||
|
||||
|
||||
bool mca_common_pmi_size(int *size) {
|
||||
int mca_common_pmi_size()
|
||||
{
|
||||
return pmi_size;
|
||||
}
|
||||
|
||||
#if !WANT_PMI2_SUPPORT
|
||||
int mca_common_pmi_appnum()
|
||||
{
|
||||
return pmi_appnum;
|
||||
}
|
||||
|
||||
|
||||
int mca_common_pmi_universe()
|
||||
{
|
||||
return pmi_usize;
|
||||
}
|
||||
|
||||
int mca_common_pmi_kvslen() {
|
||||
return pmi_kvslen_max;
|
||||
}
|
||||
|
||||
int mca_common_pmi_keylen()
|
||||
{
|
||||
return pmi_keylen_max;
|
||||
}
|
||||
|
||||
int mca_common_pmi_vallen()
|
||||
{
|
||||
return pmi_vallen_max;
|
||||
}
|
||||
|
||||
int mca_common_pmi_kvsname(char *buf, int len)
|
||||
{
|
||||
int i;
|
||||
if( (unsigned)len < strnlen(pmi_kvs_name,pmi_kvslen_max) ){
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
for(i = 0; pmi_kvs_name[i]; i++){
|
||||
buf[i] = pmi_kvs_name[i];
|
||||
}
|
||||
buf[i] = '\0';
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_common_pmi_id(char **pmi_id_ret, char **error){
|
||||
char *pmi_id = NULL;
|
||||
int rc;
|
||||
|
||||
// Default values
|
||||
*pmi_id_ret = pmi_id;
|
||||
*error = NULL;
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
if( mca_common_pmi_version == 2 ){
|
||||
// TODO: add proper error handling
|
||||
pmi_id = (char*)malloc(PMI2_MAX_VALLEN);
|
||||
if( pmi_id == NULL ){
|
||||
*error = "mca_common_pmi_id: could not get memory for PMIv2 ID";
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
strncpy(pmi_id, pmi_kvs_name, pmi_kvslen_max);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
int ret;
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&mca_common_pmi_init_size))) {
|
||||
OPAL_PMI_ERROR(ret, "PMI_Get_universe_size");
|
||||
return false;
|
||||
int pmi_maxlen;
|
||||
/* get our PMI id length */
|
||||
if (PMI_SUCCESS != (rc = PMI_Get_id_length_max(&pmi_maxlen))) {
|
||||
*error = "PMI_Get_id_length_max";
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
// TODO: add proper error handling
|
||||
pmi_id = (char*)malloc(pmi_maxlen);
|
||||
if( pmi_id == NULL ){
|
||||
*error = "mca_common_pmi_id: could not get memory for PMIv1 ID";
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
/* Get domain id */
|
||||
if (PMI_SUCCESS != (rc = PMI_Get_kvs_domain_id(pmi_id, pmi_maxlen))) {
|
||||
free(pmi_id);
|
||||
*error = "PMI_Get_kvs_domain_id";
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
*size = mca_common_pmi_init_size;
|
||||
return true;
|
||||
|
||||
*pmi_id_ret = pmi_id;
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_common_pmi_local_info(int vpid, int **ranks_ret,
|
||||
int *procs_ret, char **error)
|
||||
{
|
||||
int *ranks;
|
||||
int procs = -1;
|
||||
int rc;
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
if(mca_common_pmi_version == 2){
|
||||
|
||||
{
|
||||
char *pmapping = (char*)malloc(PMI2_MAX_VALLEN);
|
||||
if( pmapping == NULL ){
|
||||
*error = "mca_common_pmi_local_info: could not get memory for PMIv2 process mapping";
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
int found;
|
||||
int my_node;
|
||||
|
||||
rc = PMI2_Info_GetJobAttr("PMI_process_mapping", pmapping, PMI2_MAX_VALLEN, &found);
|
||||
if( !found || PMI2_SUCCESS != rc ) {
|
||||
/* can't check PMI2_SUCCESS as some folks (i.e., Cray) don't define it */
|
||||
OPAL_PMI_ERROR(rc,"PMI2_Info_GetJobAttr");
|
||||
*error = "mca_common_pmi_local_info: could not get PMI_process_mapping";
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
ranks = mca_common_pmi2_parse_pmap(pmapping, vpid, &my_node, &procs);
|
||||
if (NULL == ranks) {
|
||||
*error = "mca_common_pmi_local_info: could not get memory for PMIv2 local ranks";
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
free(pmapping);
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
/* get our local proc info to find our local rank */
|
||||
if (PMI_SUCCESS != (rc = PMI_Get_clique_size(&procs))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Get_clique_size");
|
||||
*error = "mca_common_pmi_local_info: could not get PMI clique size";
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
/* now get the specific ranks */
|
||||
ranks = (int*)calloc(procs, sizeof(int));
|
||||
if (NULL == ranks) {
|
||||
*error = "mca_common_pmi_local_info: could not get memory for local ranks";
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
if (PMI_SUCCESS != (rc = PMI_Get_clique_ranks(ranks, procs))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Get_clique_ranks");
|
||||
*error = "mca_common_pmi_local_info: could not get clique ranks";
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
*ranks_ret = ranks;
|
||||
*procs_ret = procs;
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Abort the job through whichever PMI version was selected:
 * PMI2_Abort when mca_common_pmi_version is 2 (and PMI2 support is
 * compiled in), PMI_Abort otherwise.
 */
void mca_common_pmi_abort(int status, char *msg)
{
#if WANT_PMI2_SUPPORT
    if (2 == mca_common_pmi_version) {
        PMI2_Abort(status, msg);
        return;
    }
#endif
    PMI_Abort(status, msg);
}
|
||||
|
||||
int rc;
|
||||
|
||||
int mca_common_pmi_publish(const char *service_name, const char *port_name)
|
||||
{
|
||||
#if WANT_PMI2_SUPPORT
|
||||
if( mca_common_pmi_version == 2){
|
||||
if (PMI2_SUCCESS != (rc = PMI2_Nameserv_publish(service_name, NULL, port_name))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI2_Nameserv_publish");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
if (PMI_SUCCESS != (rc = PMI_Publish_name(service_name, port_name))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Publish_name");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_common_pmi_lookup(const char *service_name, char **port_ret)
|
||||
{
|
||||
// FIXME:
|
||||
// 1. Why don't we malloc memory for the port for PMI v1?
|
||||
// 2. Maybe error handling is needed in pbusub?
|
||||
// 3. Is it legal to call OPAL_PMI_ERROR for PMIv2 rc?
|
||||
|
||||
char *port = NULL;
|
||||
*port_ret = port;
|
||||
int rc;
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
if( mca_common_pmi_version == 2 ){
|
||||
port = (char*)malloc(1024*sizeof(char)); /* arbitrary size */
|
||||
if( port == NULL ){
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
if (PMI_SUCCESS != (rc = PMI2_Nameserv_lookup(service_name, NULL, port, 1024))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI2_Nameserv_lookup");
|
||||
free(port);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
// Allocate mem for port here? Otherwise we won't get success!
|
||||
// SLURM PMIv1 doesn't implement this function
|
||||
|
||||
if (PMI_SUCCESS != (rc = PMI_Lookup_name(service_name, port))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Lookup_name");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
*port_ret = port;
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_common_pmi_unpublish ( const char *service_name )
|
||||
{
|
||||
int rc;
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
if( mca_common_pmi_version == 2 ){
|
||||
if (PMI2_SUCCESS != (rc = PMI2_Nameserv_unpublish(service_name, NULL))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI2_Nameserv_unpublish");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
if (PMI_SUCCESS != (rc = PMI_Unpublish_name(service_name))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI2_Nameserv_unpublish");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
return OPAL_SUCCESS;;
|
||||
}
|
||||
|
||||
int mca_common_pmi_barrier()
|
||||
{
|
||||
#if WANT_PMI2_SUPPORT
|
||||
if( mca_common_pmi_version == 2 ){
|
||||
/* PMI2 doesn't provide a barrier, so use the Fence function here */
|
||||
if (PMI2_SUCCESS != (rc = PMI2_KVS_Fence())) {
|
||||
// FIX ME: OPAL_PMI2_ERROR(rc, "PMI2_KVS_Fence");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
/* use the PMI barrier function */
|
||||
if (PMI_SUCCESS != (rc = PMI_Barrier())) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Barrier");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_common_pmi_put(const char *kvs_name,
|
||||
const char *key, const char *value)
|
||||
{
|
||||
int rc;
|
||||
#if WANT_PMI2_SUPPORT
|
||||
if( mca_common_pmi_version == 2 ){
|
||||
if( PMI2_SUCCESS != PMI2_KVS_Put(key, value) ){
|
||||
// FIXME: OPAL_PMI2_ERROR(rc, "PMI2_KVS_Put");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
rc = PMI_KVS_Put(kvs_name, key, value);
|
||||
if( PMI_SUCCESS != rc ){
|
||||
OPAL_PMI_ERROR(rc, "PMI_KVS_Put");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_common_pmi_get(const char *kvs_name, const char *key,
|
||||
char *value, int valuelen)
|
||||
{
|
||||
int rc;
|
||||
#if WANT_PMI2_SUPPORT
|
||||
if( mca_common_pmi_version == 2 ){
|
||||
int len;
|
||||
rc = PMI2_KVS_Get(kvs_name, PMI2_ID_NULL, key, value, valuelen, &len);
|
||||
if( PMI2_SUCCESS != rc ){
|
||||
// OPAL_PMI2_ERROR(rc, "PMI_KVS_Put");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
rc = PMI_KVS_Get(kvs_name, key, value, valuelen);
|
||||
if( PMI_SUCCESS != rc ){
|
||||
OPAL_PMI_ERROR(rc, "PMI_KVS_Put");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_common_pmi_commit(char *kvs_name)
|
||||
{
|
||||
if( mca_common_pmi_version == 1 ){
|
||||
|
||||
if (PMI_SUCCESS != (rc = PMI_KVS_Commit(kvs_name))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_KVS_Commit");
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
return mca_common_pmi_barrier();
|
||||
}
|
||||
|
||||
|
@ -13,6 +13,14 @@
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef COMMON_PMI_H
|
||||
#define COMMON_PMI_H
|
||||
|
||||
#include <pmi.h>
|
||||
#if WANT_PMI2_SUPPORT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
|
||||
#if !defined(OPAL_MCA_COMMON_PMI)
|
||||
#define OPAL_MCA_COMMON_PMI
|
||||
|
||||
@ -26,7 +34,7 @@
|
||||
* @retval true PMI successfully initialized
|
||||
* @retval false PMI could not be initialized
|
||||
*/
|
||||
bool mca_common_pmi_init (void);
|
||||
int mca_common_pmi_init (int preferred_version);
|
||||
|
||||
/**
|
||||
* mca_common_pmi_finalize:
|
||||
@ -38,13 +46,41 @@ void mca_common_pmi_finalize (void);
|
||||
|
||||
#define OPAL_PMI_ERROR(pmi_err, pmi_func) \
|
||||
do { \
|
||||
opal_output(0, "[%s:%d:%s] %s: %s\n", \
|
||||
__FILE__, __LINE__, __func__, \
|
||||
pmi_func, opal_errmgr_base_pmi_error(pmi_err)); \
|
||||
opal_output(0, "%s [%s:%d:%s]: %s\n", \
|
||||
pmi_func, __FILE__, __LINE__, __func__, \
|
||||
opal_errmgr_base_pmi_error(pmi_err)); \
|
||||
} while(0);
|
||||
|
||||
OPAL_DECLSPEC char* opal_errmgr_base_pmi_error(int pmi_err);
|
||||
|
||||
#endif
|
||||
int mca_common_pmi_rank(void);
|
||||
int mca_common_pmi_size(void);
|
||||
int mca_common_pmi_appnum(void);
|
||||
int mca_common_pmi_universe(void);
|
||||
int mca_common_pmi_kvsname(char *buf, int len);
|
||||
|
||||
bool mca_common_pmi_rank(int *rank);
|
||||
bool mca_common_pmi_size(int *size);
|
||||
int mca_common_pmi_kvslen(void);
|
||||
int mca_common_pmi_keylen(void);
|
||||
int mca_common_pmi_vallen(void);
|
||||
|
||||
int mca_common_pmi_id(char **pmi_id_ret, char **error);
|
||||
int mca_common_pmi_local_info(int vpid, int **ranks_ret,
|
||||
int *procs_ret, char **error);
|
||||
void mca_common_pmi_abort(int status, char *msg);
|
||||
|
||||
// Publish-subscribe operations
|
||||
int mca_common_pmi_publish(const char *service_name, const char *port_name);
|
||||
int mca_common_pmi_lookup(const char *service_name, char **port_ret);
|
||||
int mca_common_pmi_unpublish ( const char *service_name );
|
||||
|
||||
// KVS put/get
|
||||
int mca_common_pmi_put(const char *kvs_name,
|
||||
const char *key, const char *value);
|
||||
|
||||
int mca_common_pmi_get(const char *kvs_name, const char *key,
|
||||
char *value, int valuelen);
|
||||
int mca_common_pmi_commit(char *kvs_name);
|
||||
int mca_common_pmi_barrier(void);
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
24
opal/mca/common/pmi/help-common-pmi.txt
Обычный файл
24
opal/mca/common/pmi/help-common-pmi.txt
Обычный файл
@ -0,0 +1,24 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
[pmi2-init-failed]
|
||||
PMI2 failed to initialize, returning an error code of %d.
|
||||
We cannot use PMI2 at this time, and your job will
|
||||
likely abort.
|
||||
#
|
||||
[pmi2-init-returned-bad-values]
|
||||
PMI2 initialized but returned bad values for size and rank.
|
||||
This is symptomatic of either a failure to use the
|
||||
"--mpi=pmi2" flag in SLURM, or a borked PMI2 installation.
|
||||
If running under SLURM, try adding "--mpi=pmi2" to your
|
||||
srun command line. If that doesn't work, or if you are
|
||||
not running under SLURM, try removing or renaming the
|
||||
pmi2.h header file so PMI2 support will not automatically
|
||||
be built, reconfigure and build OMPI, and then try again
|
||||
with only PMI1 support enabled.
|
@ -10,14 +10,10 @@
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
|
||||
#include "pmi2_pmap_parser.h"
|
||||
#ifdef STANDALONE_TEST
|
||||
#define WANT_PMI2_SUPPORT 1
|
||||
#else
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include "grpcomm_pmi.h"
|
||||
#endif
|
||||
|
||||
/**
|
||||
@ -134,7 +130,7 @@ static int *find_lrs(char *map, int my_node, int *nlrs)
|
||||
* @return array that contains ranks local to my_rank or NULL
|
||||
* on failure. Array must be freed by the caller.
|
||||
*/
|
||||
int *orte_grpcomm_pmi2_parse_pmap(char *pmap, int my_rank,
|
||||
int *mca_common_pmi2_parse_pmap(char *pmap, int my_rank,
|
||||
int *node, int *nlrs)
|
||||
{
|
||||
char *p;
|
18
opal/mca/common/pmi/pmi2_pmap_parser.h
Обычный файл
18
opal/mca/common/pmi/pmi2_pmap_parser.h
Обычный файл
@ -0,0 +1,18 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
*
|
||||
* Copyright (c) 2013 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
#ifndef PMI2_PMAP_PARSER_H
|
||||
#define PMI2_PMAP_PARSER_H
|
||||
|
||||
int *mca_common_pmi2_parse_pmap(char *pmap, int my_rank,
|
||||
int *node, int *nlrs);
|
||||
#endif
|
@ -15,10 +15,8 @@
|
||||
|
||||
#include <time.h>
|
||||
#include <string.h>
|
||||
#include <pmi.h>
|
||||
#if WANT_PMI2_SUPPORT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
|
||||
#include <regex.h>
|
||||
|
||||
@ -30,7 +28,6 @@
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/show_help.h"
|
||||
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
#include "opal/mca/dstore/base/base.h"
|
||||
#include "dstore_pmi.h"
|
||||
|
||||
@ -73,26 +70,16 @@ static char* setup_key(mca_dstore_pmi_module_t *mod,
|
||||
* PMI functions, we define a set of wrappers for those
|
||||
* common functions we will use
|
||||
*/
|
||||
static int kvs_put(mca_dstore_pmi_module_t *mod,
|
||||
static inline int kvs_put(mca_dstore_pmi_module_t *mod,
|
||||
const char *key, const char *value)
|
||||
{
|
||||
#if WANT_PMI2_SUPPORT
|
||||
return PMI2_KVS_Put(key, value);
|
||||
#else
|
||||
return PMI_KVS_Put(mod->pmi_kvs_name, key, value);
|
||||
#endif
|
||||
return mca_common_pmi_put(mod->pmi_kvs_name, key, value);
|
||||
}
|
||||
|
||||
static int kvs_get(mca_dstore_pmi_module_t *mod,
|
||||
static inline int kvs_get(mca_dstore_pmi_module_t *mod,
|
||||
const char *key, char *value, int valuelen)
|
||||
{
|
||||
#if WANT_PMI2_SUPPORT
|
||||
int len;
|
||||
|
||||
return PMI2_KVS_Get(mod->pmi_kvs_name, PMI2_ID_NULL, key, value, valuelen, &len);
|
||||
#else
|
||||
return PMI_KVS_Get(mod->pmi_kvs_name, key, value, valuelen);
|
||||
#endif
|
||||
return mca_common_pmi_get(mod->pmi_kvs_name, key, value, valuelen);
|
||||
}
|
||||
|
||||
static void finalize(struct opal_dstore_base_module_t *imod)
|
||||
@ -164,9 +151,7 @@ static int pmi_commit_packed(mca_dstore_pmi_module_t *mod,
|
||||
|
||||
rc = kvs_put(mod, pmikey, tmp);
|
||||
free(pmikey);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_KVS_Put");
|
||||
rc = OPAL_ERROR;
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
break;
|
||||
}
|
||||
|
||||
@ -265,7 +250,7 @@ static int pmi_get_packed(mca_dstore_pmi_module_t *mod,
|
||||
|
||||
pmi_tmp = calloc (mod->pmi_vallen_max, 1);
|
||||
if (NULL == pmi_tmp) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* read all of the packed data from this proc */
|
||||
@ -275,7 +260,7 @@ static int pmi_get_packed(mca_dstore_pmi_module_t *mod,
|
||||
sprintf (tmp_key, "key%d", remote_key);
|
||||
|
||||
if (NULL == (pmikey = setup_key(mod, proc, tmp_key))) {
|
||||
rc = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
rc = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
OPAL_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
@ -284,21 +269,21 @@ static int pmi_get_packed(mca_dstore_pmi_module_t *mod,
|
||||
"GETTING KEY %s", pmikey));
|
||||
|
||||
rc = kvs_get(mod, pmikey, pmi_tmp, mod->pmi_vallen_max);
|
||||
free (pmikey);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
break;
|
||||
free (pmikey);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
break;
|
||||
}
|
||||
|
||||
size = strlen (pmi_tmp);
|
||||
size = strlen (pmi_tmp);
|
||||
|
||||
if (NULL == tmp_encoded) {
|
||||
tmp_encoded = malloc (size + 1);
|
||||
} else {
|
||||
tmp_encoded = realloc (tmp_encoded, bytes_read + size + 1);
|
||||
}
|
||||
if (NULL == tmp_encoded) {
|
||||
tmp_encoded = malloc (size + 1);
|
||||
} else {
|
||||
tmp_encoded = realloc (tmp_encoded, bytes_read + size + 1);
|
||||
}
|
||||
|
||||
strcpy (tmp_encoded + bytes_read, pmi_tmp);
|
||||
bytes_read += size;
|
||||
strcpy (tmp_encoded + bytes_read, pmi_tmp);
|
||||
bytes_read += size;
|
||||
|
||||
/* is the string terminator present? */
|
||||
if ('-' == tmp_encoded[bytes_read-1]) {
|
||||
@ -316,7 +301,7 @@ static int pmi_get_packed(mca_dstore_pmi_module_t *mod,
|
||||
*packed_data = (char *) pmi_decode (tmp_encoded, len);
|
||||
free (tmp_encoded);
|
||||
if (NULL == *packed_data) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
|
||||
@ -497,20 +482,11 @@ static void commit(struct opal_dstore_base_module_t *imod,
|
||||
/* commit the packed data to PMI */
|
||||
pmi_commit_packed(mod, id);
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
PMI2_KVS_Fence();
|
||||
#else
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (PMI_SUCCESS != (rc = PMI_KVS_Commit(mod->pmi_kvs_name))) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_KVS_Commit");
|
||||
return;
|
||||
}
|
||||
/* Barrier here to ensure all other procs have committed */
|
||||
PMI_Barrier();
|
||||
int rc = mca_common_pmi_commit(mod->pmi_kvs_name);
|
||||
if( OPAL_SUCCESS != rc ){
|
||||
// TODO: What should we do here? Exit with a failure?
|
||||
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static int fetch(struct opal_dstore_base_module_t *imod,
|
||||
|
@ -11,17 +11,13 @@
|
||||
#include "opal_config.h"
|
||||
#include "opal/constants.h"
|
||||
|
||||
#include <pmi.h>
|
||||
#if WANT_PMI2_SUPPORT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
|
||||
#include "opal/mca/dstore/dstore.h"
|
||||
#include "opal/mca/dstore/base/base.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
#include "dstore_pmi.h"
|
||||
|
||||
static int dstore_pmi_component_register(void);
|
||||
@ -85,7 +81,8 @@ static bool component_avail(void)
|
||||
* will force our selection if we are direct-launched,
|
||||
* and the orted will turn us "off" if indirectly launched
|
||||
*/
|
||||
if (mca_common_pmi_init() && OPAL_SUCCESS == setup_pmi()) {
|
||||
int rc = mca_common_pmi_init(opal_pmi_version);
|
||||
if ( OPAL_SUCCESS == rc && OPAL_SUCCESS == setup_pmi()) {
|
||||
return true;
|
||||
}
|
||||
/* if not, then we are not available */
|
||||
@ -130,56 +127,23 @@ static int setup_pmi(void)
|
||||
{
|
||||
int max_length, rc;
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
pmi_vallen_max = PMI2_MAX_VALLEN;
|
||||
max_length = PMI2_MAX_VALLEN;
|
||||
#else
|
||||
rc = PMI_KVS_Get_value_length_max(&pmi_vallen_max);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
OPAL_OUTPUT_VERBOSE((1, opal_dstore_base_framework.framework_output,
|
||||
"dstore:pmi:pmi_setup failed %s with error %s",
|
||||
"PMI_Get_value_length_max",
|
||||
opal_errmgr_base_pmi_error(rc)));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
if (PMI_SUCCESS != (rc = PMI_KVS_Get_name_length_max(&max_length))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, opal_dstore_base_framework.framework_output,
|
||||
"dstore:pmi:pmi_setup failed %s with error %s",
|
||||
"PMI_KVS_Get_name_length_max",
|
||||
opal_errmgr_base_pmi_error(rc)));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
#endif
|
||||
pmi_vallen_max = mca_common_pmi_vallen();
|
||||
max_length = mca_common_pmi_kvslen();
|
||||
pmi_kvs_name = (char*)malloc(max_length);
|
||||
if (NULL == pmi_kvs_name) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
rc = PMI2_Job_GetId(pmi_kvs_name, max_length);
|
||||
#else
|
||||
rc = PMI_KVS_Get_my_name(pmi_kvs_name,max_length);
|
||||
#endif
|
||||
if (PMI_SUCCESS != rc) {
|
||||
OPAL_OUTPUT_VERBOSE((1, opal_dstore_base_framework.framework_output,
|
||||
"dstore:pmi:pmi_setup failed %s with error %s on maxlength %d",
|
||||
"PMI_KVS_Get_my_name",
|
||||
opal_errmgr_base_pmi_error(rc), max_length));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
pmi_keylen_max = PMI2_MAX_KEYLEN;
|
||||
#else
|
||||
if (PMI_SUCCESS != (rc = PMI_KVS_Get_key_length_max(&pmi_keylen_max))) {
|
||||
rc = mca_common_pmi_kvsname(pmi_kvs_name, max_length);
|
||||
if( OPAL_SUCCESS != rc ){
|
||||
OPAL_OUTPUT_VERBOSE((1, opal_dstore_base_framework.framework_output,
|
||||
"dstore:pmi:pmi_setup failed %s with error %s",
|
||||
"PMI_KVS_Get_key_length_max",
|
||||
"mca_common_pmi_jobname",
|
||||
opal_errmgr_base_pmi_error(rc)));
|
||||
return OPAL_ERROR;
|
||||
return rc;
|
||||
}
|
||||
#endif
|
||||
|
||||
pmi_keylen_max = mca_common_pmi_keylen();
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -45,6 +45,7 @@
|
||||
char *opal_signal_string = NULL;
|
||||
char *opal_net_private_ipv4 = NULL;
|
||||
char *opal_set_max_sys_limits = NULL;
|
||||
int opal_pmi_version = 0;
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
bool opal_base_distill_checkpoint_ready = false;
|
||||
@ -179,6 +180,18 @@ int opal_register_params(void)
|
||||
return ret;
|
||||
}
|
||||
|
||||
opal_pmi_version = 0;
|
||||
#ifdef WANT_PMI2_SUPPORT
|
||||
(void) mca_base_var_register ("opal", "opal", NULL, "pmi_version",
|
||||
"Set preferred PMI version: 0 => auto detect, 1 = PMIv1, 2 = PMIv2",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
|
||||
&opal_pmi_version);
|
||||
#else
|
||||
opal_pmi_version = 1;
|
||||
#endif
|
||||
|
||||
|
||||
/* The ddt engine has a few parameters */
|
||||
ret = opal_datatype_register_params();
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
|
@ -29,6 +29,7 @@
|
||||
extern char *opal_signal_string;
|
||||
extern char *opal_net_private_ipv4;
|
||||
extern char *opal_set_max_sys_limits;
|
||||
extern int opal_pmi_version;
|
||||
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
extern bool opal_progress_debug;
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/runtime/opal_params.h"
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
@ -65,7 +66,7 @@ static int pmi_component_open(void)
|
||||
static int pmi_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
/* we are available anywhere PMI is available, but not for HNP itself */
|
||||
if (!ORTE_PROC_IS_HNP && mca_common_pmi_init()) {
|
||||
if (!ORTE_PROC_IS_HNP && OPAL_SUCCESS == mca_common_pmi_init(opal_pmi_version)) {
|
||||
/* if PMI is available, use it */
|
||||
*priority = 35;
|
||||
*module = (mca_base_module_t *)&orte_ess_pmi_module;
|
||||
|
@ -38,11 +38,6 @@
|
||||
#include <ifaddrs.h>
|
||||
#endif
|
||||
|
||||
#include <pmi.h>
|
||||
#if WANT_PMI2_SUPPORT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
@ -134,17 +129,11 @@ static int rte_init(void)
|
||||
}
|
||||
ORTE_PROC_MY_NAME->jobid = jobid;
|
||||
/* get our rank from PMI */
|
||||
if (!mca_common_pmi_rank(&i)) {
|
||||
error = "could not get PMI rank";
|
||||
goto error;
|
||||
}
|
||||
i = mca_common_pmi_rank();
|
||||
ORTE_PROC_MY_NAME->vpid = i + 1; /* compensate for orterun */
|
||||
|
||||
/* get the number of procs from PMI */
|
||||
if (!mca_common_pmi_size(&i)) {
|
||||
error = "could not get PMI universe size";
|
||||
goto error;
|
||||
}
|
||||
i = mca_common_pmi_universe();
|
||||
orte_process_info.num_procs = i + 1; /* compensate for orterun */
|
||||
|
||||
/* complete setup */
|
||||
@ -158,31 +147,10 @@ static int rte_init(void)
|
||||
}
|
||||
|
||||
/* we are a direct-launched MPI process */
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
/* Get domain id */
|
||||
pmi_id = (char*)malloc(PMI2_MAX_VALLEN);
|
||||
if (PMI_SUCCESS != (ret = PMI2_Job_GetId(pmi_id, PMI2_MAX_VALLEN))) {
|
||||
error = "PMI2_Job_GetId failed";
|
||||
if( OPAL_SUCCESS != (ret = mca_common_pmi_id(&pmi_id, &error)) ){
|
||||
goto error;
|
||||
}
|
||||
#else
|
||||
{
|
||||
int pmi_maxlen;
|
||||
|
||||
/* get our PMI id length */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_id_length_max(&pmi_maxlen))) {
|
||||
error = "PMI_Get_id_length_max";
|
||||
goto error;
|
||||
}
|
||||
pmi_id = (char*)malloc(pmi_maxlen);
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_kvs_domain_id(pmi_id, pmi_maxlen))) {
|
||||
free(pmi_id);
|
||||
error = "PMI_Get_kvs_domain_id";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
/* PMI is very nice to us - the domain id is an integer followed
|
||||
* by a '.', followed by essentially a stepid. The first integer
|
||||
* defines an overall job number. The second integer is the number of
|
||||
@ -204,17 +172,11 @@ static int rte_init(void)
|
||||
ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid);
|
||||
|
||||
/* get our rank */
|
||||
if (!mca_common_pmi_rank(&i)) {
|
||||
error = "could not get PMI rank";
|
||||
goto error;
|
||||
}
|
||||
i = mca_common_pmi_rank();
|
||||
ORTE_PROC_MY_NAME->vpid = i;
|
||||
|
||||
/* get the number of procs from PMI */
|
||||
if (!mca_common_pmi_size(&i)) {
|
||||
error = "could not get PMI universe size";
|
||||
goto error;
|
||||
}
|
||||
// FIX ME: What do we need here - size or universe?
|
||||
i = mca_common_pmi_universe();
|
||||
orte_process_info.num_procs = i;
|
||||
/* push into the environ for pickup in MPI layer for
|
||||
* MPI-3 required info key
|
||||
@ -267,69 +229,10 @@ static int rte_init(void)
|
||||
goto error;
|
||||
}
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
{
|
||||
/* get our local proc info to find our local rank */
|
||||
char *pmapping = (char*)malloc(PMI2_MAX_VALLEN);
|
||||
int found, sid, nodes, k;
|
||||
orte_vpid_t n;
|
||||
char *p;
|
||||
ret = PMI2_Info_GetJobAttr("PMI_process_mapping", pmapping, PMI2_MAX_VALLEN, &found);
|
||||
if (!found || PMI_SUCCESS != ret) { /* can't check PMI2_SUCCESS as some folks (i.e., Cray) don't define it */
|
||||
error = "could not get PMI_process_mapping (PMI2_Info_GetJobAttr() failed)";
|
||||
goto error;
|
||||
}
|
||||
|
||||
i = 0; n = 0; procs = 0;
|
||||
if (NULL != (p = strstr(pmapping, "(vector"))) {
|
||||
while (NULL != (p = strstr(p+1, ",("))) {
|
||||
if (3 == sscanf(p, ",(%d,%d,%d)", &sid, &nodes, &procs)) {
|
||||
for (k = 0; k < nodes; k++) {
|
||||
if ((ORTE_PROC_MY_NAME->vpid >= n) &&
|
||||
(ORTE_PROC_MY_NAME->vpid < (n + procs))) {
|
||||
break;
|
||||
}
|
||||
n += procs;
|
||||
}
|
||||
} else {
|
||||
procs = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
free(pmapping);
|
||||
|
||||
if (0 < procs) {
|
||||
ranks = (int*)malloc(procs * sizeof(int));
|
||||
for (i=0; i < procs; i++) {
|
||||
ranks[i] = n + i;
|
||||
}
|
||||
}
|
||||
|
||||
if (NULL == ranks) {
|
||||
error = "could not get PMI_process_mapping";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
#else
|
||||
/* get our local proc info to find our local rank */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_clique_size(&procs))) {
|
||||
OPAL_PMI_ERROR(ret, "PMI_Get_clique_size");
|
||||
error = "could not get PMI clique size";
|
||||
ret = mca_common_pmi_local_info(ORTE_PROC_MY_NAME->vpid, &ranks, &procs, &error);
|
||||
if( OPAL_SUCCESS != ret ){
|
||||
goto error;
|
||||
}
|
||||
/* now get the specific ranks */
|
||||
ranks = (int*)calloc(procs, sizeof(int));
|
||||
if (NULL == ranks) {
|
||||
error = "could not get memory for local ranks";
|
||||
ret = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto error;
|
||||
}
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_clique_ranks(ranks, procs))) {
|
||||
OPAL_PMI_ERROR(ret, "PMI_Get_clique_ranks");
|
||||
error = "could not get clique ranks";
|
||||
goto error;
|
||||
}
|
||||
#endif
|
||||
/* store the number of local peers - remember, we want the number
|
||||
* of peers that share the node WITH ME, so we have to subtract
|
||||
* ourselves from that number
|
||||
@ -533,11 +436,7 @@ static void rte_abort(int status, bool report)
|
||||
/* PMI doesn't like NULL messages, but our interface
|
||||
* doesn't provide one - so rig one up here
|
||||
*/
|
||||
#if WANT_PMI2_SUPPORT
|
||||
PMI2_Abort(status, "N/A");
|
||||
#else
|
||||
PMI_Abort(status, "N/A");
|
||||
#endif
|
||||
mca_common_pmi_abort(status, "N/A");
|
||||
|
||||
/* - Clean out the global structures
|
||||
* (not really necessary, but good practice) */
|
||||
|
@ -17,10 +17,6 @@ sources = \
|
||||
grpcomm_pmi_module.c \
|
||||
grpcomm_pmi_component.c
|
||||
|
||||
if WANT_PMI2_SUPPORT
|
||||
sources += pmi2_pmap_parser.c
|
||||
endif
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
@ -80,7 +81,7 @@ int orte_grpcomm_pmi_component_query(mca_base_module_t **module, int *priority)
|
||||
* selection will have been turned "off" for us
|
||||
*/
|
||||
if (ORTE_PROC_IS_APP &&
|
||||
mca_common_pmi_init()) {
|
||||
OPAL_SUCCESS == mca_common_pmi_init(opal_pmi_version)) {
|
||||
/* if PMI is available, make it available for use by MPI procs */
|
||||
*priority = my_priority;
|
||||
*module = (mca_base_module_t *)&orte_grpcomm_pmi_module;
|
||||
|
@ -18,13 +18,10 @@
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <pmi.h>
|
||||
#if WANT_PMI2_SUPPORT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
#include "opal/mca/common/pmi/common_pmi.h"
|
||||
#include "opal/mca/dstore/dstore.h"
|
||||
|
||||
@ -62,7 +59,7 @@ orte_grpcomm_base_module_t orte_grpcomm_pmi_module = {
|
||||
*/
|
||||
static int init(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
return mca_common_pmi_init(opal_pmi_version);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -70,6 +67,7 @@ static int init(void)
|
||||
*/
|
||||
static void finalize(void)
|
||||
{
|
||||
mca_common_pmi_finalize();
|
||||
return;
|
||||
}
|
||||
|
||||
@ -107,19 +105,9 @@ static int pmi_barrier(orte_grpcomm_collective_t *coll)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
#if WANT_PMI2_SUPPORT
|
||||
/* PMI2 doesn't provide a barrier, so use the Fence function here */
|
||||
if (PMI_SUCCESS != (rc = PMI2_KVS_Fence())) {
|
||||
OPAL_PMI_ERROR(rc, "PMI2_KVS_Fence");
|
||||
return ORTE_ERROR;
|
||||
if( OPAL_SUCCESS != (rc = mca_common_pmi_barrier()) ){
|
||||
return rc;
|
||||
}
|
||||
#else
|
||||
/* use the PMI barrier function */
|
||||
if (PMI_SUCCESS != (rc = PMI_Barrier())) {
|
||||
OPAL_PMI_ERROR(rc, "PMI_Barrier");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
#endif
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
|
||||
"%s grpcomm:pmi barrier complete",
|
||||
@ -152,58 +140,20 @@ static int modex(orte_grpcomm_collective_t *coll)
|
||||
int rc, i;
|
||||
opal_list_t myvals;
|
||||
opal_value_t *kv, kvn;
|
||||
char *error;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
|
||||
"%s grpcomm:pmi: modex entered",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* discover the local ranks */
|
||||
#if WANT_PMI2_SUPPORT
|
||||
{
|
||||
char *pmapping = (char*)malloc(PMI2_MAX_VALLEN);
|
||||
int found;
|
||||
int my_node;
|
||||
|
||||
rc = PMI2_Info_GetJobAttr("PMI_process_mapping", pmapping, PMI2_MAX_VALLEN, &found);
|
||||
if (!found || PMI_SUCCESS != rc) { /* can't check PMI2_SUCCESS as some folks (i.e., Cray) don't define it */
|
||||
opal_output(0, "%s could not get PMI_process_mapping (PMI2_Info_GetJobAttr() failed)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
local_ranks = orte_grpcomm_pmi2_parse_pmap(pmapping, ORTE_PROC_MY_NAME->vpid, &my_node, &local_rank_count);
|
||||
if (NULL == local_ranks) {
|
||||
opal_output(0, "%s could not get PMI_process_mapping",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
|
||||
"%s: pmapping: %s my_node=%d lr_count=%d\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), pmapping, my_node, local_rank_count));
|
||||
|
||||
free(pmapping);
|
||||
rc = mca_common_pmi_local_info(ORTE_PROC_MY_NAME->vpid, &local_ranks,
|
||||
&local_rank_count, &error);
|
||||
if( OPAL_SUCCESS != rc){
|
||||
opal_output(0, "%s could not get PMI_process_mapping: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), error);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
#else
|
||||
rc = PMI_Get_clique_size (&local_rank_count);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
local_ranks = calloc (local_rank_count, sizeof (int));
|
||||
if (NULL == local_ranks) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
rc = PMI_Get_clique_ranks (local_ranks, local_rank_count);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* our RTE data was constructed and pushed in the ESS pmi component */
|
||||
|
||||
|
@ -102,6 +102,7 @@ char *orte_set_slots = NULL;
|
||||
bool orte_display_allocation;
|
||||
bool orte_display_devel_allocation;
|
||||
bool orte_soft_locations = false;
|
||||
int orted_pmi_version = 0;
|
||||
|
||||
/* launch agents */
|
||||
char *orte_launch_agent = NULL;
|
||||
|
@ -703,6 +703,9 @@ ORTE_DECLSPEC extern char *orte_rankfile;
|
||||
ORTE_DECLSPEC extern int orte_num_allocated_nodes;
|
||||
ORTE_DECLSPEC extern char *orte_node_regex;
|
||||
|
||||
/* PMI version control */
|
||||
ORTE_DECLSPEC extern int orted_pmi_version;
|
||||
|
||||
/* tool communication controls */
|
||||
ORTE_DECLSPEC extern bool orte_report_events;
|
||||
ORTE_DECLSPEC extern char *orte_report_events_uri;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user