1
1

Complete implementation of pmi support. Ensure we support both mpirun and direct launch within same configuration to avoid requiring separate builds. Add support for generic pmi, not just under slurm. Add publish/subscribe support, although slurm's pmi implementation will just return an error as it hasn't been done yet.

This commit was SVN r25303.
Этот коммит содержится в:
Ralph Castain 2011-10-17 20:51:22 +00:00
родитель d7a8553179
Коммит 8f0ef54130
19 изменённых файлов: 1390 добавлений и 280 удалений

39
ompi/mca/pubsub/pmi/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,39 @@
#
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# Build flags located by ORTE_CHECK_PMI in this component's configure.m4
AM_CPPFLAGS = $(pubsub_pmi_CPPFLAGS)

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_ompi_pubsub_pmi_DSO
component_noinst =
component_install = mca_pubsub_pmi.la
else
component_noinst = libmca_pubsub_pmi.la
component_install =
endif

# Sources for the pubsub/pmi component
local_sources = \
pubsub_pmi.c \
pubsub_pmi.h \
pubsub_pmi_component.c

# DSO build: loadable module installed into the MCA component dir
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_pubsub_pmi_la_SOURCES = $(local_sources)
mca_pubsub_pmi_la_LDFLAGS = -module -avoid-version $(pubsub_pmi_LDFLAGS)
mca_pubsub_pmi_la_LIBADD = $(pubsub_pmi_LIBS)

# Static build: convenience library linked into the main library
noinst_LTLIBRARIES = $(component_noinst)
libmca_pubsub_pmi_la_SOURCES = $(local_sources)
libmca_pubsub_pmi_la_LIBADD = $(pubsub_pmi_LIBS)
libmca_pubsub_pmi_la_LDFLAGS = -module -avoid-version $(pubsub_pmi_LDFLAGS)

32
ompi/mca/pubsub/pmi/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,32 @@
# -*- shell-script -*-
#
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# Selection priority for this component within the pubsub framework
AC_DEFUN([MCA_ompi_pubsub_pmi_PRIORITY], [10])

# MCA_ompi_pubsub_pmi_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_ompi_pubsub_pmi_CONFIG], [
AC_CONFIG_FILES([ompi/mca/pubsub/pmi/Makefile])

# probe for PMI headers/libs; sets pubsub_pmi_{CPPFLAGS,LDFLAGS,LIBS}
ORTE_CHECK_PMI([pubsub_pmi], [pubsub_pmi_good=1], [pubsub_pmi_good=0])

# if check worked, set wrapper flags if so.
# Evaluate succeed / fail
AS_IF([test "$pubsub_pmi_good" = 1],
[pubsub_pmi_WRAPPER_EXTRA_LDFLAGS="$pubsub_pmi_LDFLAGS"
pubsub_pmi_WRAPPER_EXTRA_LIBS="$pubsub_pmi_LIBS"
$1],
[$2])

# set build flags to use in makefile
AC_SUBST([pubsub_pmi_CPPFLAGS])
AC_SUBST([pubsub_pmi_LDFLAGS])
AC_SUBST([pubsub_pmi_LIBS])
])

127
ompi/mca/pubsub/pmi/pubsub_pmi.c Обычный файл
Просмотреть файл

@ -0,0 +1,127 @@
/*
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/constants.h"

#include <stdlib.h>

#include <pmi.h>

#include "ompi/info/info.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "ompi/mca/pubsub/base/base.h"
#include "pubsub_pmi.h"
static char* pmi_error(int pmi_err);
/* Log a PMI error with source location and a human-readable message.
 * NOTE: the do/while wrapper deliberately has NO trailing semicolon
 * so the macro behaves as a single statement (the caller supplies
 * the ';'); the original trailing ';' would break if/else usage. */
#define ORTE_PMI_ERROR(pmi_err, pmi_func)                       \
    do {                                                        \
        opal_output(0, "%s[%s:%d:%s] %s: %s\n",                 \
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),         \
                    __FILE__, __LINE__, __func__,               \
                    pmi_func, pmi_error(pmi_err));              \
    } while(0)
/*
 * Init the module - no PMI state is needed here; PMI startup is
 * handled in the component query, so there is nothing to do.
 */
static int init(void)
{
    return OMPI_SUCCESS;
}
/*
 * Publish the port_name for the specified service_name via the PMI
 * name service.  The info argument is part of the pubsub interface
 * but PMI provides no way to use it.  Returns OMPI_SUCCESS or
 * OMPI_ERROR (after logging the PMI failure).
 */
static int publish ( char *service_name, ompi_info_t *info, char *port_name )
{
    int rc;

    if (PMI_SUCCESS != (rc = PMI_Publish_name(service_name, port_name))) {
        /* report the function that actually failed - the original
         * message named "PMI_KVS_Publish_name", which does not exist */
        ORTE_PMI_ERROR(rc, "PMI_Publish_name");
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}
/*
 * Look up the port published under service_name.
 *
 * PMI_Lookup_name fills a caller-provided buffer, so one must be
 * allocated first - the original code passed a NULL pointer, making
 * PMI write through an invalid address.  Returns a malloc'd string
 * that the caller owns (and must free), or NULL on error.
 */
static char* lookup ( char *service_name, ompi_info_t *info )
{
    char *port;
    int rc;

    /* PMI exposes no "max port name length" call, so use a generous
     * fixed size - TODO confirm against the PMI implementation in use */
    if (NULL == (port = malloc(1024))) {
        return NULL;
    }
    if (PMI_SUCCESS != (rc = PMI_Lookup_name(service_name, port))) {
        ORTE_PMI_ERROR(rc, "PMI_Lookup_name");
        free(port);
        return NULL;
    }
    return port;
}
/*
 * Delete the entry published under service_name.  The info argument
 * is unused (PMI offers no way to apply it).  Returns OMPI_SUCCESS
 * or OMPI_ERROR after logging the PMI failure.
 */
static int unpublish ( char *service_name, ompi_info_t *info )
{
    int rc;

    if (PMI_SUCCESS != (rc = PMI_Unpublish_name(service_name))) {
        ORTE_PMI_ERROR(rc, "PMI_Unpublish_name");
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;   /* original had a stray double semicolon */
}
/*
 * finalize the module - nothing to tear down; PMI shutdown is the
 * component close function's job.
 */
static int finalize(void)
{
    return OMPI_SUCCESS;
}
/*
 * instantiate the module
 *
 * Function-pointer table handed to the pubsub framework; entry order
 * must match ompi_pubsub_base_module_t.
 */
ompi_pubsub_base_module_t ompi_pubsub_pmi_module = {
    init,
    publish,
    unpublish,
    lookup,
    finalize
};
/* useful util: translate a PMI error code into a human-readable
 * string for diagnostics.  Returns pointers to string literals -
 * callers must not free them. */
static char* pmi_error(int pmi_err)
{
    char * err_msg;

    switch(pmi_err) {
    case PMI_FAIL: err_msg = "Operation failed"; break;
    case PMI_ERR_INIT: err_msg = "PMI is not initialized"; break;
    case PMI_ERR_NOMEM: err_msg = "Input buffer not large enough"; break;
    case PMI_ERR_INVALID_ARG: err_msg = "Invalid argument"; break;
    case PMI_ERR_INVALID_KEY: err_msg = "Invalid key argument"; break;
    case PMI_ERR_INVALID_KEY_LENGTH: err_msg = "Invalid key length argument"; break;
    case PMI_ERR_INVALID_VAL: err_msg = "Invalid value argument"; break;
    case PMI_ERR_INVALID_VAL_LENGTH: err_msg = "Invalid value length argument"; break;
    case PMI_ERR_INVALID_LENGTH: err_msg = "Invalid length argument"; break;
    case PMI_ERR_INVALID_NUM_ARGS: err_msg = "Invalid number of arguments"; break;
    case PMI_ERR_INVALID_ARGS: err_msg = "Invalid args argument"; break;
    case PMI_ERR_INVALID_NUM_PARSED: err_msg = "Invalid num_parsed length argument"; break;
    /* fixed typo: was "Invalid invalid keyvalp atgument" */
    case PMI_ERR_INVALID_KEYVALP: err_msg = "Invalid keyvalp argument"; break;
    case PMI_ERR_INVALID_SIZE: err_msg = "Invalid size argument"; break;
    case PMI_ERR_INVALID_KVS: err_msg = "Invalid kvs argument"; break;
    case PMI_SUCCESS: err_msg = "Success"; break;
    default: err_msg = "Unknown error";   /* fixed typo: was "Unkown" */
    }
    return err_msg;
}

29
ompi/mca/pubsub/pmi/pubsub_pmi.h Обычный файл
Просмотреть файл

@ -0,0 +1,29 @@
/*
 * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file
 * Public declarations for the PMI-based publish/subscribe component.
 */
#ifndef OMPI_PUBSUB_PMI_H
#define OMPI_PUBSUB_PMI_H

#include "ompi_config.h"

#include "orte/types.h"

#include "ompi/mca/pubsub/pubsub.h"

BEGIN_C_DECLS

/* access to module (defined in pubsub_pmi.c) */
extern ompi_pubsub_base_module_t ompi_pubsub_pmi_module;

/* access to component (defined in pubsub_pmi_component.c) */
OMPI_MODULE_DECLSPEC extern ompi_pubsub_base_component_t mca_pubsub_pmi_component;

END_C_DECLS

#endif /* OMPI_PUBSUB_PMI_H */

88
ompi/mca/pubsub/pmi/pubsub_pmi_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,88 @@
/*
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include <pmi.h>
#include "orte/util/proc_info.h"
#include "pubsub_pmi.h"
static int pubsub_pmi_component_open(void);
static int pubsub_pmi_component_close(void);
static int pubsub_pmi_component_query(mca_base_module_t **module, int *priority);
/*
 * Component descriptor registered with the MCA framework; version
 * fields and function pointers must follow the layout of
 * ompi_pubsub_base_component_t.
 */
ompi_pubsub_base_component_t mca_pubsub_pmi_component = {
    {
        OMPI_PUBSUB_BASE_VERSION_2_0_0,

        "pmi", /* MCA component name */
        OMPI_MAJOR_VERSION, /* MCA component major version */
        OMPI_MINOR_VERSION, /* MCA component minor version */
        OMPI_RELEASE_VERSION, /* MCA component release version */
        pubsub_pmi_component_open, /* component open */
        pubsub_pmi_component_close, /* component close */
        pubsub_pmi_component_query /* component query */
    },
    {
        /* This component is checkpoint ready */
        MCA_BASE_METADATA_PARAM_CHECKPOINT
    }
};
/* Component open: nothing to set up - PMI startup happens in query. */
static int pubsub_pmi_component_open(void)
{
    return OMPI_SUCCESS;
}
static int pubsub_pmi_component_close(void)
{
PMI_BOOL initialized;
/* if we weren't selected, cleanup if necessary */
if (PMI_SUCCESS == PMI_Initialized(&initialized) &&
PMI_TRUE == initialized) {
PMI_Finalize();
}
return OMPI_SUCCESS;
}
/*
 * Component query: select this component only for direct launch
 * (no HNP contact info) when PMI is usable - either already
 * initialized or successfully started here.
 */
static int pubsub_pmi_component_query(mca_base_module_t **module, int *priority)
{
    PMI_BOOL pmi_is_up = PMI_FALSE;

    /* for now, only use PMI when direct launched */
    if (NULL != orte_process_info.my_hnp_uri ||
        PMI_SUCCESS != PMI_Initialized(&pmi_is_up)) {
        /* we can't run */
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    if (PMI_TRUE != pmi_is_up) {
        int spawned;
        /* not yet initialized - if we can't startup the PMI,
         * we can't be used */
        if (PMI_SUCCESS != PMI_Init(&spawned)) {
            *priority = -1;
            *module = NULL;
            return ORTE_ERROR;
        }
    }

    /* PMI was already running, or we brought it up - use us */
    *priority = 100;
    *module = (mca_base_module_t *)&ompi_pubsub_pmi_module;
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -119,11 +119,6 @@ int ompi_proc_init(void)
if (OMPI_SUCCESS != (ret = ompi_modex_send_key_value("OMPI_ARCH", &proc->proc_arch, OPAL_UINT32))) {
return ret;
}
} else {
/* get the locality information */
proc->proc_flags = orte_ess.proc_get_locality(&proc->proc_name);
/* get the name of the node it is on */
proc->proc_hostname = orte_ess.proc_get_hostname(&proc->proc_name);
}
}
@ -149,8 +144,8 @@ int ompi_proc_set_arch(void)
OPAL_THREAD_LOCK(&ompi_proc_lock);
for( item = opal_list_get_first(&ompi_proc_list);
item != opal_list_get_end(&ompi_proc_list);
item = opal_list_get_next(item)) {
item != opal_list_get_end(&ompi_proc_list);
item = opal_list_get_next(item)) {
proc = (ompi_proc_t*)item;
if (proc->proc_name.vpid != ORTE_PROC_MY_NAME->vpid) {
@ -177,6 +172,10 @@ int ompi_proc_set_arch(void)
OPAL_THREAD_UNLOCK(&ompi_proc_lock);
return ret;
}
/* get the locality information */
proc->proc_flags = orte_ess.proc_get_locality(&proc->proc_name);
/* get the name of the node it is on */
proc->proc_hostname = orte_ess.proc_get_hostname(&proc->proc_name);
}
}
OPAL_THREAD_UNLOCK(&ompi_proc_lock);

Просмотреть файл

@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
@ -88,4 +88,5 @@ AC_DEFUN([ORTE_CHECK_PMI],[
AC_DEFINE_UNQUOTED([WANT_PMI_SUPPORT],
[$orte_enable_pmi],
[Whether we want PMI support])
AM_CONDITIONAL(WANT_PMI_SUPPORT, [test "$orte_enable_pmi" = 1])
])

38
orte/mca/ess/pmi/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,38 @@
#
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# Build flags located by ORTE_CHECK_PMI in this component's configure.m4
AM_CPPFLAGS = $(ess_pmi_CPPFLAGS)

# Sources for the ess/pmi component
sources = \
ess_pmi.h \
ess_pmi_component.c \
ess_pmi_module.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_ess_pmi_DSO
component_noinst =
component_install = mca_ess_pmi.la
else
component_noinst = libmca_ess_pmi.la
component_install =
endif

# DSO build: loadable module installed into the MCA component dir
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_ess_pmi_la_SOURCES = $(sources)
mca_ess_pmi_la_LDFLAGS = -module -avoid-version $(ess_pmi_LDFLAGS)
mca_ess_pmi_la_LIBADD = $(ess_pmi_LIBS)

# Static build: convenience library linked into the main library
noinst_LTLIBRARIES = $(component_noinst)
libmca_ess_pmi_la_SOURCES =$(sources)
libmca_ess_pmi_la_LDFLAGS = -module -avoid-version $(ess_pmi_LDFLAGS)
libmca_ess_pmi_la_LIBADD = $(ess_pmi_LIBS)

33
orte/mca/ess/pmi/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,33 @@
# -*- shell-script -*-
#
# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# Selection priority for this component within the ess framework
AC_DEFUN([MCA_orte_ess_pmi_PRIORITY], [10])

# MCA_orte_ess_pmi_CONFIG([action-if-found], [action-if-not-found])
# (the comment previously named "MCA_ess_pmi_CONFIG", which is not
# the macro defined below)
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_ess_pmi_CONFIG],[
AC_CONFIG_FILES([orte/mca/ess/pmi/Makefile])

# see if PMI support requested
ORTE_CHECK_PMI([ess_pmi], [ess_pmi_good=1], [ess_pmi_good=0])

# if check worked, set wrapper flags if so.
# Evaluate succeed / fail
AS_IF([test "$ess_pmi_good" = "1"],
[ess_pmi_WRAPPER_EXTRA_LDFLAGS="$ess_pmi_LDFLAGS"
ess_pmi_WRAPPER_EXTRA_LIBS="$ess_pmi_LIBS"
$1],
[$2])

# set build flags to use in makefile
AC_SUBST([ess_pmi_CPPFLAGS])
AC_SUBST([ess_pmi_LDFLAGS])
AC_SUBST([ess_pmi_LIBS])
])dnl

19
orte/mca/ess/pmi/ess_pmi.h Обычный файл
Просмотреть файл

@ -0,0 +1,19 @@
/*
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef ORTE_ESS_PMI_H
#define ORTE_ESS_PMI_H
BEGIN_C_DECLS
ORTE_MODULE_DECLSPEC extern orte_ess_base_component_t mca_ess_pmi_component;
END_C_DECLS
#endif /* ORTE_ESS_PMI_H */

105
orte/mca/ess/pmi/ess_pmi_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,105 @@
/*
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <pmi.h>
#include "orte/util/proc_info.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/ess/pmi/ess_pmi.h"
extern orte_ess_base_module_t orte_ess_pmi_module;
static int pmi_component_open(void);
static int pmi_component_close(void);
static int pmi_component_query(mca_base_module_t **module, int *priority);
/*
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it.  Field order must
 * follow orte_ess_base_component_t.
 */
orte_ess_base_component_t mca_ess_pmi_component = {
    {
        ORTE_ESS_BASE_VERSION_2_0_0,

        /* Component name and version */
        "pmi",
        ORTE_MAJOR_VERSION,
        ORTE_MINOR_VERSION,
        ORTE_RELEASE_VERSION,

        /* Component open and close functions */
        pmi_component_open,
        pmi_component_close,
        pmi_component_query
    },
    {
        /* The component is checkpoint ready */
        MCA_BASE_METADATA_PARAM_CHECKPOINT
    }
};
/* Component open: nothing to set up - PMI startup happens in query. */
static int pmi_component_open(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component query: select this component only for direct launch (no
 * HNP contact info) when PMI is usable - either already initialized,
 * or successfully started here.
 */
static int pmi_component_query(mca_base_module_t **module, int *priority)
{
    int spawned;
    PMI_BOOL initialized;

    /* for now, only use PMI when direct launched */
    if (NULL == orte_process_info.my_hnp_uri &&
        PMI_SUCCESS == PMI_Initialized(&initialized)) {
        if (PMI_TRUE != initialized) {
            /* if we can't startup the PMI, we can't be used */
            if (PMI_SUCCESS != PMI_Init(&spawned)) {
                *priority = -1;
                *module = NULL;
                return ORTE_ERROR;
            }
        }
        /* if we were able to startup PMI, or it was already running,
         * then select us.  NOTE: this block previously sat *inside*
         * the "not yet initialized" branch, so an already-initialized
         * PMI made the component reject itself - moved out to match
         * the (correct) pubsub_pmi_component_query() structure */
        *priority = 100;
        *module = (mca_base_module_t *)&orte_ess_pmi_module;
        return ORTE_SUCCESS;
    }

    /* we can't run */
    *priority = -1;
    *module = NULL;
    return ORTE_ERROR;
}
static int pmi_component_close(void)
{
PMI_BOOL initialized;
/* if we weren't selected, cleanup */
if (PMI_SUCCESS == PMI_Initialized(&initialized) &&
PMI_TRUE == initialized) {
PMI_Finalize();
}
return ORTE_SUCCESS;
}

484
orte/mca/ess/pmi/ess_pmi_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,484 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <ctype.h>
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_IFADDRS_H
#include <ifaddrs.h>
#endif
#include <pmi.h>
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/paffinity/paffinity.h"
#include "opal/util/printf.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/nidmap.h"
#include "orte/util/pre_condition_transports.h"
#include "orte/util/regex.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/ess/base/base.h"
#include "orte/mca/ess/pmi/ess_pmi.h"
static int rte_init(void);
static int rte_finalize(void);
static void rte_abort(int error_code, bool report) __opal_attribute_noreturn__;
static uint8_t proc_get_locality(orte_process_name_t *proc);
static orte_vpid_t proc_get_daemon(orte_process_name_t *proc);
static char* proc_get_hostname(orte_process_name_t *proc);
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc);
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc);
static int update_pidmap(opal_byte_object_t *bo);
static int update_nidmap(opal_byte_object_t *bo);
/*
 * Function-pointer table handed to the ess framework; entry order
 * must match orte_ess_base_module_t.
 */
orte_ess_base_module_t orte_ess_pmi_module = {
    rte_init,
    rte_finalize,
    rte_abort,
    proc_get_locality,
    proc_get_daemon,
    proc_get_hostname,
    proc_get_local_rank,
    proc_get_node_rank,
    orte_ess_base_proc_get_epoch, /* proc_get_epoch - use the base default */
    update_pidmap,
    update_nidmap,
    NULL /* ft_event - fault tolerance not supported */
};
static bool app_init_complete=false;
static int pmi_maxlen=0;
static char* pmi_error(int pmi_err);
/* Log a PMI error with source location and a human-readable message.
 * NOTE: the do/while wrapper deliberately has NO trailing semicolon
 * so the macro behaves as a single statement (the caller supplies
 * the ';'); the original trailing ';' would break if/else usage. */
#define ORTE_PMI_ERROR(pmi_err, pmi_func)                       \
    do {                                                        \
        opal_output(0, "%s[%s:%d:%s] %s: %s\n",                 \
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),         \
                    __FILE__, __LINE__, __func__,               \
                    pmi_func, pmi_error(pmi_err));              \
    } while(0)
/**** MODULE FUNCTIONS ****/
static int rte_init(void)
{
int ret, i, j;
char *error = NULL, *localj;
int32_t jobfam, stepid;
char *envar;
uint64_t unique_key[2];
char *cs_env, *string_key;
char *pmi_id=NULL;
orte_nid_t *nid;
orte_jmap_t *jmap;
orte_pmap_t *pmap;
int *ranks;
/* run the prolog */
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
error = "orte_ess_base_std_prolog";
goto error;
}
/* get our PMI id length */
if (PMI_SUCCESS != (ret = PMI_Get_id_length_max(&pmi_maxlen))) {
error = "PMI_Get_id_length_max";
goto error;
}
pmi_id = malloc(pmi_maxlen);
if (PMI_SUCCESS != (ret = PMI_Get_kvs_domain_id(pmi_id, pmi_maxlen))) {
free(pmi_id);
error = "PMI_Get_kvs_domain_id";
goto error;
}
/* PMI is very nice to us - the domain id is an integer followed
* by a '.', followed by essentially a stepid. The first integer
* defines an overall job number. The second integer is the number of
* individual jobs we have run within that allocation. So we translate
* this as the overall job number equating to our job family, and
* the individual number equating to our local jobid
*/
jobfam = strtol(pmi_id, &localj, 10);
if (NULL == localj) {
/* hmmm - no '.', so let's just use zero */
stepid = 0;
} else {
localj++; /* step over the '.' */
stepid = strtol(localj, NULL, 10) + 1; /* add one to avoid looking like a daemon */
}
free(pmi_id);
/* now build the jobid */
ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid);
/* get our rank */
if (PMI_SUCCESS != (ret = PMI_Get_rank(&i))) {
ORTE_PMI_ERROR(ret, "PMI_Get_rank");
error = "could not get PMI rank";
goto error;
}
ORTE_PROC_MY_NAME->vpid = i;
/* complete definition of process name */
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);
/* setup transport keys in case the MPI layer needs them -
* we can use the jobfam and stepid as unique keys
* because they are unique values assigned by the RM
*/
unique_key[0] = (uint64_t)jobfam;
unique_key[1] = (uint64_t)stepid;
if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
asprintf(&envar, "%s=%s", cs_env, string_key);
putenv(envar);
/* cannot free the envar as that messes up our environ */
free(cs_env);
free(string_key);
/* get the number of procs */
if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&i))) {
ORTE_PMI_ERROR(ret, "PMI_Get_universe_size");
error = "could not get PMI universe size";
goto error;
}
orte_process_info.num_procs = i;
/* set max procs */
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
/* get our app_context number */
if (PMI_SUCCESS != (ret = PMI_Get_appnum(&i))) {
ORTE_PMI_ERROR(ret, "PMI_Get_appnum");
error = "could not get PMI appnum";
goto error;
}
orte_process_info.app_num = i;
/* setup the nidmap arrays - they will be filled by the modex */
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_util_nidmap_init";
goto error;
}
/* initialize our entry */
if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) {
ORTE_ERROR_LOG(ret);
error = "orte_util_setup_local_nidmap_entries";
goto error;
}
/* correct the daemon entry on our nidmap object - note that
* each proc's nidmap will be different, but the only thing that
* matters here (since we are not routing messages) is that
* we know which procs are on the same nodes
*/
nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, 0);
nid->daemon = 0;
/* get the job map for this job */
jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, 0);
/* update the num procs */
jmap->num_procs = orte_process_info.num_procs;
/* set the size of the pidmap storage so we minimize realloc's */
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&jmap->pmap, jmap->num_procs))) {
ORTE_ERROR_LOG(ret);
error = "could not set array size for pidmap";
goto error;
}
/* get my pidmap entry */
pmap = (orte_pmap_t*)opal_pointer_array_get_item(&jmap->pmap, ORTE_PROC_MY_NAME->vpid);
/* get our local proc info to find our local rank */
if (PMI_SUCCESS != (ret = PMI_Get_clique_size(&i))) {
ORTE_PMI_ERROR(ret, "PMI_Get_clique_size");
error = "could not get PMI clique size";
goto error;
}
ranks = (int*)malloc(i * sizeof(int));
if (PMI_SUCCESS != (ret = PMI_Get_clique_ranks(ranks, i))) {
ORTE_PMI_ERROR(ret, "PMI_Get_clique_ranks");
error = "could not get clique ranks";
goto error;
}
/* cycle thru the array until we find our rank */
for (j=0; j < i; j++) {
if (ranks[j] == (int)ORTE_PROC_MY_NAME->vpid) {
pmap->local_rank = j;
pmap->node_rank = j;
break;
}
}
free(ranks);
/* ensure we pick the correct critical components */
putenv("OMPI_MCA_grpcomm=pmi");
putenv("OMPI_MCA_routed=direct");
/* now use the default procedure to finish my setup */
if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
ORTE_ERROR_LOG(ret);
error = "orte_ess_base_app_setup";
goto error;
}
/* flag that we completed init */
app_init_complete = true;
return ORTE_SUCCESS;
error:
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret);
return ret;
}
/*
 * Tear down the runtime: run the standard app finalize (only if init
 * completed), strip our pushed envars, and release the nid/job maps.
 */
static int rte_finalize(void)
{
    int rc = ORTE_SUCCESS;

    if (app_init_complete) {
        /* use the default procedure to finish */
        rc = orte_ess_base_app_finalize();
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
        }
    }

    /* remove the envars that we pushed into environ
     * so we leave that structure intact */
    unsetenv("OMPI_MCA_grpcomm");
    unsetenv("OMPI_MCA_routed");
    unsetenv("OMPI_MCA_orte_precondition_transports");

    /* deconstruct my nidmap and jobmap arrays - this
     * function protects itself from being called
     * before things were initialized */
    orte_util_nidmap_finalize();

    return rc;
}
/*
 * Abort via the standard app-level procedure; declared
 * __opal_attribute_noreturn__ above, so this never returns.
 */
static void rte_abort(int error_code, bool report)
{
    orte_ess_base_app_abort(error_code, report);
}
/*
 * Report the locality of the given proc relative to us: procs whose
 * nid entry carries the same daemon vpid as ours are treated as
 * sharing the node; anything else (including lookup failure) is
 * reported non-local.
 */
static uint8_t proc_get_locality(orte_process_name_t *proc)
{
    orte_nid_t *nid;

    if (NULL == (nid = orte_util_lookup_nid(proc))) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                             "%s ess:pmi: proc %s NID NOT FOUND",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return OPAL_PROC_NON_LOCAL;
    }

    if (nid->daemon == ORTE_PROC_MY_DAEMON->vpid) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                             "%s ess:pmi: proc %s is LOCAL",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* same node: flag all shared-locality levels we can assert */
        return (OPAL_PROC_ON_NODE | OPAL_PROC_ON_CU | OPAL_PROC_ON_CLUSTER);
    }

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                         "%s ess:pmi: proc %s is REMOTE",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));
    return OPAL_PROC_NON_LOCAL;
}
/*
 * Return the vpid of the daemon hosting the given proc.  Daemons
 * host themselves; for application procs, consult the nidmap.
 * Returns ORTE_VPID_INVALID when the proc has no nid entry.
 */
static orte_vpid_t proc_get_daemon(orte_process_name_t *proc)
{
    orte_nid_t *nid;

    if (ORTE_JOBID_IS_DAEMON(proc->jobid)) {
        /* a daemon is its own host */
        return proc->vpid;
    }

    nid = orte_util_lookup_nid(proc);
    if (NULL == nid) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                             "%s ess:pmi: proc %s NID NOT FOUND",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        return ORTE_VPID_INVALID;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                         "%s ess:pmi: proc %s is hosted by daemon %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         ORTE_VPID_PRINT(nid->daemon)));
    return nid->daemon;
}
/*
 * Return the hostname recorded for the given proc, or NULL if no nid
 * entry exists.  The returned string points into the nidmap entry -
 * callers must not free it.
 */
static char* proc_get_hostname(orte_process_name_t *proc)
{
    orte_nid_t *nid;

    if (NULL == (nid = orte_util_lookup_nid(proc))) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                             "%s ess:pmi: proc %s NID NOT FOUND",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return NULL;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                         "%s ess:pmi: proc %s is on host %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         nid->name));
    return nid->name;
}
/*
 * Return the local rank of the given proc from its pidmap entry, or
 * ORTE_LOCAL_RANK_INVALID if the proc has no entry.  NOTE(review):
 * the "NID NOT FOUND" message text actually reports a failed *pmap*
 * lookup here.
 */
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
{
    orte_pmap_t *pmap;

    if (NULL == (pmap = orte_util_lookup_pmap(proc))) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                             "%s ess:pmi: proc %s NID NOT FOUND",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_LOCAL_RANK_INVALID;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                         "%s ess:pmi: proc %s has local rank %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         (int)pmap->local_rank));
    return pmap->local_rank;
}
/*
 * Return the node rank of the given proc from its pidmap entry, or
 * ORTE_NODE_RANK_INVALID if the proc has no entry.  Unlike the
 * local-rank getter, a miss here is not treated as an error worth
 * logging via ORTE_ERROR_LOG.
 */
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
    orte_pmap_t *pmap;

    if (NULL == (pmap = orte_util_lookup_pmap(proc))) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                             "%s ess:pmi: proc %s PMAP NOT FOUND",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        return ORTE_NODE_RANK_INVALID;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                         "%s ess:pmi: proc %s has node rank %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         (int)pmap->node_rank));
    return pmap->node_rank;
}
/*
 * Decode an incoming pidmap byte object into our local pidmap
 * storage.  Returns whatever the decode utility returns.
 */
static int update_pidmap(opal_byte_object_t *bo)
{
    int rc;

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                         "%s ess:pmi: updating pidmap",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* build the pmap */
    rc = orte_util_decode_pidmap(bo);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}
/*
 * Decode an incoming nodemap byte object - the util knows the wire
 * format.  Returns whatever the decode utility returns.
 */
static int update_nidmap(opal_byte_object_t *bo)
{
    int ret;

    ret = orte_util_decode_nodemap(bo);
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
    }
    return ret;
}
/* useful util: translate a PMI error code into a human-readable
 * string for diagnostics.  Returns pointers to string literals -
 * callers must not free them. */
static char* pmi_error(int pmi_err)
{
    char * err_msg;

    switch(pmi_err) {
    case PMI_FAIL: err_msg = "Operation failed"; break;
    case PMI_ERR_INIT: err_msg = "PMI is not initialized"; break;
    case PMI_ERR_NOMEM: err_msg = "Input buffer not large enough"; break;
    case PMI_ERR_INVALID_ARG: err_msg = "Invalid argument"; break;
    case PMI_ERR_INVALID_KEY: err_msg = "Invalid key argument"; break;
    case PMI_ERR_INVALID_KEY_LENGTH: err_msg = "Invalid key length argument"; break;
    case PMI_ERR_INVALID_VAL: err_msg = "Invalid value argument"; break;
    case PMI_ERR_INVALID_VAL_LENGTH: err_msg = "Invalid value length argument"; break;
    case PMI_ERR_INVALID_LENGTH: err_msg = "Invalid length argument"; break;
    case PMI_ERR_INVALID_NUM_ARGS: err_msg = "Invalid number of arguments"; break;
    case PMI_ERR_INVALID_ARGS: err_msg = "Invalid args argument"; break;
    case PMI_ERR_INVALID_NUM_PARSED: err_msg = "Invalid num_parsed length argument"; break;
    /* fixed typo: was "Invalid invalid keyvalp atgument" */
    case PMI_ERR_INVALID_KEYVALP: err_msg = "Invalid keyvalp argument"; break;
    case PMI_ERR_INVALID_SIZE: err_msg = "Invalid size argument"; break;
    case PMI_ERR_INVALID_KVS: err_msg = "Invalid kvs argument"; break;
    case PMI_SUCCESS: err_msg = "Success"; break;
    default: err_msg = "Unknown error";   /* fixed typo: was "Unkown" */
    }
    return err_msg;
}

Просмотреть файл

@ -16,8 +16,6 @@
# $HEADER$
#
AM_CPPFLAGS = $(ess_slurmd_pmi_CPPFLAGS)
dist_pkgdata_DATA = help-ess-slurmd.txt
sources = \
@ -40,10 +38,10 @@ endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_ess_slurmd_la_SOURCES = $(sources)
mca_ess_slurmd_la_LDFLAGS = -module -avoid-version $(ess_slurmd_LDFLAGS) $(ess_slurmd_pmi_LDFLAGS)
mca_ess_slurmd_la_LIBADD = $(ess_slurmd_LIBS) $(ess_slurmd_pmi_LIBS)
mca_ess_slurmd_la_LDFLAGS = -module -avoid-version $(ess_slurmd_LDFLAGS)
mca_ess_slurmd_la_LIBADD = $(ess_slurmd_LIBS)
noinst_LTLIBRARIES = $(component_noinst)
libmca_ess_slurmd_la_SOURCES =$(sources)
libmca_ess_slurmd_la_LDFLAGS = -module -avoid-version $(ess_slurmd_LDFLAGS) $(ess_slurmd_pmi_LDFLAGS)
libmca_ess_slurmd_la_LIBADD = $(ess_slurmd_LIBS) $(ess_slurmd_pmi_LIBS)
libmca_ess_slurmd_la_LDFLAGS = -module -avoid-version $(ess_slurmd_LDFLAGS)
libmca_ess_slurmd_la_LIBADD = $(ess_slurmd_LIBS)

Просмотреть файл

@ -39,20 +39,4 @@ AC_DEFUN([MCA_orte_ess_slurmd_CONFIG],[
AC_SUBST([ess_slurmd_LDFLAGS])
AC_SUBST([ess_slurmd_LIBS])
# see if PMI support also requested
ORTE_CHECK_PMI([ess_slurmd_pmi], [ess_slurmd_pmi_good=1], [ess_slurmd_pmi_good=0])
# if check worked, set wrapper flags if so.
# Evaluate succeed / fail
AS_IF([test "$ess_slurmd_pmi_good" = 1],
[ess_slurmd_WRAPPER_EXTRA_LDFLAGS="$ess_slurmd_LDFLAGS $ess_slurmd_pmi_LDFLAGS"
ess_slurmd_WRAPPER_EXTRA_LIBS="$ess_slurmd_LIBS $ess_slurmd_pmi_LIBS"
],
[])
# set build flags to use in makefile
AC_SUBST([ess_slurmd_pmi_CPPFLAGS])
AC_SUBST([ess_slurmd_pmi_LDFLAGS])
AC_SUBST([ess_slurmd_pmi_LIBS])
])dnl

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -25,10 +26,6 @@
#include "orte_config.h"
#include "orte/constants.h"
#if WANT_PMI_SUPPORT
#include <pmi.h>
#endif
#include "orte/util/proc_info.h"
#include "orte/mca/ess/ess.h"
@ -82,17 +79,6 @@ int orte_ess_slurmd_component_query(mca_base_module_t **module, int *priority)
NULL != getenv("SLURM_JOBID") &&
NULL != getenv("SLURM_STEPID") &&
NULL == orte_process_info.my_hnp_uri) {
#if WANT_PMI_SUPPORT
{
int spawned;
/* if we can't startup the PMI, we can't be used */
if (PMI_SUCCESS != PMI_Init(&spawned)) {
*priority = -1;
*module = NULL;
return ORTE_ERROR;
}
}
#endif
*priority = 30;
*module = (mca_base_module_t *)&orte_ess_slurmd_module;
return ORTE_SUCCESS;

Просмотреть файл

@ -34,13 +34,9 @@
#ifdef HAVE_IFADDRS_H
#include <ifaddrs.h>
#endif
#if WANT_PMI_SUPPORT
#include <pmi.h>
#endif
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/util/opal_sos.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
#include "opal/class/opal_pointer_array.h"
@ -130,7 +126,6 @@ static int rte_init(void)
goto error;
}
/* Only application procs can use this module. Since we
* were directly launched by srun, we need to bootstrap
* our own global info so we can startup. Srun will have
@ -179,63 +174,56 @@ static int rte_init(void)
free(cs_env);
free(string_key);
#if WANT_PMI_SUPPORT
/* get our rank from PMI */
if (PMI_SUCCESS != PMI_Get_rank(&i)) {
error = "PMI_Get_rank failed";
/* get my local nodeid */
if (NULL == (envar = getenv("SLURM_NODEID"))) {
error = "could not get SLURM_NODEID";
goto error;
}
ORTE_PROC_MY_NAME->vpid = i;
#else
nodeid = strtol(envar, NULL, 10);
ORTE_PROC_MY_DAEMON->jobid = 0;
ORTE_PROC_MY_DAEMON->vpid = nodeid;
ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch);
/* get the node list */
if (NULL == (regexp = getenv("SLURM_STEP_NODELIST"))) {
error = "could not get SLURM_STEP_NODELIST";
goto error;
}
/* break that down into a list of nodes */
if (ORTE_SUCCESS != (ret = discover_nodes(regexp, &nodes))) {
error = "could not parse node list";
goto error;
}
num_nodes = opal_argv_count(nodes);
orte_process_info.num_nodes = num_nodes;
/* setup the nidmap arrays */
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_util_nidmap_init";
goto error;
}
/* set the size of the nidmap storage so we minimize realloc's */
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&orte_nidmap, orte_process_info.num_nodes))) {
error = "could not set pointer array size for nidmap";
goto error;
}
/* get the slurm procid - this will be our vpid */
if (NULL == (envar = getenv("SLURM_PROCID"))) {
error = "could not get SLURM_PROCID";
goto error;
}
ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10);
#endif
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);
/* get our local rank */
if (NULL == (envar = getenv("SLURM_LOCALID"))) {
error = "could not get SLURM_LOCALID";
goto error;
}
local_rank = strtol(envar, NULL, 10);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s local rank %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
local_rank));
#if WANT_PMI_SUPPORT
if (PMI_SUCCESS != PMI_Get_universe_size(&i)) {
error = "PMI_Get_universe_size failed";
goto error;
}
orte_process_info.num_procs = i;
#else
/* get the number of procs in this job */
if (NULL == (envar = getenv("SLURM_STEP_NUM_TASKS"))) {
error = "could not get SLURM_STEP_NUM_TASKS";
goto error;
}
orte_process_info.num_procs = strtol(envar, NULL, 10);
#endif
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
#if WANT_PMI_SUPPORT
if (PMI_SUCCESS != PMI_Get_appnum(&i)) {
error = "PMI_Get_appnum failed";
goto error;
}
orte_process_info.app_num = i;
#else
/* set the app_num so that MPI attributes get set correctly */
orte_process_info.app_num = 1;
#endif
/* if this is SLURM 2.0 or above, get our port
* assignments for use in the OOB
*/
@ -250,18 +238,8 @@ static int rte_init(void)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
envar));
}
/* get my local nodeid */
if (NULL == (envar = getenv("SLURM_NODEID"))) {
error = "could not get SLURM_NODEID";
goto error;
}
nodeid = strtol(envar, NULL, 10);
ORTE_PROC_MY_DAEMON->jobid = 0;
ORTE_PROC_MY_DAEMON->vpid = nodeid;
ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch);
/* get the number of ppn */
/* get the number of tasks/node */
if (NULL == (tasks_per_node = getenv("SLURM_STEP_TASKS_PER_NODE"))) {
error = "could not get SLURM_STEP_TASKS_PER_NODE";
goto error;
@ -278,19 +256,6 @@ static int rte_init(void)
cpus_per_task = 1;
}
/* get the node list */
if (NULL == (regexp = getenv("SLURM_STEP_NODELIST"))) {
error = "could not get SLURM_STEP_NODELIST";
goto error;
}
/* break that down into a list of nodes */
if (ORTE_SUCCESS != (ret = discover_nodes(regexp, &nodes))) {
error = "could not parse node list";
goto error;
}
num_nodes = opal_argv_count(nodes);
orte_process_info.num_nodes = num_nodes;
/* compute the ppn */
if (ORTE_SUCCESS != (ret = orte_regex_extract_ppn(num_nodes, tasks_per_node, &ppn))) {
error = "could not determine #procs on each node";
@ -314,24 +279,7 @@ static int rte_init(void)
error = "distribution/mapping mode not supported";
goto error;
}
#if 0
SLURM_DIST_PLANESIZE=0
SLURM_DIST_LLLP=
#endif
/* setup the nidmap arrays */
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_util_nidmap_init";
goto error;
}
/* set the size of the nidmap storage so we minimize realloc's */
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&orte_nidmap, orte_process_info.num_nodes))) {
error = "could not set pointer array size for nidmap";
goto error;
}
/* construct the nidmap */
for (i=0; i < num_nodes; i++) {
node = OBJ_NEW(orte_nid_t);
@ -352,7 +300,7 @@ static int rte_init(void)
/* set the size of the pidmap storage so we minimize realloc's */
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&jmap->pmap, jmap->num_procs))) {
ORTE_ERROR_LOG(ret);
error = "could not set value array size for pidmap";
error = "could not set array size for pidmap";
goto error;
}
@ -415,14 +363,29 @@ static int rte_init(void)
}
}
free(ppn);
/* ensure we pick the correct critical components */
#if WANT_PMI_SUPPORT
putenv("OMPI_MCA_grpcomm=pmi");
#else
putenv("OMPI_MCA_grpcomm=hier");
#endif
putenv("OMPI_MCA_routed=direct");
/* complete definition of process name */
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);
/* get our local rank */
if (NULL == (envar = getenv("SLURM_LOCALID"))) {
error = "could not get SLURM_LOCALID";
goto error;
}
local_rank = strtol(envar, NULL, 10);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s local rank %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
local_rank));
/* set max procs */
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
/* now use the default procedure to finish my setup */
if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
@ -466,14 +429,13 @@ static int rte_finalize(void)
* before things were initialized
*/
orte_util_nidmap_finalize();
return ret;
}
static void rte_abort(int error_code, bool report)
{
if (ORTE_ERR_SOCKET_NOT_AVAILABLE == OPAL_SOS_GET_ERROR_CODE(error_code) &&
slurm20) {
if (ORTE_ERR_SOCKET_NOT_AVAILABLE == error_code && slurm20) {
/* exit silently with a special error code for slurm 2.0 */
orte_ess_base_app_abort(108, false);
} else {

Просмотреть файл

@ -16,6 +16,7 @@
#include "opal/mca/mca.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "grpcomm_pmi.h"
@ -49,20 +50,44 @@ int orte_grpcomm_pmi_open(void)
/*
 * Component close hook.
 *
 * If this component probed PMI during selection but was not selected,
 * PMI may have been left initialized; shut it down here so the PMI
 * connection is not leaked.  Always reports success.
 */
int orte_grpcomm_pmi_close(void)
{
PMI_BOOL initialized;
/* if we weren't selected, cleanup if necessary */
if (PMI_SUCCESS == PMI_Initialized(&initialized) &&
PMI_TRUE == initialized) {
PMI_Finalize();
}
return ORTE_SUCCESS;
}
int orte_grpcomm_pmi_component_query(mca_base_module_t **module, int *priority)
{
int spawned;
/* if we can't startup the PMI, we can't be used */
if (PMI_SUCCESS != PMI_Init(&spawned)) {
*priority = -1;
*module = NULL;
return ORTE_ERROR;
PMI_BOOL initialized;
/* for now, only use PMI when direct launched */
if (!ORTE_PROC_IS_HNP &&
NULL == orte_process_info.my_hnp_uri &&
PMI_SUCCESS == PMI_Initialized(&initialized)) {
/* if we aren't already initialized, then try */
if (PMI_TRUE != initialized) {
/* if we can't startup the PMI, we can't be used */
if (PMI_SUCCESS != PMI_Init(&spawned)) {
*priority = -1;
*module = NULL;
return ORTE_ERROR;
}
}
/* if we were able to startup PMI, or it was already
* running, then use us
*/
*priority = 100;
*module = (mca_base_module_t *)&orte_grpcomm_pmi_module;
return ORTE_SUCCESS;
}
/* we are a default, so set a low priority so we can be overridden */
*priority = 1;
*module = (mca_base_module_t *)&orte_grpcomm_pmi_module;
return ORTE_SUCCESS;
/* we can't run */
*priority = -1;
*module = NULL;
return ORTE_ERROR;
}

Просмотреть файл

@ -18,6 +18,7 @@
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
@ -58,53 +59,39 @@ orte_grpcomm_base_module_t orte_grpcomm_pmi_module = {
purge_proc_attrs
};
/* useful util */
/* Translate a PMI error code into a human-readable message.
 *
 * @param pmi_err  status code returned by a PMI_* call
 * @return pointer to a static string literal describing the error;
 *         never NULL, so safe to pass directly to printf-style output
 */
static char* orte_pmi_error(int pmi_err) {
    char * err_msg;

    switch(pmi_err) {
        case PMI_FAIL: err_msg = "Operation failed"; break;
        case PMI_ERR_INIT: err_msg = "PMI is not initialized"; break;
        case PMI_ERR_NOMEM: err_msg = "Input buffer not large enough"; break;
        case PMI_ERR_INVALID_ARG: err_msg = "Invalid argument"; break;
        case PMI_ERR_INVALID_KEY: err_msg = "Invalid key argument"; break;
        case PMI_ERR_INVALID_KEY_LENGTH: err_msg = "Invalid key length argument"; break;
        case PMI_ERR_INVALID_VAL: err_msg = "Invalid value argument"; break;
        case PMI_ERR_INVALID_VAL_LENGTH: err_msg = "Invalid value length argument"; break;
        case PMI_ERR_INVALID_LENGTH: err_msg = "Invalid length argument"; break;
        case PMI_ERR_INVALID_NUM_ARGS: err_msg = "Invalid number of arguments"; break;
        case PMI_ERR_INVALID_ARGS: err_msg = "Invalid args argument"; break;
        case PMI_ERR_INVALID_NUM_PARSED: err_msg = "Invalid num_parsed length argument"; break;
        /* fixed message: was "Invalid invalid keyvalp atgument" */
        case PMI_ERR_INVALID_KEYVALP: err_msg = "Invalid keyvalp argument"; break;
        case PMI_ERR_INVALID_SIZE: err_msg = "Invalid size argument"; break;
        case PMI_ERR_INVALID_KVS: err_msg = "Invalid kvs argument"; break;
        case PMI_SUCCESS: err_msg = "Success"; break;
        /* fixed spelling: was "Unkown error" */
        default: err_msg = "Unknown error";
    }
    return err_msg;
}
static char* pmi_encode(const void *val, size_t vallen);
static void* pmi_decode(unsigned char *val, size_t *retlen);
/* Local variables */
static char *pmi_kvs_name = NULL;
static int pmi_vallen_max = -1;
static int pmi_encode(const void *val, size_t vallen);
static void* pmi_decode(size_t *retlen);
static char* pmi_error(int pmi_err);
#define ORTE_PMI_ERROR(pmi_err, pmi_func) \
do { \
opal_output(0, "%s[%s:%d:%s] %s: %s\n", \
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
__FILE__, __LINE__, __func__, \
pmi_func, orte_pmi_error(pmi_err)); \
pmi_func, pmi_error(pmi_err)); \
} while(0);
static int setup_pmi(void);
static int setup_key(const orte_process_name_t *name, const char *key);
/* Local variables */
static char *pmi_kvs_name = NULL;
static char *pmi_kvs_key = NULL;
static char *pmi_attr_val = NULL;
static int pmi_vallen_max = -1;
static int pmi_keylen_max = -1;
/**
* Initialize the module
*/
/* Module init: lazily establish the PMI KVS state on first use.
 *
 * setup_pmi() allocates the KVS-name/key/value buffers and queries the
 * PMI limits; once pmi_kvs_name is non-NULL that work is already done.
 *
 * @return ORTE_SUCCESS, or the error code from setup_pmi()
 */
static int init(void)
{
    int rc;

    /* already set up on a previous call - nothing to do */
    if (NULL != pmi_kvs_name) {
        return ORTE_SUCCESS;
    }

    rc = setup_pmi();
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}
@ -113,6 +100,18 @@ static int init(void)
*/
/* Module finalize: release the buffers allocated by setup_pmi().
 *
 * free(NULL) is a no-op per the C standard, so the previous
 * "if (NULL != ptr) free(ptr)" guards were redundant and are removed.
 * Pointers are reset to NULL so a later init() will re-run setup_pmi()
 * and so a double finalize is harmless.
 */
static void finalize(void)
{
    free(pmi_kvs_name);
    pmi_kvs_name = NULL;
    free(pmi_kvs_key);
    pmi_kvs_key = NULL;
    free(pmi_attr_val);
    pmi_attr_val = NULL;
}
@ -177,35 +176,18 @@ static int pmi_allgather_list(opal_list_t *names,
static int pmi_set_proc_attr(const char* attr_name,
const void *buffer, size_t size)
{
char *attr, *attrval;
int rc;
if (NULL == pmi_kvs_name) {
int max_length;
rc = PMI_KVS_Get_value_length_max(&pmi_vallen_max);
if (PMI_SUCCESS != rc) {
ORTE_PMI_ERROR(rc, "PMI_Get_value_length_max");
return ORTE_ERROR;
}
if (PMI_SUCCESS != (rc = PMI_KVS_Get_name_length_max(&max_length))) {
ORTE_PMI_ERROR(rc, "PMI_KVS_Get_name_length_max");
return ORTE_ERROR;
}
pmi_kvs_name = malloc(max_length);
if (NULL == pmi_kvs_name) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
rc = PMI_KVS_Get_my_name(pmi_kvs_name,max_length);
if (PMI_SUCCESS != rc) {
ORTE_PMI_ERROR(rc, "PMI_KVS_Get_my_name");
return ORTE_ERROR;
}
if (ORTE_SUCCESS != (rc = setup_pmi())) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
if (0 > asprintf(&attr, "%s-%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), attr_name)) {
return ORTE_ERR_OUT_OF_RESOURCE;
if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, attr_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
@ -213,25 +195,17 @@ static int pmi_set_proc_attr(const char* attr_name,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), attr_name,
(unsigned long)size, pmi_kvs_name));
attrval = pmi_encode(buffer, size);
if (NULL == attrval) {
return ORTE_ERR_OUT_OF_RESOURCE;
if (ORTE_SUCCESS != (rc = pmi_encode(buffer, size))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (strlen(attrval) > (size_t)pmi_vallen_max) {
opal_output(0, "pmi_proc_set_attr: attribute length is too long\n");
return ORTE_ERROR;
}
rc = PMI_KVS_Put(pmi_kvs_name, attr, attrval);
rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, pmi_attr_val);
if (PMI_SUCCESS != rc) {
ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
return ORTE_ERROR;
}
free(attr);
free(attrval);
return ORTE_SUCCESS;
}
@ -239,11 +213,17 @@ static int pmi_get_proc_attr(const orte_process_name_t name,
const char* attr_name,
void **buffer, size_t *size)
{
char *attrval, *attr;
int rc;
/* set default */
*size = 0;
*buffer = NULL;
if (NULL == pmi_kvs_name) {
return ORTE_ERR_UNREACH;
if (ORTE_SUCCESS != (rc = setup_pmi())) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
@ -251,70 +231,112 @@ static int pmi_get_proc_attr(const orte_process_name_t name,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), attr_name,
ORTE_NAME_PRINT(&name), pmi_kvs_name));
attrval = malloc(pmi_vallen_max);
if (NULL == attrval) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > asprintf(&attr, "%s-%s", ORTE_NAME_PRINT(&name), attr_name)) {
free(attrval);
return ORTE_ERR_OUT_OF_RESOURCE;
if (ORTE_SUCCESS != (rc = setup_key(&name, attr_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = PMI_KVS_Get(pmi_kvs_name, attr, attrval, pmi_vallen_max);
rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
if (PMI_SUCCESS != rc) {
ORTE_PMI_ERROR(rc, "PMI_KVS_Get");
free(attrval);
free(attr);
return ORTE_ERROR;
}
*buffer = pmi_decode((unsigned char *)attrval, size);
free(attrval);
free(attr);
*buffer = pmi_decode(size);
if (NULL == *buffer) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
"%s grpcomm:pmi: got attr %s of size %lu",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
attr_name, (unsigned long)(*size)));
if (NULL == buffer) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
return ORTE_SUCCESS;
}
/*** MODEX SECTION ***/
static int modex(opal_list_t *procs)
{
int rc;
char *rml_uri, *attr;
int rc, i;
char *rml_uri, val[64];
orte_vpid_t v;
orte_process_name_t name;
orte_jmap_t *jmap;
orte_nid_t *nid, *loc;
orte_pmap_t *pmap;
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
"%s grpcomm:pmi: modex entered",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (NULL == pmi_kvs_name) {
if (ORTE_SUCCESS != (rc = setup_pmi())) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* provide our hostname so others can know our location */
if (strlen(orte_process_info.nodename) > (size_t)pmi_vallen_max) {
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
}
if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "HOSTNAME"))) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, orte_process_info.nodename);
if (PMI_SUCCESS != rc) {
ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
return ORTE_ERROR;
}
/* add our oob endpoint info so that oob communications
* can be supported
*/
rml_uri = orte_rml.get_contact_info();
if (strlen(rml_uri) > (size_t)pmi_vallen_max) {
opal_output(0, "grpcomm:pmi: RML uri length is too long\n");
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
return ORTE_ERROR;
}
if (0 > asprintf(&attr, "%s-RMLURI", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))) {
free(rml_uri);
return ORTE_ERR_OUT_OF_RESOURCE;
if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "RMLURI"))) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = PMI_KVS_Put(pmi_kvs_name, attr, rml_uri);
rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, rml_uri);
if (PMI_SUCCESS != rc) {
ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
free(rml_uri);
free(attr);
return ORTE_ERROR;
}
free(rml_uri);
free(attr);
/* get the job map for this job */
jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, 0);
/* get my pidmap entry */
pmap = (orte_pmap_t*)opal_pointer_array_get_item(&jmap->pmap, ORTE_PROC_MY_NAME->vpid);
/* add our locality info */
if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "LOCALRANK"))) {
ORTE_ERROR_LOG(rc);
return rc;
}
snprintf(val, 64, "%lu", (unsigned long)pmap->local_rank);
rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, val);
if (PMI_SUCCESS != rc) {
ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
return ORTE_ERROR;
}
if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "NODERANK"))) {
ORTE_ERROR_LOG(rc);
return rc;
}
snprintf(val, 64, "%lu", (unsigned long)pmap->node_rank);
rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, val);
if (PMI_SUCCESS != rc) {
ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
return ORTE_ERROR;
}
/* commit our modex info */
if (PMI_SUCCESS != (rc = PMI_KVS_Commit(pmi_kvs_name))) {
@ -327,42 +349,108 @@ static int modex(opal_list_t *procs)
return rc;
}
/* harvest the oob endpoint info for all other procs
* in our job so oob wireup can be completed
/* harvest the oob endpoint info and hostname for all other procs
* in our job so oob wireup can be completed and we
* can setup their nidmap/pidmap
*/
rml_uri = malloc(pmi_vallen_max);
if (NULL == rml_uri) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
name.jobid = ORTE_PROC_MY_NAME->jobid;
orte_process_info.num_nodes = 1; /* have to account for mine! */
for (v=0; v < orte_process_info.num_procs; v++) {
if (v == ORTE_PROC_MY_NAME->vpid) {
continue;
}
name.vpid = v;
if (0 > asprintf(&attr, "%s-RMLURI", ORTE_NAME_PRINT(&name))) {
free(rml_uri);
return ORTE_ERR_OUT_OF_RESOURCE;
if (ORTE_SUCCESS != (rc = setup_key(&name, "RMLURI"))) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = PMI_KVS_Get(pmi_kvs_name, attr, rml_uri, pmi_vallen_max);
rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
if (PMI_SUCCESS != rc) {
ORTE_PMI_ERROR(rc, "PMI_KVS_Get");
free(rml_uri);
free(attr);
return ORTE_ERROR;
}
free(attr);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:pmi: proc %s oob endpoint %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&name), rml_uri));
ORTE_NAME_PRINT(&name), pmi_attr_val));
/* set the contact info into the hash table */
if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) {
free(rml_uri);
if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(pmi_attr_val))) {
return rc;
}
if (ORTE_SUCCESS != (rc = setup_key(&name, "HOSTNAME"))) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
if (PMI_SUCCESS != rc) {
ORTE_PMI_ERROR(rc, "PMI_KVS_Get");
return ORTE_ERROR;
}
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:pmi: proc %s location %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&name), pmi_attr_val));
/* see if this node is already in nidmap */
loc = NULL;
for (i=0; i < orte_nidmap.size; i++) {
if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
continue;
}
if (0 == strcmp(pmi_attr_val, nid->name)) {
/* found it */
loc = nid;
break;
}
}
if (NULL == loc) {
/* new node - save it */
loc = OBJ_NEW(orte_nid_t);
loc->name = strdup(pmi_attr_val);
loc->index = opal_pointer_array_add(&orte_nidmap, loc);
loc->daemon = loc->index;
/* keep track */
orte_process_info.num_nodes++;
}
/* see if this proc is already in the pidmap */
if (NULL == opal_pointer_array_get_item(&jmap->pmap, v)) {
/* nope - add it */
pmap = OBJ_NEW(orte_pmap_t);
pmap->node = loc->index;
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(&jmap->pmap, v, pmap))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* get the proc's locality info */
if (ORTE_SUCCESS != (rc = setup_key(&name, "LOCALRANK"))) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
if (PMI_SUCCESS != rc) {
ORTE_PMI_ERROR(rc, "PMI_KVS_Get");
return ORTE_ERROR;
}
pmap->local_rank = (uint16_t)strtoul(pmi_attr_val, NULL, 10);
if (ORTE_SUCCESS != (rc = setup_key(&name, "NODERANK"))) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
if (PMI_SUCCESS != rc) {
ORTE_PMI_ERROR(rc, "PMI_KVS_Get");
return ORTE_ERROR;
}
pmap->node_rank = (uint16_t)strtoul(pmi_attr_val, NULL, 10);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:pmi: proc %s lrank %u nrank %u",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&name),
(unsigned int)pmap->local_rank,
(unsigned int)pmap->node_rank));
}
free(rml_uri);
/* cycle thru the array of our peers and assign local and node ranks */
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
"%s grpcomm:pmi: modex completed",
@ -380,31 +468,33 @@ static int purge_proc_attrs(void)
/* PMI only supports strings. For now, do a simple base16
* encoding. Should do something smarter, both with the
* algorith used and its implementation. */
static char* pmi_encode(const void *val, size_t vallen) {
static int pmi_encode(const void *val, size_t vallen) {
static unsigned char encodings[] = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'};
size_t i;
unsigned char *ret = malloc(vallen *2 +1);
if (NULL == ret) {
return NULL;
/* check for size */
if ((size_t)pmi_vallen_max < ((vallen * 2) + 1)) {
return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
}
for (i = 0; i < vallen; i++) {
ret[2 * i] = encodings[((unsigned char *)val)[i] & 0xf];
ret[2 * i + 1] = encodings[((unsigned char *)val)[i] >> 4];
pmi_attr_val[2 * i] = encodings[((unsigned char *)val)[i] & 0xf];
pmi_attr_val[2 * i + 1] = encodings[((unsigned char *)val)[i] >> 4];
}
ret[vallen *2] = '\0';
return (char *)ret;
pmi_attr_val[vallen * 2] = '\0';
return ORTE_SUCCESS;
}
static void* pmi_decode(unsigned char *val, size_t *retlen) {
unsigned char *ret;
static void* pmi_decode(size_t *retlen) {
unsigned char *ret, *val;
size_t i;
*retlen = strlen((char*)val)/2;
*retlen = strlen(pmi_attr_val)/2;
ret = malloc(*retlen);
if (NULL == ret) {
return ret;
}
val = (unsigned char*)pmi_attr_val;
for (i = 0; i < *retlen; i++) {
if (*val >= '0' && *val <= '9') {
ret[i] = *val - '0';
@ -422,3 +512,77 @@ static void* pmi_decode(unsigned char *val, size_t *retlen) {
return ret;
}
/* useful util */
/* Translate a PMI error code into a human-readable message.
 *
 * Used by the ORTE_PMI_ERROR macro to report failing PMI calls.
 *
 * @param pmi_err  status code returned by a PMI_* call
 * @return pointer to a static string literal describing the error;
 *         never NULL
 */
static char* pmi_error(int pmi_err)
{
    char * err_msg;

    switch(pmi_err) {
        case PMI_FAIL: err_msg = "Operation failed"; break;
        case PMI_ERR_INIT: err_msg = "PMI is not initialized"; break;
        case PMI_ERR_NOMEM: err_msg = "Input buffer not large enough"; break;
        case PMI_ERR_INVALID_ARG: err_msg = "Invalid argument"; break;
        case PMI_ERR_INVALID_KEY: err_msg = "Invalid key argument"; break;
        case PMI_ERR_INVALID_KEY_LENGTH: err_msg = "Invalid key length argument"; break;
        case PMI_ERR_INVALID_VAL: err_msg = "Invalid value argument"; break;
        case PMI_ERR_INVALID_VAL_LENGTH: err_msg = "Invalid value length argument"; break;
        case PMI_ERR_INVALID_LENGTH: err_msg = "Invalid length argument"; break;
        case PMI_ERR_INVALID_NUM_ARGS: err_msg = "Invalid number of arguments"; break;
        case PMI_ERR_INVALID_ARGS: err_msg = "Invalid args argument"; break;
        case PMI_ERR_INVALID_NUM_PARSED: err_msg = "Invalid num_parsed length argument"; break;
        /* fixed message: was "Invalid invalid keyvalp atgument" */
        case PMI_ERR_INVALID_KEYVALP: err_msg = "Invalid keyvalp argument"; break;
        case PMI_ERR_INVALID_SIZE: err_msg = "Invalid size argument"; break;
        case PMI_ERR_INVALID_KVS: err_msg = "Invalid kvs argument"; break;
        case PMI_SUCCESS: err_msg = "Success"; break;
        /* fixed spelling: was "Unkown error" */
        default: err_msg = "Unknown error";
    }
    return err_msg;
}
/* Query PMI for the KVS name and the maximum key/value lengths,
 * and allocate the buffers (pmi_kvs_name, pmi_kvs_key, pmi_attr_val)
 * shared by all subsequent PMI KVS operations.
 *
 * Buffers allocated here are released by finalize(); on a partial
 * failure the already-allocated buffers are likewise cleaned up there.
 *
 * @return ORTE_SUCCESS on success, ORTE_ERROR if a PMI call fails,
 *         or ORTE_ERR_OUT_OF_RESOURCE if an allocation fails
 */
static int setup_pmi(void)
{
    int max_length, rc;

    rc = PMI_KVS_Get_value_length_max(&pmi_vallen_max);
    if (PMI_SUCCESS != rc) {
        /* fixed: error message previously named the wrong function */
        ORTE_PMI_ERROR(rc, "PMI_KVS_Get_value_length_max");
        return ORTE_ERROR;
    }
    pmi_attr_val = malloc(pmi_vallen_max);
    if (NULL == pmi_attr_val) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (PMI_SUCCESS != (rc = PMI_KVS_Get_name_length_max(&max_length))) {
        ORTE_PMI_ERROR(rc, "PMI_KVS_Get_name_length_max");
        return ORTE_ERROR;
    }
    pmi_kvs_name = malloc(max_length);
    if (NULL == pmi_kvs_name) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    rc = PMI_KVS_Get_my_name(pmi_kvs_name,max_length);
    if (PMI_SUCCESS != rc) {
        ORTE_PMI_ERROR(rc, "PMI_KVS_Get_my_name");
        return ORTE_ERROR;
    }

    if (PMI_SUCCESS != (rc = PMI_KVS_Get_key_length_max(&pmi_keylen_max))) {
        ORTE_PMI_ERROR(rc, "PMI_KVS_Get_key_length_max");
        return ORTE_ERROR;
    }
    pmi_kvs_key = malloc(pmi_keylen_max);
    /* bug fix: this allocation was previously unchecked, so a failure
     * would surface later as a NULL-deref inside setup_key() */
    if (NULL == pmi_kvs_key) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/* Build the per-process KVS key "<procname>-<attrname>" into the
 * shared pmi_kvs_key buffer (sized pmi_keylen_max by setup_pmi()).
 *
 * @param name  process the attribute belongs to
 * @param key   attribute name to append
 * @return ORTE_SUCCESS, or ORTE_ERR_VALUE_OUT_OF_BOUNDS if the
 *         composed key would not fit in the PMI key-length limit
 */
static int setup_key(const orte_process_name_t *name, const char *key)
{
    int len;

    len = snprintf(pmi_kvs_key, pmi_keylen_max, "%s-%s",
                   ORTE_NAME_PRINT(name), key);
    /* snprintf reports the length the full string would need;
     * anything >= the buffer size means it was truncated */
    if (len >= pmi_keylen_max) {
        return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
    }
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -187,9 +187,6 @@ int orte_util_setup_local_nidmap_entries(void)
pmap->local_rank = 0;
pmap->node_rank = 0;
node->index = opal_pointer_array_add(&orte_nidmap, node);
/* value array copies values, so everything must be set before
* calling the set_item function
*/
pmap->node = node->index;
opal_pointer_array_set_item(&jmap->pmap, ORTE_PROC_MY_NAME->vpid, pmap);