Cray's PMI implementation is quite different from slurm's - they extended PMI-1 by adding some, but not all, of the PMI-2 APIs. So you can't just switch to using PMI-2 functions as it isn't a complete implementation. Instead, you have to selectively figure out which ones they have in PMI-2, and use any missing ones from PMI-1. What fun.
Modify the configure logic and the PMI components to accommodate Cray's approach. Refactor the PMI error reporting code so it resides in only one place. Cray actually decided -not- to define the PMI-2 error codes, so we have to use the PMI-1 codes instead. More fun. This commit was SVN r25348.
Этот коммит содержится в:
родитель
e2adc8fa3a
Коммит
3e72fccacf
@ -11,24 +11,19 @@
|
||||
#include "ompi/constants.h"
|
||||
|
||||
#include <pmi.h>
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
|
||||
#include "ompi/info/info.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "ompi/mca/pubsub/base/base.h"
|
||||
#include "pubsub_pmi.h"
|
||||
|
||||
static char* pmi_error(int pmi_err);
|
||||
#define ORTE_PMI_ERROR(pmi_err, pmi_func) \
|
||||
do { \
|
||||
opal_output(0, "%s[%s:%d:%s] %s: %s\n", \
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, __func__, \
|
||||
pmi_func, pmi_error(pmi_err)); \
|
||||
} while(0);
|
||||
|
||||
/*
|
||||
* Init the module
|
||||
*/
|
||||
@ -44,11 +39,17 @@ static int publish ( char *service_name, ompi_info_t *info, char *port_name )
|
||||
{
    int rc;

    /* Publish port_name under service_name through PMI.  Cray's PMI does
     * not define the PMI-2 error codes, so the result of the PMI2_* call
     * is compared against the PMI-1 constant PMI_SUCCESS as well. */
#if WANT_CRAY_PMI2_EXT
    if (PMI_SUCCESS != (rc = PMI2_Nameserv_publish(service_name, NULL, port_name))) {
        ORTE_PMI_ERROR(rc, "PMI2_Nameserv_publish");
        return OMPI_ERROR;
    }
#else
    if (PMI_SUCCESS != (rc = PMI_Publish_name(service_name, port_name))) {
        /* label the message with the function that was actually called */
        ORTE_PMI_ERROR(rc, "PMI_Publish_name");
        return OMPI_ERROR;
    }
#endif

    return OMPI_SUCCESS;
}
|
||||
|
||||
@ -57,11 +58,19 @@ static char* lookup ( char *service_name, ompi_info_t *info )
|
||||
char *port=NULL;
|
||||
int rc;
|
||||
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
port = (char*)malloc(1024*sizeof(char)); /* arbitrary size */
|
||||
if (PMI2_SUCCESS != (rc = PMI2_Nameserv_lookup(service_name, NULL, port, 1024))) {
|
||||
ORTE_PMI_ERROR(rc, "PMI2_Nameserv_lookup");
|
||||
free(port);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
#else
|
||||
if (PMI_SUCCESS != (rc = PMI_Lookup_name(service_name, port))) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_Lookup_name");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#endif
|
||||
return port;
|
||||
}
|
||||
|
||||
@ -71,10 +80,17 @@ static int unpublish ( char *service_name, ompi_info_t *info )
|
||||
{
    int rc;

    /* Withdraw service_name from the PMI name service; both variants
     * report success with the PMI-1 constant PMI_SUCCESS. */
#if WANT_CRAY_PMI2_EXT
    if (PMI_SUCCESS != (rc = PMI2_Nameserv_unpublish(service_name, NULL))) {
        ORTE_PMI_ERROR(rc, "PMI2_Nameserv_unpublish");
        return OMPI_ERROR;
    }
#else
    if (PMI_SUCCESS != (rc = PMI_Unpublish_name(service_name))) {
        ORTE_PMI_ERROR(rc, "PMI_Unpublish_name");
        return OMPI_ERROR;
    }
#endif
    return OMPI_SUCCESS;
}
|
||||
|
||||
@ -97,34 +113,3 @@ ompi_pubsub_base_module_t ompi_pubsub_pmi_module = {
|
||||
lookup,
|
||||
finalize
|
||||
};
|
||||
|
||||
|
||||
/* useful util */
|
||||
static char* pmi_error(int pmi_err)
|
||||
{
|
||||
char * err_msg;
|
||||
|
||||
switch(pmi_err) {
|
||||
case PMI_FAIL: err_msg = "Operation failed"; break;
|
||||
case PMI_ERR_INIT: err_msg = "PMI is not initialized"; break;
|
||||
case PMI_ERR_NOMEM: err_msg = "Input buffer not large enough"; break;
|
||||
case PMI_ERR_INVALID_ARG: err_msg = "Invalid argument"; break;
|
||||
case PMI_ERR_INVALID_KEY: err_msg = "Invalid key argument"; break;
|
||||
case PMI_ERR_INVALID_KEY_LENGTH: err_msg = "Invalid key length argument"; break;
|
||||
case PMI_ERR_INVALID_VAL: err_msg = "Invalid value argument"; break;
|
||||
case PMI_ERR_INVALID_VAL_LENGTH: err_msg = "Invalid value length argument"; break;
|
||||
case PMI_ERR_INVALID_LENGTH: err_msg = "Invalid length argument"; break;
|
||||
case PMI_ERR_INVALID_NUM_ARGS: err_msg = "Invalid number of arguments"; break;
|
||||
case PMI_ERR_INVALID_ARGS: err_msg = "Invalid args argument"; break;
|
||||
case PMI_ERR_INVALID_NUM_PARSED: err_msg = "Invalid num_parsed length argument"; break;
|
||||
case PMI_ERR_INVALID_KEYVALP: err_msg = "Invalid invalid keyvalp atgument"; break;
|
||||
case PMI_ERR_INVALID_SIZE: err_msg = "Invalid size argument"; break;
|
||||
#if defined(PMI_ERR_INVALID_KVS)
|
||||
/* pmi.h calls this a valid return code but mpich doesn't define it (slurm does). wtf */
|
||||
case PMI_ERR_INVALID_KVS: err_msg = "Invalid kvs argument"; break;
|
||||
#endif
|
||||
case PMI_SUCCESS: err_msg = "Success"; break;
|
||||
default: err_msg = "Unkown error";
|
||||
}
|
||||
return err_msg;
|
||||
}
|
||||
|
@ -11,6 +11,9 @@
|
||||
#include "ompi/constants.h"
|
||||
|
||||
#include <pmi.h>
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
@ -46,39 +49,62 @@ static int pubsub_pmi_component_open(void)
|
||||
|
||||
static int pubsub_pmi_component_close(void)
|
||||
{
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
if (PMI2_Initialized()) {
|
||||
PMI2_Finalize();
|
||||
}
|
||||
#else
|
||||
PMI_BOOL initialized;
|
||||
|
||||
/* if we weren't selected, cleanup if necessary */
|
||||
/* if we weren't selected, cleanup */
|
||||
if (PMI_SUCCESS == PMI_Initialized(&initialized) &&
|
||||
PMI_TRUE == initialized) {
|
||||
PMI_Finalize();
|
||||
}
|
||||
#endif
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static bool pmi_startup(void)
|
||||
{
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
int spawned, size, rank, appnum;
|
||||
|
||||
if (PMI2_Initialized()) {
|
||||
/* already initialized */
|
||||
return true;
|
||||
}
|
||||
/* if we can't startup PMI, we can't be used */
|
||||
if (PMI_SUCCESS != PMI2_Init(&spawned, &size, &rank, &appnum)) {
|
||||
return false;
|
||||
}
|
||||
/* ignore the info - we'll pick it up elsewhere */
|
||||
return true;
|
||||
#else
|
||||
PMI_BOOL initialized;
|
||||
|
||||
if (PMI_SUCCESS != PMI_Init(&initialized)) {
|
||||
return false;
|
||||
}
|
||||
if (PMI_TRUE != initialized) {
|
||||
if (PMI_SUCCESS != PMI_Init(&initialized)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
static int pubsub_pmi_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
int spawned;
|
||||
PMI_BOOL initialized;
|
||||
|
||||
/* for now, only use PMI when direct launched */
|
||||
if (NULL == orte_process_info.my_hnp_uri &&
|
||||
PMI_SUCCESS == PMI_Initialized(&initialized)) {
|
||||
/* if we aren't already initialized, then try */
|
||||
if (PMI_TRUE != initialized) {
|
||||
/* if we can't startup the PMI, we can't be used */
|
||||
if (PMI_SUCCESS != PMI_Init(&spawned)) {
|
||||
*priority = -1;
|
||||
*module = NULL;
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
}
|
||||
/* if we were able to startup PMI, or it was already
|
||||
* running, then use us
|
||||
*/
|
||||
pmi_startup()) {
|
||||
/* if PMI is available, use it */
|
||||
*priority = 100;
|
||||
*module = (mca_base_module_t *)&ompi_pubsub_pmi_module;
|
||||
return OMPI_SUCCESS;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* we can't run */
|
||||
|
@ -27,8 +27,13 @@ AC_DEFUN([ORTE_CHECK_PMI],[
|
||||
[AC_HELP_STRING([--with-pmi],
|
||||
[Build PMI support (default: no)])],
|
||||
[], with_pmi=no)
|
||||
# the help text must name the same option that is declared here
AC_ARG_WITH([cray-pmi2-ext],
            [AC_HELP_STRING([--with-cray-pmi2-ext],
                            [Include Cray PMI2 extensions (default: no)])],
            [], with_cray_pmi2_ext=no)
|
||||
|
||||
orte_enable_pmi=0
# orte_use_pmi2_ext is the value substituted into WANT_CRAY_PMI2_EXT, so it
# needs a default even on paths that never run the Cray PMI2 check
orte_use_pmi2_ext=0
orte_use_cray_pmi2_ext=0
|
||||
|
||||
# save flags
|
||||
orte_check_pmi_$1_save_CPPFLAGS="$CPPFLAGS"
|
||||
@ -42,7 +47,8 @@ AC_DEFUN([ORTE_CHECK_PMI],[
|
||||
AC_MSG_CHECKING([if user requested PMI support])
|
||||
AS_IF([test "$with_pmi" = "no"],
|
||||
[AC_MSG_RESULT([no])
|
||||
orte_want_pmi_support=no],
|
||||
orte_want_pmi_support=no
|
||||
orte_use_cray_pmi2_ext=0],
|
||||
[AC_MSG_RESULT([yes])
|
||||
orte_want_pmi_support=yes
|
||||
AC_MSG_CHECKING([if PMI support installed])
|
||||
@ -53,13 +59,13 @@ AC_DEFUN([ORTE_CHECK_PMI],[
|
||||
[AS_IF([test -d "$with_pmi/lib64"],
|
||||
[orte_check_pmi_$1_LDFLAGS="-L$with_pmi/lib64"],
|
||||
[orte_check_pmi_$1_LDFLAGS="-L$with_pmi/lib"])
|
||||
AS_IF([test -f "$with_pmi/include/pmi.h"],
|
||||
[orte_check_pmi_$1_CPPFLAGS="-I$with_pmi/include"],
|
||||
[AS_IF([test -f "$with_pmi/include/slurm/pmi.h"],
|
||||
[orte_check_pmi_$1_CPPFLAGS="-I$with_pmi/include/slurm"],
|
||||
[])])],
|
||||
AS_IF([test -f "$with_pmi/include/pmi.h"],
|
||||
[orte_check_pmi_$1_CPPFLAGS="-I$with_pmi/include"],
|
||||
[AS_IF([test -f "$with_pmi/include/slurm/pmi.h"],
|
||||
[orte_check_pmi_$1_CPPFLAGS="-I$with_pmi/include/slurm"])])],
|
||||
[AS_IF([test -f "/usr/include/slurm/pmi.h"],
|
||||
[orte_check_pmi_$1_CPPFLAGS="-I/usr/include/slurm"])])
|
||||
|
||||
LDFLAGS="$LDFLAGS $orte_check_pmi_$1_LDFLAGS"
|
||||
CPPFLAGS="$CPPFLAGS $orte_check_pmi_$1_CPPFLAGS"
|
||||
LIBS="$LIBS -lpmi"
|
||||
@ -78,15 +84,35 @@ AC_DEFUN([ORTE_CHECK_PMI],[
|
||||
[AC_MSG_RESULT([no])
|
||||
AC_MSG_WARN([PMI support requested (via --with-pmi) but not found.])
|
||||
AC_MSG_ERROR([Aborting.])
|
||||
$3])])
|
||||
$3])
|
||||
|
||||
# restore flags
|
||||
CPPFLAGS="$orte_check_pmi_$1_save_CPPFLAGS"
|
||||
AC_MSG_CHECKING([if user requested Cray PMI2 extensions])
AS_IF([test "$with_cray_pmi2_ext" = "no"],
      [AC_MSG_RESULT([no])
       orte_use_pmi2_ext=0],
      [AC_MSG_RESULT([yes])
       # check to see if pmi2.h header is present. if it is, then we
       # will use some of the functions in it.
       AC_MSG_CHECKING([if PMI2 extensions installed])
       AS_IF([test -f "$with_pmi/include/pmi2.h"],
             [# report the positive result too so configure output stays paired
              AC_MSG_RESULT([yes])
              orte_use_pmi2_ext=1],
             [AC_MSG_RESULT([no])
              AC_MSG_WARN([PMI2 extensions requested (via --with-cray-pmi2-ext) but not found.])
              AC_MSG_ERROR([Aborting.])
              orte_use_pmi2_ext=0
              orte_enable_pmi=0
              $3])])])
|
||||
|
||||
# restore flags - have to add CPPFLAGS so base functions can find pmi.h
|
||||
CPPFLAGS="$orte_check_pmi_$1_save_CPPFLAGS $orte_check_pmi_$1_CPPFLAGS"
|
||||
LDFLAGS="$orte_check_pmi_$1_save_LDFLAGS"
|
||||
LIBS="$orte_check_pmi_$1_save_LIBS"
|
||||
|
||||
AC_DEFINE_UNQUOTED([WANT_PMI_SUPPORT],
|
||||
[$orte_enable_pmi],
|
||||
[Whether we want PMI support])
|
||||
AC_DEFINE_UNQUOTED([WANT_CRAY_PMI2_EXT],
|
||||
[$orte_use_pmi2_ext],
|
||||
[Whether we want to use Cray PMI2 extensions])
|
||||
AM_CONDITIONAL(WANT_PMI_SUPPORT, [test "$orte_enable_pmi" = 1])
|
||||
])
|
||||
|
@ -45,6 +45,10 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
#if WANT_PMI_SUPPORT
|
||||
#include <pmi.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
@ -209,6 +213,39 @@ void orte_errmgr_base_log(int error_code, char *filename, int line)
|
||||
}
|
||||
}
|
||||
|
||||
#if WANT_PMI_SUPPORT
|
||||
/* useful util */
|
||||
char* orte_errmgr_base_pmi_error(int pmi_err)
|
||||
{
|
||||
char * err_msg;
|
||||
|
||||
switch(pmi_err) {
|
||||
case PMI_FAIL: err_msg = "Operation failed"; break;
|
||||
case PMI_ERR_INIT: err_msg = "PMI is not initialized"; break;
|
||||
case PMI_ERR_NOMEM: err_msg = "Input buffer not large enough"; break;
|
||||
case PMI_ERR_INVALID_ARG: err_msg = "Invalid argument"; break;
|
||||
case PMI_ERR_INVALID_KEY: err_msg = "Invalid key argument"; break;
|
||||
case PMI_ERR_INVALID_KEY_LENGTH: err_msg = "Invalid key length argument"; break;
|
||||
case PMI_ERR_INVALID_VAL: err_msg = "Invalid value argument"; break;
|
||||
case PMI_ERR_INVALID_VAL_LENGTH: err_msg = "Invalid value length argument"; break;
|
||||
case PMI_ERR_INVALID_LENGTH: err_msg = "Invalid length argument"; break;
|
||||
case PMI_ERR_INVALID_NUM_ARGS: err_msg = "Invalid number of arguments"; break;
|
||||
case PMI_ERR_INVALID_ARGS: err_msg = "Invalid args argument"; break;
|
||||
case PMI_ERR_INVALID_NUM_PARSED: err_msg = "Invalid num_parsed length argument"; break;
|
||||
case PMI_ERR_INVALID_KEYVALP: err_msg = "Invalid invalid keyvalp atgument"; break;
|
||||
case PMI_ERR_INVALID_SIZE: err_msg = "Invalid size argument"; break;
|
||||
#if defined(PMI_ERR_INVALID_KVS)
|
||||
/* pmi.h calls this a valid return code but mpich doesn't define it (slurm does). wtf */
|
||||
case PMI_ERR_INVALID_KVS: err_msg = "Invalid kvs argument"; break;
|
||||
#endif
|
||||
case PMI_SUCCESS: err_msg = "Success"; break;
|
||||
default: err_msg = "Unkown error";
|
||||
}
|
||||
return err_msg;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
void orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
{
|
||||
va_list arglist;
|
||||
|
@ -150,6 +150,16 @@ OBJ_CLASS_DECLARATION(orte_errmgr_predicted_map_t);
|
||||
#define ORTE_ERROR_LOG(n) \
|
||||
orte_errmgr.log(n, __FILE__, __LINE__);
|
||||
|
||||
#if WANT_PMI_SUPPORT
|
||||
/* Log a failed PMI call: prints this process's name, the source location,
 * the name of the PMI function (pmi_func) and the text for its return
 * code (pmi_err).  No semicolon follows while(0) - the caller's own ';'
 * completes the statement, which keeps the macro safe in if/else. */
#define ORTE_PMI_ERROR(pmi_err, pmi_func)                               \
    do {                                                                \
        opal_output(0, "%s[%s:%d:%s] %s: %s\n",                         \
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),                 \
                    __FILE__, __LINE__, __func__,                       \
                    pmi_func, orte_errmgr_base_pmi_error(pmi_err));     \
    } while(0)
|
||||
OPAL_DECLSPEC char* orte_errmgr_base_pmi_error(int pmi_err);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Framework Interfaces
|
||||
|
@ -1,5 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All
|
||||
* rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -17,6 +19,9 @@
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <pmi.h>
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
@ -60,28 +65,46 @@ static int pmi_component_open(void)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static bool pmi_startup(void)
|
||||
{
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
int spawned, size, rank, appnum;
|
||||
|
||||
if (PMI2_Initialized()) {
|
||||
/* already initialized */
|
||||
return true;
|
||||
}
|
||||
/* if we can't startup PMI, we can't be used */
|
||||
if (PMI_SUCCESS != PMI2_Init(&spawned, &size, &rank, &appnum)) {
|
||||
return false;
|
||||
}
|
||||
/* ignore the info - we'll pick it up elsewhere */
|
||||
return true;
|
||||
#else
|
||||
PMI_BOOL initialized;
|
||||
|
||||
if (PMI_SUCCESS != PMI_Initialized(&initialized)) {
|
||||
return false;
|
||||
}
|
||||
if (PMI_TRUE != initialized) {
|
||||
if (PMI_SUCCESS != PMI_Init(&initialized)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
static int pmi_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
int spawned;
|
||||
PMI_BOOL initialized;
|
||||
|
||||
/* for now, only use PMI when direct launched */
|
||||
if (!ORTE_PROC_IS_HNP &&
|
||||
NULL == orte_process_info.my_hnp_uri &&
|
||||
PMI_SUCCESS == PMI_Initialized(&initialized)) {
|
||||
if (PMI_TRUE != initialized) {
|
||||
/* if we can't startup the PMI, we can't be used */
|
||||
if (PMI_SUCCESS != PMI_Init(&spawned)) {
|
||||
*priority = -1;
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* if PMI is available, use it */
|
||||
*priority = 100;
|
||||
*module = (mca_base_module_t *)&orte_ess_pmi_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
pmi_startup()) {
|
||||
/* if PMI is available, use it */
|
||||
*priority = 100;
|
||||
*module = (mca_base_module_t *)&orte_ess_pmi_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* we can't run */
|
||||
@ -93,6 +116,11 @@ static int pmi_component_query(mca_base_module_t **module, int *priority)
|
||||
|
||||
static int pmi_component_close(void)
|
||||
{
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
if (PMI2_Initialized()) {
|
||||
PMI2_Finalize();
|
||||
}
|
||||
#else
|
||||
PMI_BOOL initialized;
|
||||
|
||||
/* if we weren't selected, cleanup */
|
||||
@ -100,6 +128,7 @@ static int pmi_component_close(void)
|
||||
PMI_TRUE == initialized) {
|
||||
PMI_Finalize();
|
||||
}
|
||||
#endif
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -80,14 +80,6 @@ orte_ess_base_module_t orte_ess_pmi_module = {
|
||||
|
||||
static bool app_init_complete=false;
|
||||
static int pmi_maxlen=0;
|
||||
static char* pmi_error(int pmi_err);
|
||||
#define ORTE_PMI_ERROR(pmi_err, pmi_func) \
|
||||
do { \
|
||||
opal_output(0, "%s[%s:%d:%s] %s: %s\n", \
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, __func__, \
|
||||
pmi_func, pmi_error(pmi_err)); \
|
||||
} while(0);
|
||||
|
||||
/**** MODULE FUNCTIONS ****/
|
||||
|
||||
@ -307,33 +299,3 @@ static void rte_abort(int error_code, bool report)
|
||||
{
|
||||
orte_ess_base_app_abort(error_code, report);
|
||||
}
|
||||
|
||||
/* useful util */
|
||||
static char* pmi_error(int pmi_err)
|
||||
{
|
||||
char * err_msg;
|
||||
|
||||
switch(pmi_err) {
|
||||
case PMI_FAIL: err_msg = "Operation failed"; break;
|
||||
case PMI_ERR_INIT: err_msg = "PMI is not initialized"; break;
|
||||
case PMI_ERR_NOMEM: err_msg = "Input buffer not large enough"; break;
|
||||
case PMI_ERR_INVALID_ARG: err_msg = "Invalid argument"; break;
|
||||
case PMI_ERR_INVALID_KEY: err_msg = "Invalid key argument"; break;
|
||||
case PMI_ERR_INVALID_KEY_LENGTH: err_msg = "Invalid key length argument"; break;
|
||||
case PMI_ERR_INVALID_VAL: err_msg = "Invalid value argument"; break;
|
||||
case PMI_ERR_INVALID_VAL_LENGTH: err_msg = "Invalid value length argument"; break;
|
||||
case PMI_ERR_INVALID_LENGTH: err_msg = "Invalid length argument"; break;
|
||||
case PMI_ERR_INVALID_NUM_ARGS: err_msg = "Invalid number of arguments"; break;
|
||||
case PMI_ERR_INVALID_ARGS: err_msg = "Invalid args argument"; break;
|
||||
case PMI_ERR_INVALID_NUM_PARSED: err_msg = "Invalid num_parsed length argument"; break;
|
||||
case PMI_ERR_INVALID_KEYVALP: err_msg = "Invalid invalid keyvalp atgument"; break;
|
||||
case PMI_ERR_INVALID_SIZE: err_msg = "Invalid size argument"; break;
|
||||
#if defined(PMI_ERR_INVALID_KVS)
|
||||
/* pmi.h calls this a valid return code but mpich doesn't define it (slurm does). wtf */
|
||||
case PMI_ERR_INVALID_KVS: err_msg = "Invalid kvs argument"; break;
|
||||
#endif
|
||||
case PMI_SUCCESS: err_msg = "Success"; break;
|
||||
default: err_msg = "Unkown error";
|
||||
}
|
||||
return err_msg;
|
||||
}
|
||||
|
@ -1,17 +1,22 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
*
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All
|
||||
* rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <pmi.h>
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
@ -50,37 +55,60 @@ int orte_grpcomm_pmi_open(void)
|
||||
|
||||
int orte_grpcomm_pmi_close(void)
|
||||
{
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
if (PMI2_Initialized()) {
|
||||
PMI2_Finalize();
|
||||
}
|
||||
#else
|
||||
PMI_BOOL initialized;
|
||||
|
||||
/* if we weren't selected, cleanup if necessary */
|
||||
/* if we weren't selected, cleanup */
|
||||
if (PMI_SUCCESS == PMI_Initialized(&initialized) &&
|
||||
PMI_TRUE == initialized) {
|
||||
PMI_Finalize();
|
||||
}
|
||||
#endif
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static bool pmi_startup(void)
|
||||
{
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
int spawned, size, rank, appnum;
|
||||
|
||||
if (PMI2_Initialized()) {
|
||||
/* already initialized */
|
||||
return true;
|
||||
}
|
||||
/* if we can't startup PMI, we can't be used */
|
||||
if (PMI_SUCCESS != PMI2_Init(&spawned, &size, &rank, &appnum)) {
|
||||
return false;
|
||||
}
|
||||
/* ignore the info - we'll pick it up elsewhere */
|
||||
return true;
|
||||
#else
|
||||
PMI_BOOL initialized;
|
||||
|
||||
if (PMI_SUCCESS != PMI_Init(&initialized)) {
|
||||
return false;
|
||||
}
|
||||
if (PMI_TRUE != initialized) {
|
||||
if (PMI_SUCCESS != PMI_Init(&initialized)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
int orte_grpcomm_pmi_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
int spawned;
|
||||
PMI_BOOL initialized;
|
||||
|
||||
/* for now, only use PMI when direct launched */
|
||||
if (!ORTE_PROC_IS_HNP &&
|
||||
NULL == orte_process_info.my_hnp_uri &&
|
||||
PMI_SUCCESS == PMI_Initialized(&initialized)) {
|
||||
/* if we aren't already initialized, then try */
|
||||
if (PMI_TRUE != initialized) {
|
||||
/* if we can't startup the PMI, we can't be used */
|
||||
if (PMI_SUCCESS != PMI_Init(&spawned)) {
|
||||
*priority = -1;
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
/* if we were able to startup PMI, or it was already
|
||||
* running, then use us
|
||||
*/
|
||||
pmi_startup()) {
|
||||
/* if PMI is available, use it */
|
||||
*priority = 100;
|
||||
*module = (mca_base_module_t *)&orte_grpcomm_pmi_module;
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -2,6 +2,8 @@
|
||||
* Copyright (c) 2007 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All
|
||||
* rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -15,6 +17,9 @@
|
||||
|
||||
#include <string.h>
|
||||
#include <pmi.h>
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
#include <pmi2.h>
|
||||
#endif
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
@ -62,14 +67,6 @@ orte_grpcomm_base_module_t orte_grpcomm_pmi_module = {
|
||||
|
||||
static int pmi_encode(const void *val, size_t vallen);
|
||||
static void* pmi_decode(size_t *retlen);
|
||||
static char* pmi_error(int pmi_err);
|
||||
#define ORTE_PMI_ERROR(pmi_err, pmi_func) \
|
||||
do { \
|
||||
opal_output(0, "%s[%s:%d:%s] %s: %s\n", \
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, __func__, \
|
||||
pmi_func, pmi_error(pmi_err)); \
|
||||
} while(0);
|
||||
static int setup_pmi(void);
|
||||
static int setup_key(const orte_process_name_t *name, const char *key);
|
||||
|
||||
@ -80,6 +77,45 @@ static char *pmi_attr_val = NULL;
|
||||
static int pmi_vallen_max = -1;
|
||||
static int pmi_keylen_max = -1;
|
||||
|
||||
/* Because Cray uses PMI2 extensions for some, but not all,
|
||||
* PMI functions, we define a set of wrappers for those
|
||||
* common functions we will use
|
||||
*/
|
||||
static int kvs_put(const char *key, const char *value)
|
||||
{
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
return PMI2_KVS_Put(key, value);
|
||||
#else
|
||||
return PMI_KVS_Put(pmi_kvs_name, key, value);
|
||||
#endif
|
||||
}
|
||||
|
||||
static int kvs_get(const char *key, char *value, int valuelen)
|
||||
{
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
int len;
|
||||
|
||||
return PMI2_KVS_Get(pmi_kvs_name, PMI2_ID_NULL, key, value, valuelen, &len);
|
||||
#else
|
||||
return PMI_KVS_Get(pmi_kvs_name, key, value, valuelen);
|
||||
#endif
|
||||
}
|
||||
|
||||
static int kvs_commit(void)
|
||||
{
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
return PMI2_KVS_Fence())) {
|
||||
#else
|
||||
int rc;
|
||||
|
||||
if (PMI_SUCCESS != (rc = PMI_KVS_Commit(pmi_kvs_name))) {
|
||||
return rc;
|
||||
}
|
||||
/* Barrier here to ensure all other procs have committed */
|
||||
return PMI_Barrier();
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the module
|
||||
*/
|
||||
@ -146,11 +182,19 @@ static int pmi_barrier(void)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
/* Cray doesn't provide a barrier, so use the Fence function here */
|
||||
if (PMI_SUCCESS != (rc = PMI2_KVS_Fence())) {
|
||||
ORTE_PMI_ERROR(rc, "PMI2_KVS_Fence");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
#else
|
||||
/* use the PMI barrier function */
|
||||
if (PMI_SUCCESS != (rc = PMI_Barrier())) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_Barrier");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
#endif
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
|
||||
"%s grpcomm:pmi barrier complete",
|
||||
@ -201,7 +245,7 @@ static int pmi_set_proc_attr(const char* attr_name,
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, pmi_attr_val);
|
||||
rc = kvs_put(pmi_kvs_key, pmi_attr_val);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
|
||||
return ORTE_ERROR;
|
||||
@ -237,7 +281,7 @@ static int pmi_get_proc_attr(const orte_process_name_t name,
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
|
||||
rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Get");
|
||||
return ORTE_ERROR;
|
||||
@ -259,6 +303,7 @@ static int pmi_get_proc_attr(const orte_process_name_t name,
|
||||
static int modex(opal_list_t *procs)
|
||||
{
|
||||
int rc, i;
|
||||
size_t len;
|
||||
char *rml_uri, val[64];
|
||||
orte_vpid_t v;
|
||||
orte_process_name_t name;
|
||||
@ -286,7 +331,7 @@ static int modex(opal_list_t *procs)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, orte_process_info.nodename);
|
||||
rc = kvs_put(pmi_kvs_key, orte_process_info.nodename);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
|
||||
return ORTE_ERROR;
|
||||
@ -302,9 +347,17 @@ static int modex(opal_list_t *procs)
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "RMLURI"))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(rml_uri);
|
||||
return rc;
|
||||
}
|
||||
rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, rml_uri);
|
||||
/* NTH: some characters are not allowed in pmi2 land so we need to encode */
|
||||
if (ORTE_SUCCESS != (rc = pmi_encode(rml_uri, strlen(rml_uri)))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(rml_uri);
|
||||
return rc;
|
||||
}
|
||||
/* encoding puts the encoded value in pmi_attr_val */
|
||||
rc = kvs_put(pmi_kvs_key, pmi_attr_val);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
|
||||
free(rml_uri);
|
||||
@ -327,13 +380,22 @@ static int modex(opal_list_t *procs)
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
|
||||
"%s grpcomm:pmi LOCALE %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), locale));
|
||||
/* NTH: some characters are not allowed in pmi2 land - not sure
|
||||
* if hwloc would use them, but just to be safe we need to encode
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = pmi_encode(locale, strlen(locale)))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(locale);
|
||||
return rc;
|
||||
}
|
||||
/* get the key */
|
||||
if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "HWLOC"))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(locale);
|
||||
return rc;
|
||||
}
|
||||
/* enter the key-value */
|
||||
rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, locale);
|
||||
/* encoding puts the encoded value in pmi_attr_val */
|
||||
rc = kvs_put(pmi_kvs_key, pmi_attr_val);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
|
||||
free(locale);
|
||||
@ -355,7 +417,7 @@ static int modex(opal_list_t *procs)
|
||||
return rc;
|
||||
}
|
||||
snprintf(val, 64, "%lu", (unsigned long)pmap->local_rank);
|
||||
rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, val);
|
||||
rc = kvs_put(pmi_kvs_key, val);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
|
||||
return ORTE_ERROR;
|
||||
@ -365,23 +427,18 @@ static int modex(opal_list_t *procs)
|
||||
return rc;
|
||||
}
|
||||
snprintf(val, 64, "%lu", (unsigned long)pmap->node_rank);
|
||||
rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, val);
|
||||
rc = kvs_put(pmi_kvs_key, val);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Put");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* commit our modex info */
|
||||
if (PMI_SUCCESS != (rc = PMI_KVS_Commit(pmi_kvs_name))) {
|
||||
if (PMI_SUCCESS != (rc = kvs_commit())) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Commit failed");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* Barrier here to ensure all other procs have committed */
|
||||
if (ORTE_SUCCESS != (rc = pmi_barrier())) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* harvest the oob endpoint info and hostname for all other procs
|
||||
* in our job so oob wireup can be completed and we
|
||||
* can setup their nidmap/pidmap
|
||||
@ -393,28 +450,37 @@ static int modex(opal_list_t *procs)
|
||||
continue;
|
||||
}
|
||||
name.vpid = v;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = setup_key(&name, "RMLURI"))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
|
||||
rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Get");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* Had to encode to protect against pmi2-prohibited chars */
|
||||
rml_uri = pmi_decode(&len);
|
||||
if (NULL == rml_uri) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
|
||||
"%s grpcomm:pmi: proc %s oob endpoint %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&name), pmi_attr_val));
|
||||
ORTE_NAME_PRINT(&name), rml_uri));
|
||||
/* set the contact info into the hash table */
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(pmi_attr_val))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) {
|
||||
free(rml_uri);
|
||||
return rc;
|
||||
}
|
||||
free(rml_uri);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = setup_key(&name, "HOSTNAME"))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
|
||||
rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Get");
|
||||
return ORTE_ERROR;
|
||||
@ -459,22 +525,22 @@ static int modex(opal_list_t *procs)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
|
||||
rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Get");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
pmap->local_rank = (uint16_t)strtoul(pmi_attr_val, NULL, 10);
|
||||
pmap->local_rank = (orte_local_rank_t)strtoul(pmi_attr_val, NULL, 10);
|
||||
if (ORTE_SUCCESS != (rc = setup_key(&name, "NODERANK"))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
|
||||
rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Get");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
pmap->node_rank = (uint16_t)strtoul(pmi_attr_val, NULL, 10);
|
||||
pmap->node_rank = (orte_node_rank_t)strtoul(pmi_attr_val, NULL, 10);
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
|
||||
"%s grpcomm:pmi: proc %s lrank %u nrank %u",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -482,55 +548,66 @@ static int modex(opal_list_t *procs)
|
||||
(unsigned int)pmap->local_rank,
|
||||
(unsigned int)pmap->node_rank));
|
||||
#if OPAL_HAVE_HWLOC
|
||||
/* get the proc's locality info, if available */
|
||||
if (ORTE_SUCCESS != (rc = setup_key(&name, "HWLOC"))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
|
||||
/* don't error out here - if not found, that's okay */
|
||||
if (PMI_SUCCESS == rc) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &name, ORTE_PROC_MY_NAME)) {
|
||||
/* if this data is from myself, then set locality to all */
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
|
||||
"%s grpcomm:pmi setting proc %s locale ALL",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&name)));
|
||||
pmap->locality = OPAL_PROC_ALL_LOCAL;
|
||||
} else if (loc->daemon != ORTE_PROC_MY_DAEMON->vpid) {
|
||||
/* this is on a different node, then mark as non-local */
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
|
||||
"%s grpcomm:pmi setting proc %s locale NONLOCAL",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&name)));
|
||||
pmap->locality = OPAL_PROC_NON_LOCAL;
|
||||
} else if (0 == strlen(pmi_attr_val)){
|
||||
/* if we share a node, but we don't know anything more, then
|
||||
* mark us as on the node as this is all we know
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
|
||||
"%s grpcomm:pmi setting proc %s locale NODE",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&name)));
|
||||
pmap->locality = OPAL_PROC_ON_NODE;
|
||||
} else {
|
||||
/* convert the locale to a cpuset */
|
||||
if (NULL == orte_grpcomm_base.working_cpuset) {
|
||||
orte_grpcomm_base.working_cpuset = hwloc_bitmap_alloc();
|
||||
{
|
||||
char *locale;
|
||||
|
||||
/* get the proc's locality info, if available */
|
||||
if (ORTE_SUCCESS != (rc = setup_key(&name, "HWLOC"))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max);
|
||||
/* don't error out here - if not found, that's okay */
|
||||
if (PMI_SUCCESS == rc) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &name, ORTE_PROC_MY_NAME)) {
|
||||
/* if this data is from myself, then set locality to all */
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
|
||||
"%s grpcomm:pmi setting proc %s locale ALL",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&name)));
|
||||
pmap->locality = OPAL_PROC_ALL_LOCAL;
|
||||
} else if (loc->daemon != ORTE_PROC_MY_DAEMON->vpid) {
|
||||
/* this is on a different node, then mark as non-local */
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
|
||||
"%s grpcomm:pmi setting proc %s locale NONLOCAL",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&name)));
|
||||
pmap->locality = OPAL_PROC_NON_LOCAL;
|
||||
} else if (0 == strlen(pmi_attr_val)){
|
||||
/* if we share a node, but we don't know anything more, then
|
||||
* mark us as on the node as this is all we know
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
|
||||
"%s grpcomm:pmi setting proc %s locale NODE",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&name)));
|
||||
pmap->locality = OPAL_PROC_ON_NODE;
|
||||
} else {
|
||||
/* we encoded to protect against pmi2 restrictions */
|
||||
locale = pmi_decode(&len);
|
||||
if (NULL == locale) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* convert the locale to a cpuset */
|
||||
if (NULL == orte_grpcomm_base.working_cpuset) {
|
||||
orte_grpcomm_base.working_cpuset = hwloc_bitmap_alloc();
|
||||
}
|
||||
if (0 != hwloc_bitmap_list_sscanf(orte_grpcomm_base.working_cpuset, locale)) {
|
||||
/* got a bad locale */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
free(locale);
|
||||
return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
}
|
||||
free(locale);
|
||||
/* determine relative location on our node */
|
||||
pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
|
||||
opal_hwloc_my_cpuset,
|
||||
orte_grpcomm_base.working_cpuset);
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
|
||||
"%s grpcommpmi setting proc %s locale %04x",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&name), pmap->locality));
|
||||
}
|
||||
if (0 != hwloc_bitmap_list_sscanf(orte_grpcomm_base.working_cpuset, pmi_attr_val)) {
|
||||
/* got a bad locale */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
}
|
||||
/* determine relative location on our node */
|
||||
pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
|
||||
opal_hwloc_my_cpuset,
|
||||
orte_grpcomm_base.working_cpuset);
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
|
||||
"%s grpcommpmi setting proc %s locale %04x",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&name), pmap->locality));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -598,68 +675,56 @@ static void* pmi_decode(size_t *retlen) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* useful util */
|
||||
static char* pmi_error(int pmi_err)
|
||||
{
|
||||
char * err_msg;
|
||||
|
||||
switch(pmi_err) {
|
||||
case PMI_FAIL: err_msg = "Operation failed"; break;
|
||||
case PMI_ERR_INIT: err_msg = "PMI is not initialized"; break;
|
||||
case PMI_ERR_NOMEM: err_msg = "Input buffer not large enough"; break;
|
||||
case PMI_ERR_INVALID_ARG: err_msg = "Invalid argument"; break;
|
||||
case PMI_ERR_INVALID_KEY: err_msg = "Invalid key argument"; break;
|
||||
case PMI_ERR_INVALID_KEY_LENGTH: err_msg = "Invalid key length argument"; break;
|
||||
case PMI_ERR_INVALID_VAL: err_msg = "Invalid value argument"; break;
|
||||
case PMI_ERR_INVALID_VAL_LENGTH: err_msg = "Invalid value length argument"; break;
|
||||
case PMI_ERR_INVALID_LENGTH: err_msg = "Invalid length argument"; break;
|
||||
case PMI_ERR_INVALID_NUM_ARGS: err_msg = "Invalid number of arguments"; break;
|
||||
case PMI_ERR_INVALID_ARGS: err_msg = "Invalid args argument"; break;
|
||||
case PMI_ERR_INVALID_NUM_PARSED: err_msg = "Invalid num_parsed length argument"; break;
|
||||
case PMI_ERR_INVALID_KEYVALP: err_msg = "Invalid invalid keyvalp atgument"; break;
|
||||
case PMI_ERR_INVALID_SIZE: err_msg = "Invalid size argument"; break;
|
||||
#if defined(PMI_ERR_INVALID_KVS)
|
||||
/* pmi.h calls this a valid return code but mpich doesn't define it (slurm does). wtf */
|
||||
case PMI_ERR_INVALID_KVS: err_msg = "Invalid kvs argument"; break;
|
||||
#endif
|
||||
case PMI_SUCCESS: err_msg = "Success"; break;
|
||||
default: err_msg = "Unkown error";
|
||||
}
|
||||
return err_msg;
|
||||
}
|
||||
|
||||
static int setup_pmi(void)
|
||||
{
|
||||
int max_length, rc;
|
||||
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
pmi_vallen_max = PMI2_MAX_VALLEN;
|
||||
#else
|
||||
rc = PMI_KVS_Get_value_length_max(&pmi_vallen_max);
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_Get_value_length_max");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
#endif
|
||||
pmi_attr_val = malloc(pmi_vallen_max);
|
||||
if (NULL == pmi_attr_val) {
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
/* TODO -- is this ok */
|
||||
max_length = 1024;
|
||||
#else
|
||||
if (PMI_SUCCESS != (rc = PMI_KVS_Get_name_length_max(&max_length))) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Get_name_length_max");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
#endif
|
||||
pmi_kvs_name = malloc(max_length);
|
||||
if (NULL == pmi_kvs_name) {
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
rc = PMI2_Job_GetId(pmi_kvs_name, max_length);
|
||||
#else
|
||||
rc = PMI_KVS_Get_my_name(pmi_kvs_name,max_length);
|
||||
#endif
|
||||
if (PMI_SUCCESS != rc) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Get_my_name");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
#if WANT_CRAY_PMI2_EXT
|
||||
pmi_keylen_max = PMI2_MAX_KEYLEN;
|
||||
#else
|
||||
if (PMI_SUCCESS != (rc = PMI_KVS_Get_key_length_max(&pmi_keylen_max))) {
|
||||
ORTE_PMI_ERROR(rc, "PMI_KVS_Get_key_length_max");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
#endif
|
||||
pmi_kvs_key = malloc(pmi_keylen_max);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user