From 3e72fccacfb20057311112992eb5576c6dbccccf Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 21 Oct 2011 04:54:38 +0000 Subject: [PATCH] Cray's PMI implementation is quite different from slurm's - they extended PMI-1 by adding some, but not all, of the PMI-2 APIs. So you can't just switch to using PMI-2 functions as it isn't a complete implementation. Instead, you have to selectively figure out which ones they have in PMI-2, and use any missing ones from PMI-1. What fun. Modify the configure logic and the PMI components to accommodate Cray's approach. Refactor the PMI error reporting code so it resides in only one place. Cray actually decided -not- to define the PMI-2 error codes, so we have to use the PMI-1 codes instead. More fun. This commit was SVN r25348. --- ompi/mca/pubsub/pmi/pubsub_pmi.c | 69 ++--- ompi/mca/pubsub/pmi/pubsub_pmi_component.c | 62 ++-- orte/config/orte_check_pmi.m4 | 44 ++- orte/mca/errmgr/base/errmgr_base_fns.c | 37 +++ orte/mca/errmgr/errmgr.h | 10 + orte/mca/ess/pmi/ess_pmi_component.c | 61 ++-- orte/mca/ess/pmi/ess_pmi_module.c | 38 --- orte/mca/grpcomm/pmi/grpcomm_pmi_component.c | 80 ++++-- orte/mca/grpcomm/pmi/grpcomm_pmi_module.c | 281 ++++++++++++------- 9 files changed, 425 insertions(+), 257 deletions(-) diff --git a/ompi/mca/pubsub/pmi/pubsub_pmi.c b/ompi/mca/pubsub/pmi/pubsub_pmi.c index 7c60a0f25f..7e9e269652 100644 --- a/ompi/mca/pubsub/pmi/pubsub_pmi.c +++ b/ompi/mca/pubsub/pmi/pubsub_pmi.c @@ -11,24 +11,19 @@ #include "ompi/constants.h" #include +#if WANT_CRAY_PMI2_EXT +#include +#endif #include "ompi/info/info.h" +#include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" #include "ompi/mca/pubsub/base/base.h" #include "pubsub_pmi.h" -static char* pmi_error(int pmi_err); -#define ORTE_PMI_ERROR(pmi_err, pmi_func) \ - do { \ - opal_output(0, "%s[%s:%d:%s] %s: %s\n", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ - __FILE__, __LINE__, __func__, \ - pmi_func, 
pmi_error(pmi_err)); \ - } while(0); - /* * Init the module */ @@ -44,11 +39,17 @@ static int publish ( char *service_name, ompi_info_t *info, char *port_name ) { int rc; +#if WANT_CRAY_PMI2_EXT + if (PMI_SUCCESS != (rc = PMI2_Nameserv_publish(service_name, NULL, port_name))) { + ORTE_PMI_ERROR(rc, "PMI2_Nameserv_publish"); + return OMPI_ERROR; + } +#else if (PMI_SUCCESS != (rc = PMI_Publish_name(service_name, port_name))) { ORTE_PMI_ERROR(rc, "PMI_KVS_Publish_name"); return OMPI_ERROR; } - +#endif return OMPI_SUCCESS; } @@ -57,11 +58,19 @@ static char* lookup ( char *service_name, ompi_info_t *info ) char *port=NULL; int rc; +#if WANT_CRAY_PMI2_EXT + port = (char*)malloc(1024*sizeof(char)); /* arbitrary size */ + if (PMI_SUCCESS != (rc = PMI2_Nameserv_lookup(service_name, NULL, port, 1024))) { + ORTE_PMI_ERROR(rc, "PMI2_Nameserv_lookup"); + free(port); + return NULL; + } +#else if (PMI_SUCCESS != (rc = PMI_Lookup_name(service_name, port))) { ORTE_PMI_ERROR(rc, "PMI_Lookup_name"); return NULL; } - +#endif return port; } @@ -71,10 +80,17 @@ static int unpublish ( char *service_name, ompi_info_t *info ) { int rc; +#if WANT_CRAY_PMI2_EXT + if (PMI_SUCCESS != (rc = PMI2_Nameserv_unpublish(service_name, NULL))) { + ORTE_PMI_ERROR(rc, "PMI2_Nameserv_unpublish"); + return OMPI_ERROR; + } +#else if (PMI_SUCCESS != (rc = PMI_Unpublish_name(service_name))) { ORTE_PMI_ERROR(rc, "PMI_Unpublish_name"); return OMPI_ERROR; } +#endif return OMPI_SUCCESS;; } @@ -97,34 +113,3 @@ ompi_pubsub_base_module_t ompi_pubsub_pmi_module = { lookup, finalize }; - - -/* useful util */ -static char* pmi_error(int pmi_err) -{ - char * err_msg; - - switch(pmi_err) { - case PMI_FAIL: err_msg = "Operation failed"; break; - case PMI_ERR_INIT: err_msg = "PMI is not initialized"; break; - case PMI_ERR_NOMEM: err_msg = "Input buffer not large enough"; break; - case PMI_ERR_INVALID_ARG: err_msg = "Invalid argument"; break; - case PMI_ERR_INVALID_KEY: err_msg = "Invalid key argument"; break; - 
case PMI_ERR_INVALID_KEY_LENGTH: err_msg = "Invalid key length argument"; break; - case PMI_ERR_INVALID_VAL: err_msg = "Invalid value argument"; break; - case PMI_ERR_INVALID_VAL_LENGTH: err_msg = "Invalid value length argument"; break; - case PMI_ERR_INVALID_LENGTH: err_msg = "Invalid length argument"; break; - case PMI_ERR_INVALID_NUM_ARGS: err_msg = "Invalid number of arguments"; break; - case PMI_ERR_INVALID_ARGS: err_msg = "Invalid args argument"; break; - case PMI_ERR_INVALID_NUM_PARSED: err_msg = "Invalid num_parsed length argument"; break; - case PMI_ERR_INVALID_KEYVALP: err_msg = "Invalid invalid keyvalp atgument"; break; - case PMI_ERR_INVALID_SIZE: err_msg = "Invalid size argument"; break; -#if defined(PMI_ERR_INVALID_KVS) - /* pmi.h calls this a valid return code but mpich doesn't define it (slurm does). wtf */ - case PMI_ERR_INVALID_KVS: err_msg = "Invalid kvs argument"; break; -#endif - case PMI_SUCCESS: err_msg = "Success"; break; - default: err_msg = "Unkown error"; - } - return err_msg; -} diff --git a/ompi/mca/pubsub/pmi/pubsub_pmi_component.c b/ompi/mca/pubsub/pmi/pubsub_pmi_component.c index 70784e1cfe..44aa4e29eb 100644 --- a/ompi/mca/pubsub/pmi/pubsub_pmi_component.c +++ b/ompi/mca/pubsub/pmi/pubsub_pmi_component.c @@ -11,6 +11,9 @@ #include "ompi/constants.h" #include +#if WANT_CRAY_PMI2_EXT +#include +#endif #include "orte/util/proc_info.h" @@ -46,39 +49,62 @@ static int pubsub_pmi_component_open(void) static int pubsub_pmi_component_close(void) { +#if WANT_CRAY_PMI2_EXT + if (PMI2_Initialized()) { + PMI2_Finalize(); + } +#else PMI_BOOL initialized; - /* if we weren't selected, cleanup if necessary */ + /* if we weren't selected, cleanup */ if (PMI_SUCCESS == PMI_Initialized(&initialized) && PMI_TRUE == initialized) { PMI_Finalize(); } +#endif return OMPI_SUCCESS; } +static bool pmi_startup(void) +{ +#if WANT_CRAY_PMI2_EXT + int spawned, size, rank, appnum; + + if (PMI2_Initialized()) { + /* already initialized */ + return true; + } + /* if 
we can't startup PMI, we can't be used */ + if (PMI_SUCCESS != PMI2_Init(&spawned, &size, &rank, &appnum)) { + return false; + } + /* ignore the info - we'll pick it up elsewhere */ + return true; +#else + PMI_BOOL initialized; + + if (PMI_SUCCESS != PMI_Initialized(&initialized)) { + return false; + } + if (PMI_TRUE != initialized) { + if (PMI_SUCCESS != PMI_Init(&initialized)) { + return false; + } + } + + return true; +#endif +} + static int pubsub_pmi_component_query(mca_base_module_t **module, int *priority) { - int spawned; - PMI_BOOL initialized; - /* for now, only use PMI when direct launched */ if (NULL == orte_process_info.my_hnp_uri && - PMI_SUCCESS == PMI_Initialized(&initialized)) { - /* if we aren't already initialized, then try */ - if (PMI_TRUE != initialized) { - /* if we can't startup the PMI, we can't be used */ - if (PMI_SUCCESS != PMI_Init(&spawned)) { - *priority = -1; - *module = NULL; - return OMPI_ERROR; - } - } - /* if we were able to startup PMI, or it was already - * running, then use us - */ + pmi_startup()) { + /* if PMI is available, use it */ *priority = 100; *module = (mca_base_module_t *)&ompi_pubsub_pmi_module; - return OMPI_SUCCESS; + return ORTE_SUCCESS; } /* we can't run */ diff --git a/orte/config/orte_check_pmi.m4 b/orte/config/orte_check_pmi.m4 index b827a43db2..1e26b79cac 100644 --- a/orte/config/orte_check_pmi.m4 +++ b/orte/config/orte_check_pmi.m4 @@ -27,8 +27,13 @@ AC_DEFUN([ORTE_CHECK_PMI],[ [AC_HELP_STRING([--with-pmi], [Build PMI support (default: no)])], [], with_pmi=no) + AC_ARG_WITH([cray-pmi2-ext], + [AC_HELP_STRING([--with-cray-pmi2-ext], + [Include Cray PMI2 extensions (default: no)])], + [], with_cray_pmi2_ext=no) orte_enable_pmi=0 + orte_use_pmi2_ext=0 # save flags orte_check_pmi_$1_save_CPPFLAGS="$CPPFLAGS" @@ -42,7 +47,8 @@ AC_DEFUN([ORTE_CHECK_PMI],[ AC_MSG_CHECKING([if user requested PMI support]) AS_IF([test "$with_pmi" = "no"], [AC_MSG_RESULT([no]) - orte_want_pmi_support=no], + orte_want_pmi_support=no + 
orte_use_cray_pmi2_ext=0], [AC_MSG_RESULT([yes]) orte_want_pmi_support=yes AC_MSG_CHECKING([if PMI support installed]) @@ -53,13 +59,13 @@ AC_DEFUN([ORTE_CHECK_PMI],[ [AS_IF([test -d "$with_pmi/lib64"], [orte_check_pmi_$1_LDFLAGS="-L$with_pmi/lib64"], [orte_check_pmi_$1_LDFLAGS="-L$with_pmi/lib"]) - AS_IF([test -f "$with_pmi/include/pmi.h"], - [orte_check_pmi_$1_CPPFLAGS="-I$with_pmi/include"], - [AS_IF([test -f "$with_pmi/include/slurm/pmi.h"], - [orte_check_pmi_$1_CPPFLAGS="-I$with_pmi/include/slurm"], - [])])], + AS_IF([test -f "$with_pmi/include/pmi.h"], + [orte_check_pmi_$1_CPPFLAGS="-I$with_pmi/include"], + [AS_IF([test -f "$with_pmi/include/slurm/pmi.h"], + [orte_check_pmi_$1_CPPFLAGS="-I$with_pmi/include/slurm"])])], [AS_IF([test -f "/usr/include/slurm/pmi.h"], [orte_check_pmi_$1_CPPFLAGS="-I/usr/include/slurm"])]) + LDFLAGS="$LDFLAGS $orte_check_pmi_$1_LDFLAGS" CPPFLAGS="$CPPFLAGS $orte_check_pmi_$1_CPPFLAGS" LIBS="$LIBS -lpmi" @@ -78,15 +84,35 @@ AC_DEFUN([ORTE_CHECK_PMI],[ [AC_MSG_RESULT([no]) AC_MSG_WARN([PMI support requested (via --with-pmi) but not found.]) AC_MSG_ERROR([Aborting.]) - $3])]) + $3]) - # restore flags - CPPFLAGS="$orte_check_pmi_$1_save_CPPFLAGS" + AC_MSG_CHECKING([if user requested Cray PMI2 extensions]) + AS_IF([test "$with_cray_pmi2_ext" = "no"], + [AC_MSG_RESULT([no]) + orte_use_pmi2_ext=0], + [AC_MSG_RESULT([yes]) + # check to see if pmi2.h header is present. if it is, then we + # will use some of the functions in it. 
+ AC_MSG_CHECKING([if PMI2 extensions installed]) + AS_IF([test -f "$with_pmi/include/pmi2.h"], + [orte_use_pmi2_ext=1], + [AC_MSG_RESULT([no]) + AC_MSG_WARN([PMI2 extensions requested (via --with-cray-pmi2-ext) but not found.]) + AC_MSG_ERROR([Aborting.]) + orte_use_pmi2_ext=0 + orte_enable_pmi=0 + $3])])]) + + # restore flags - have to add CPPFLAGS so base functions can find pmi.h + CPPFLAGS="$orte_check_pmi_$1_save_CPPFLAGS $orte_check_pmi_$1_CPPFLAGS" LDFLAGS="$orte_check_pmi_$1_save_LDFLAGS" LIBS="$orte_check_pmi_$1_save_LIBS" AC_DEFINE_UNQUOTED([WANT_PMI_SUPPORT], [$orte_enable_pmi], [Whether we want PMI support]) + AC_DEFINE_UNQUOTED([WANT_CRAY_PMI2_EXT], + [$orte_use_pmi2_ext], + [Whether we want to use Cray PMI2 extensions]) AM_CONDITIONAL(WANT_PMI_SUPPORT, [test "$orte_enable_pmi" = 1]) ]) diff --git a/orte/mca/errmgr/base/errmgr_base_fns.c b/orte/mca/errmgr/base/errmgr_base_fns.c index ffb5b0e4c7..4215eb65c7 100644 --- a/orte/mca/errmgr/base/errmgr_base_fns.c +++ b/orte/mca/errmgr/base/errmgr_base_fns.c @@ -45,6 +45,10 @@ #include #include +#if WANT_PMI_SUPPORT +#include +#endif + #include "opal/mca/mca.h" #include "opal/mca/base/base.h" #include "opal/mca/base/mca_base_param.h" @@ -209,6 +213,39 @@ void orte_errmgr_base_log(int error_code, char *filename, int line) } } +#if WANT_PMI_SUPPORT +/* useful util */ +char* orte_errmgr_base_pmi_error(int pmi_err) +{ + char * err_msg; + + switch(pmi_err) { + case PMI_FAIL: err_msg = "Operation failed"; break; + case PMI_ERR_INIT: err_msg = "PMI is not initialized"; break; + case PMI_ERR_NOMEM: err_msg = "Input buffer not large enough"; break; + case PMI_ERR_INVALID_ARG: err_msg = "Invalid argument"; break; + case PMI_ERR_INVALID_KEY: err_msg = "Invalid key argument"; break; + case PMI_ERR_INVALID_KEY_LENGTH: err_msg = "Invalid key length argument"; break; + case PMI_ERR_INVALID_VAL: err_msg = "Invalid value argument"; break; + case PMI_ERR_INVALID_VAL_LENGTH: err_msg = "Invalid value length argument"; break; + 
case PMI_ERR_INVALID_LENGTH: err_msg = "Invalid length argument"; break; + case PMI_ERR_INVALID_NUM_ARGS: err_msg = "Invalid number of arguments"; break; + case PMI_ERR_INVALID_ARGS: err_msg = "Invalid args argument"; break; + case PMI_ERR_INVALID_NUM_PARSED: err_msg = "Invalid num_parsed length argument"; break; + case PMI_ERR_INVALID_KEYVALP: err_msg = "Invalid invalid keyvalp atgument"; break; + case PMI_ERR_INVALID_SIZE: err_msg = "Invalid size argument"; break; +#if defined(PMI_ERR_INVALID_KVS) + /* pmi.h calls this a valid return code but mpich doesn't define it (slurm does). wtf */ + case PMI_ERR_INVALID_KVS: err_msg = "Invalid kvs argument"; break; +#endif + case PMI_SUCCESS: err_msg = "Success"; break; + default: err_msg = "Unkown error"; + } + return err_msg; +} +#endif + + void orte_errmgr_base_abort(int error_code, char *fmt, ...) { va_list arglist; diff --git a/orte/mca/errmgr/errmgr.h b/orte/mca/errmgr/errmgr.h index 8c65412df2..24a6de292a 100644 --- a/orte/mca/errmgr/errmgr.h +++ b/orte/mca/errmgr/errmgr.h @@ -150,6 +150,16 @@ OBJ_CLASS_DECLARATION(orte_errmgr_predicted_map_t); #define ORTE_ERROR_LOG(n) \ orte_errmgr.log(n, __FILE__, __LINE__); +#if WANT_PMI_SUPPORT +#define ORTE_PMI_ERROR(pmi_err, pmi_func) \ + do { \ + opal_output(0, "%s[%s:%d:%s] %s: %s\n", \ + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ + __FILE__, __LINE__, __func__, \ + pmi_func, orte_errmgr_base_pmi_error(pmi_err)); \ + } while(0); +OPAL_DECLSPEC char* orte_errmgr_base_pmi_error(int pmi_err); +#endif /* * Framework Interfaces diff --git a/orte/mca/ess/pmi/ess_pmi_component.c b/orte/mca/ess/pmi/ess_pmi_component.c index ad46db39eb..4c7e0d8f37 100644 --- a/orte/mca/ess/pmi/ess_pmi_component.c +++ b/orte/mca/ess/pmi/ess_pmi_component.c @@ -1,5 +1,7 @@ /* * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All + * rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -17,6 +19,9 @@ #include "orte/constants.h" #include +#if WANT_CRAY_PMI2_EXT +#include +#endif #include "orte/util/proc_info.h" @@ -60,28 +65,46 @@ static int pmi_component_open(void) return ORTE_SUCCESS; } +static bool pmi_startup(void) +{ +#if WANT_CRAY_PMI2_EXT + int spawned, size, rank, appnum; + + if (PMI2_Initialized()) { + /* already initialized */ + return true; + } + /* if we can't startup PMI, we can't be used */ + if (PMI_SUCCESS != PMI2_Init(&spawned, &size, &rank, &appnum)) { + return false; + } + /* ignore the info - we'll pick it up elsewhere */ + return true; +#else + PMI_BOOL initialized; + + if (PMI_SUCCESS != PMI_Initialized(&initialized)) { + return false; + } + if (PMI_TRUE != initialized) { + if (PMI_SUCCESS != PMI_Init(&initialized)) { + return false; + } + } + return true; +#endif +} static int pmi_component_query(mca_base_module_t **module, int *priority) { - int spawned; - PMI_BOOL initialized; - /* for now, only use PMI when direct launched */ if (!ORTE_PROC_IS_HNP && NULL == orte_process_info.my_hnp_uri && - PMI_SUCCESS == PMI_Initialized(&initialized)) { - if (PMI_TRUE != initialized) { - /* if we can't startup the PMI, we can't be used */ - if (PMI_SUCCESS != PMI_Init(&spawned)) { - *priority = -1; - *module = NULL; - return ORTE_ERROR; - } - /* if PMI is available, use it */ - *priority = 100; - *module = (mca_base_module_t *)&orte_ess_pmi_module; - return ORTE_SUCCESS; - } + pmi_startup()) { + /* if PMI is available, use it */ + *priority = 100; + *module = (mca_base_module_t *)&orte_ess_pmi_module; + return ORTE_SUCCESS; } /* we can't run */ @@ -93,6 +116,11 @@ static int pmi_component_query(mca_base_module_t **module, int *priority) static int pmi_component_close(void) { +#if WANT_CRAY_PMI2_EXT + if (PMI2_Initialized()) { + PMI2_Finalize(); + } +#else PMI_BOOL initialized; /* if we weren't selected, cleanup */ @@ -100,6 +128,7 @@ static int pmi_component_close(void) PMI_TRUE == 
initialized) { PMI_Finalize(); } +#endif return ORTE_SUCCESS; } diff --git a/orte/mca/ess/pmi/ess_pmi_module.c b/orte/mca/ess/pmi/ess_pmi_module.c index cc7ac5eb95..9d3df602ba 100644 --- a/orte/mca/ess/pmi/ess_pmi_module.c +++ b/orte/mca/ess/pmi/ess_pmi_module.c @@ -80,14 +80,6 @@ orte_ess_base_module_t orte_ess_pmi_module = { static bool app_init_complete=false; static int pmi_maxlen=0; -static char* pmi_error(int pmi_err); -#define ORTE_PMI_ERROR(pmi_err, pmi_func) \ - do { \ - opal_output(0, "%s[%s:%d:%s] %s: %s\n", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ - __FILE__, __LINE__, __func__, \ - pmi_func, pmi_error(pmi_err)); \ - } while(0); /**** MODULE FUNCTIONS ****/ @@ -307,33 +299,3 @@ static void rte_abort(int error_code, bool report) { orte_ess_base_app_abort(error_code, report); } - -/* useful util */ -static char* pmi_error(int pmi_err) -{ - char * err_msg; - - switch(pmi_err) { - case PMI_FAIL: err_msg = "Operation failed"; break; - case PMI_ERR_INIT: err_msg = "PMI is not initialized"; break; - case PMI_ERR_NOMEM: err_msg = "Input buffer not large enough"; break; - case PMI_ERR_INVALID_ARG: err_msg = "Invalid argument"; break; - case PMI_ERR_INVALID_KEY: err_msg = "Invalid key argument"; break; - case PMI_ERR_INVALID_KEY_LENGTH: err_msg = "Invalid key length argument"; break; - case PMI_ERR_INVALID_VAL: err_msg = "Invalid value argument"; break; - case PMI_ERR_INVALID_VAL_LENGTH: err_msg = "Invalid value length argument"; break; - case PMI_ERR_INVALID_LENGTH: err_msg = "Invalid length argument"; break; - case PMI_ERR_INVALID_NUM_ARGS: err_msg = "Invalid number of arguments"; break; - case PMI_ERR_INVALID_ARGS: err_msg = "Invalid args argument"; break; - case PMI_ERR_INVALID_NUM_PARSED: err_msg = "Invalid num_parsed length argument"; break; - case PMI_ERR_INVALID_KEYVALP: err_msg = "Invalid invalid keyvalp atgument"; break; - case PMI_ERR_INVALID_SIZE: err_msg = "Invalid size argument"; break; -#if defined(PMI_ERR_INVALID_KVS) - /* pmi.h calls this a 
valid return code but mpich doesn't define it (slurm does). wtf */ - case PMI_ERR_INVALID_KVS: err_msg = "Invalid kvs argument"; break; -#endif - case PMI_SUCCESS: err_msg = "Success"; break; - default: err_msg = "Unkown error"; - } - return err_msg; -} diff --git a/orte/mca/grpcomm/pmi/grpcomm_pmi_component.c b/orte/mca/grpcomm/pmi/grpcomm_pmi_component.c index 00412bcbdf..c9b1d3b876 100644 --- a/orte/mca/grpcomm/pmi/grpcomm_pmi_component.c +++ b/orte/mca/grpcomm/pmi/grpcomm_pmi_component.c @@ -1,17 +1,22 @@ /* -*- C -*- -* -* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. -* $COPYRIGHT$ -* -* Additional copyrights may follow -* -* $HEADER$ -*/ + * + * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All + * rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ #include "orte_config.h" #include "orte/constants.h" #include <pmi.h> +#if WANT_CRAY_PMI2_EXT +#include <pmi2.h> +#endif #include "opal/mca/mca.h" #include "opal/mca/base/mca_base_param.h" @@ -50,40 +55,63 @@ int orte_grpcomm_pmi_open(void) int orte_grpcomm_pmi_close(void) { +#if WANT_CRAY_PMI2_EXT + if (PMI2_Initialized()) { + PMI2_Finalize(); + } +#else PMI_BOOL initialized; - /* if we weren't selected, cleanup if necessary */ + /* if we weren't selected, cleanup */ if (PMI_SUCCESS == PMI_Initialized(&initialized) && PMI_TRUE == initialized) { PMI_Finalize(); } +#endif + return ORTE_SUCCESS; } +static bool pmi_startup(void) +{ +#if WANT_CRAY_PMI2_EXT + int spawned, size, rank, appnum; + + if (PMI2_Initialized()) { + /* already initialized */ + return true; + } + /* if we can't startup PMI, we can't be used */ + if (PMI_SUCCESS != PMI2_Init(&spawned, &size, &rank, &appnum)) { + return false; + } + /* ignore the info - we'll pick it up elsewhere */ + return true; +#else + PMI_BOOL initialized; + + if (PMI_SUCCESS != PMI_Initialized(&initialized)) { + return false; + } + if (PMI_TRUE != initialized) { + if 
(PMI_SUCCESS != PMI_Init(&initialized)) { + return false; + } + } + return true; +#endif +} + int orte_grpcomm_pmi_component_query(mca_base_module_t **module, int *priority) { - int spawned; - PMI_BOOL initialized; - /* for now, only use PMI when direct launched */ if (!ORTE_PROC_IS_HNP && NULL == orte_process_info.my_hnp_uri && - PMI_SUCCESS == PMI_Initialized(&initialized)) { - /* if we aren't already initialized, then try */ - if (PMI_TRUE != initialized) { - /* if we can't startup the PMI, we can't be used */ - if (PMI_SUCCESS != PMI_Init(&spawned)) { - *priority = -1; - *module = NULL; - return ORTE_ERROR; - } - } - /* if we were able to startup PMI, or it was already - * running, then use us - */ + pmi_startup()) { + /* if PMI is available, use it */ *priority = 100; *module = (mca_base_module_t *)&orte_grpcomm_pmi_module; - return ORTE_SUCCESS; + return ORTE_SUCCESS; } /* we can't run */ diff --git a/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c b/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c index 7095888e62..ca29109303 100644 --- a/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c +++ b/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c @@ -2,6 +2,8 @@ * Copyright (c) 2007 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All + * rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -15,6 +17,9 @@ #include #include +#if WANT_CRAY_PMI2_EXT +#include <pmi2.h> +#endif #include "opal/dss/dss.h" #include "opal/mca/hwloc/base/base.h" @@ -62,14 +67,6 @@ orte_grpcomm_base_module_t orte_grpcomm_pmi_module = { static int pmi_encode(const void *val, size_t vallen); static void* pmi_decode(size_t *retlen); -static char* pmi_error(int pmi_err); -#define ORTE_PMI_ERROR(pmi_err, pmi_func) \ - do { \ - opal_output(0, "%s[%s:%d:%s] %s: %s\n", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ - __FILE__, __LINE__, __func__, \ - pmi_func, pmi_error(pmi_err)); \ - } while(0); static int setup_pmi(void); static int setup_key(const orte_process_name_t *name, const char *key); @@ -80,6 +77,45 @@ static char *pmi_attr_val = NULL; static int pmi_vallen_max = -1; static int pmi_keylen_max = -1; +/* Because Cray uses PMI2 extensions for some, but not all, + * PMI functions, we define a set of wrappers for those + * common functions we will use + */ +static int kvs_put(const char *key, const char *value) +{ +#if WANT_CRAY_PMI2_EXT + return PMI2_KVS_Put(key, value); +#else + return PMI_KVS_Put(pmi_kvs_name, key, value); +#endif +} + +static int kvs_get(const char *key, char *value, int valuelen) +{ +#if WANT_CRAY_PMI2_EXT + int len; + + return PMI2_KVS_Get(pmi_kvs_name, PMI2_ID_NULL, key, value, valuelen, &len); +#else + return PMI_KVS_Get(pmi_kvs_name, key, value, valuelen); +#endif +} + +static int kvs_commit(void) +{ +#if WANT_CRAY_PMI2_EXT + return PMI2_KVS_Fence(); +#else + int rc; + + if (PMI_SUCCESS != (rc = PMI_KVS_Commit(pmi_kvs_name))) { + return rc; + } + /* Barrier here to ensure all other procs have committed */ + return PMI_Barrier(); +#endif +} + /** * Initialize the module */ @@ -146,11 +182,19 @@ static int pmi_barrier(void) return ORTE_SUCCESS; } +#if WANT_CRAY_PMI2_EXT + /* Cray doesn't provide a barrier, so use the Fence function here */ + if (PMI_SUCCESS != (rc = PMI2_KVS_Fence())) { + ORTE_PMI_ERROR(rc, 
"PMI2_KVS_Fence"); + return ORTE_ERROR; + } +#else /* use the PMI barrier function */ if (PMI_SUCCESS != (rc = PMI_Barrier())) { ORTE_PMI_ERROR(rc, "PMI_Barrier"); return ORTE_ERROR; } +#endif OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, "%s grpcomm:pmi barrier complete", @@ -201,7 +245,7 @@ static int pmi_set_proc_attr(const char* attr_name, return rc; } - rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, pmi_attr_val); + rc = kvs_put(pmi_kvs_key, pmi_attr_val); if (PMI_SUCCESS != rc) { ORTE_PMI_ERROR(rc, "PMI_KVS_Put"); return ORTE_ERROR; @@ -237,7 +281,7 @@ static int pmi_get_proc_attr(const orte_process_name_t name, return rc; } - rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max); + rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max); if (PMI_SUCCESS != rc) { ORTE_PMI_ERROR(rc, "PMI_KVS_Get"); return ORTE_ERROR; @@ -259,6 +303,7 @@ static int pmi_get_proc_attr(const orte_process_name_t name, static int modex(opal_list_t *procs) { int rc, i; + size_t len; char *rml_uri, val[64]; orte_vpid_t v; orte_process_name_t name; @@ -286,7 +331,7 @@ static int modex(opal_list_t *procs) ORTE_ERROR_LOG(rc); return rc; } - rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, orte_process_info.nodename); + rc = kvs_put(pmi_kvs_key, orte_process_info.nodename); if (PMI_SUCCESS != rc) { ORTE_PMI_ERROR(rc, "PMI_KVS_Put"); return ORTE_ERROR; @@ -302,9 +347,17 @@ static int modex(opal_list_t *procs) } if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "RMLURI"))) { ORTE_ERROR_LOG(rc); + free(rml_uri); return rc; } - rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, rml_uri); + /* NTH: some characters are not allowed in pmi2 land so we need to encode */ + if (ORTE_SUCCESS != (rc = pmi_encode(rml_uri, strlen(rml_uri)))) { + ORTE_ERROR_LOG(rc); + free(rml_uri); + return rc; + } + /* encoding puts the encoded value in pmi_attr_val */ + rc = kvs_put(pmi_kvs_key, pmi_attr_val); if (PMI_SUCCESS != rc) { ORTE_PMI_ERROR(rc, "PMI_KVS_Put"); free(rml_uri); @@ -327,13 +380,22 
@@ static int modex(opal_list_t *procs) OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, "%s grpcomm:pmi LOCALE %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), locale)); + /* NTH: some characters are not allowed in pmi2 land - not sure + * if hwloc would use them, but just to be safe we need to encode + */ + if (ORTE_SUCCESS != (rc = pmi_encode(locale, strlen(locale)))) { + ORTE_ERROR_LOG(rc); + free(locale); + return rc; + } /* get the key */ if (ORTE_SUCCESS != (rc = setup_key(ORTE_PROC_MY_NAME, "HWLOC"))) { ORTE_ERROR_LOG(rc); + free(locale); return rc; } - /* enter the key-value */ - rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, locale); + /* encoding puts the encoded value in pmi_attr_val */ + rc = kvs_put(pmi_kvs_key, pmi_attr_val); if (PMI_SUCCESS != rc) { ORTE_PMI_ERROR(rc, "PMI_KVS_Put"); free(locale); @@ -355,7 +417,7 @@ static int modex(opal_list_t *procs) return rc; } snprintf(val, 64, "%lu", (unsigned long)pmap->local_rank); - rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, val); + rc = kvs_put(pmi_kvs_key, val); if (PMI_SUCCESS != rc) { ORTE_PMI_ERROR(rc, "PMI_KVS_Put"); return ORTE_ERROR; @@ -365,23 +427,18 @@ static int modex(opal_list_t *procs) return rc; } snprintf(val, 64, "%lu", (unsigned long)pmap->node_rank); - rc = PMI_KVS_Put(pmi_kvs_name, pmi_kvs_key, val); + rc = kvs_put(pmi_kvs_key, val); if (PMI_SUCCESS != rc) { ORTE_PMI_ERROR(rc, "PMI_KVS_Put"); return ORTE_ERROR; } /* commit our modex info */ - if (PMI_SUCCESS != (rc = PMI_KVS_Commit(pmi_kvs_name))) { + if (PMI_SUCCESS != (rc = kvs_commit())) { ORTE_PMI_ERROR(rc, "PMI_KVS_Commit failed"); return ORTE_ERROR; } - /* Barrier here to ensure all other procs have committed */ - if (ORTE_SUCCESS != (rc = pmi_barrier())) { - return rc; - } - /* harvest the oob endpoint info and hostname for all other procs * in our job so oob wireup can be completed and we * can setup their nidmap/pidmap @@ -393,28 +450,37 @@ static int modex(opal_list_t *procs) continue; } name.vpid = v; + if (ORTE_SUCCESS != (rc = 
setup_key(&name, "RMLURI"))) { ORTE_ERROR_LOG(rc); return rc; } - rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max); + rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max); if (PMI_SUCCESS != rc) { ORTE_PMI_ERROR(rc, "PMI_KVS_Get"); return ORTE_ERROR; } + /* Had to encode to protect against pmi2-prohibited chars */ + rml_uri = pmi_decode(&len); + if (NULL == rml_uri) { + return ORTE_ERROR; + } OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, "%s grpcomm:pmi: proc %s oob endpoint %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name), pmi_attr_val)); + ORTE_NAME_PRINT(&name), rml_uri)); /* set the contact info into the hash table */ - if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(pmi_attr_val))) { + if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) { + free(rml_uri); return rc; } + free(rml_uri); + if (ORTE_SUCCESS != (rc = setup_key(&name, "HOSTNAME"))) { ORTE_ERROR_LOG(rc); return rc; } - rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max); + rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max); if (PMI_SUCCESS != rc) { ORTE_PMI_ERROR(rc, "PMI_KVS_Get"); return ORTE_ERROR; @@ -459,22 +525,22 @@ static int modex(opal_list_t *procs) ORTE_ERROR_LOG(rc); return rc; } - rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max); + rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max); if (PMI_SUCCESS != rc) { ORTE_PMI_ERROR(rc, "PMI_KVS_Get"); return ORTE_ERROR; } - pmap->local_rank = (uint16_t)strtoul(pmi_attr_val, NULL, 10); + pmap->local_rank = (orte_local_rank_t)strtoul(pmi_attr_val, NULL, 10); if (ORTE_SUCCESS != (rc = setup_key(&name, "NODERANK"))) { ORTE_ERROR_LOG(rc); return rc; } - rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max); + rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max); if (PMI_SUCCESS != rc) { ORTE_PMI_ERROR(rc, "PMI_KVS_Get"); return ORTE_ERROR; } - pmap->node_rank = (uint16_t)strtoul(pmi_attr_val, NULL, 10); + 
pmap->node_rank = (orte_node_rank_t)strtoul(pmi_attr_val, NULL, 10); OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, "%s grpcomm:pmi: proc %s lrank %u nrank %u", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -482,55 +548,66 @@ static int modex(opal_list_t *procs) (unsigned int)pmap->local_rank, (unsigned int)pmap->node_rank)); #if OPAL_HAVE_HWLOC - /* get the proc's locality info, if available */ - if (ORTE_SUCCESS != (rc = setup_key(&name, "HWLOC"))) { - ORTE_ERROR_LOG(rc); - return rc; - } - rc = PMI_KVS_Get(pmi_kvs_name, pmi_kvs_key, pmi_attr_val, pmi_vallen_max); - /* don't error out here - if not found, that's okay */ - if (PMI_SUCCESS == rc) { - if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &name, ORTE_PROC_MY_NAME)) { - /* if this data is from myself, then set locality to all */ - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:pmi setting proc %s locale ALL", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); - pmap->locality = OPAL_PROC_ALL_LOCAL; - } else if (loc->daemon != ORTE_PROC_MY_DAEMON->vpid) { - /* this is on a different node, then mark as non-local */ - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:pmi setting proc %s locale NONLOCAL", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); - pmap->locality = OPAL_PROC_NON_LOCAL; - } else if (0 == strlen(pmi_attr_val)){ - /* if we share a node, but we don't know anything more, then - * mark us as on the node as this is all we know - */ - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:pmi setting proc %s locale NODE", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); - pmap->locality = OPAL_PROC_ON_NODE; - } else { - /* convert the locale to a cpuset */ - if (NULL == orte_grpcomm_base.working_cpuset) { - orte_grpcomm_base.working_cpuset = hwloc_bitmap_alloc(); + { + char *locale; + + /* get the proc's locality info, if available */ + if (ORTE_SUCCESS != (rc = setup_key(&name, "HWLOC"))) { + 
ORTE_ERROR_LOG(rc); + return rc; + } + rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max); + /* don't error out here - if not found, that's okay */ + if (PMI_SUCCESS == rc) { + if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &name, ORTE_PROC_MY_NAME)) { + /* if this data is from myself, then set locality to all */ + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + "%s grpcomm:pmi setting proc %s locale ALL", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&name))); + pmap->locality = OPAL_PROC_ALL_LOCAL; + } else if (loc->daemon != ORTE_PROC_MY_DAEMON->vpid) { + /* this is on a different node, then mark as non-local */ + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + "%s grpcomm:pmi setting proc %s locale NONLOCAL", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&name))); + pmap->locality = OPAL_PROC_NON_LOCAL; + } else if (0 == strlen(pmi_attr_val)){ + /* if we share a node, but we don't know anything more, then + * mark us as on the node as this is all we know + */ + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + "%s grpcomm:pmi setting proc %s locale NODE", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&name))); + pmap->locality = OPAL_PROC_ON_NODE; + } else { + /* we encoded to protect against pmi2 restrictions */ + locale = pmi_decode(&len); + if (NULL == locale) { + return ORTE_ERROR; + } + /* convert the locale to a cpuset */ + if (NULL == orte_grpcomm_base.working_cpuset) { + orte_grpcomm_base.working_cpuset = hwloc_bitmap_alloc(); + } + if (0 != hwloc_bitmap_list_sscanf(orte_grpcomm_base.working_cpuset, locale)) { + /* got a bad locale */ + ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); + free(locale); + return ORTE_ERR_VALUE_OUT_OF_BOUNDS; + } + free(locale); + /* determine relative location on our node */ + pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, + opal_hwloc_my_cpuset, + orte_grpcomm_base.working_cpuset); + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + 
"%s grpcommpmi setting proc %s locale %04x", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&name), pmap->locality)); } - if (0 != hwloc_bitmap_list_sscanf(orte_grpcomm_base.working_cpuset, pmi_attr_val)) { - /* got a bad locale */ - ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); - return ORTE_ERR_VALUE_OUT_OF_BOUNDS; - } - /* determine relative location on our node */ - pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, - opal_hwloc_my_cpuset, - orte_grpcomm_base.working_cpuset); - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcommpmi setting proc %s locale %04x", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name), pmap->locality)); } } #endif @@ -598,68 +675,56 @@ static void* pmi_decode(size_t *retlen) { return ret; } -/* useful util */ -static char* pmi_error(int pmi_err) -{ - char * err_msg; - - switch(pmi_err) { - case PMI_FAIL: err_msg = "Operation failed"; break; - case PMI_ERR_INIT: err_msg = "PMI is not initialized"; break; - case PMI_ERR_NOMEM: err_msg = "Input buffer not large enough"; break; - case PMI_ERR_INVALID_ARG: err_msg = "Invalid argument"; break; - case PMI_ERR_INVALID_KEY: err_msg = "Invalid key argument"; break; - case PMI_ERR_INVALID_KEY_LENGTH: err_msg = "Invalid key length argument"; break; - case PMI_ERR_INVALID_VAL: err_msg = "Invalid value argument"; break; - case PMI_ERR_INVALID_VAL_LENGTH: err_msg = "Invalid value length argument"; break; - case PMI_ERR_INVALID_LENGTH: err_msg = "Invalid length argument"; break; - case PMI_ERR_INVALID_NUM_ARGS: err_msg = "Invalid number of arguments"; break; - case PMI_ERR_INVALID_ARGS: err_msg = "Invalid args argument"; break; - case PMI_ERR_INVALID_NUM_PARSED: err_msg = "Invalid num_parsed length argument"; break; - case PMI_ERR_INVALID_KEYVALP: err_msg = "Invalid invalid keyvalp atgument"; break; - case PMI_ERR_INVALID_SIZE: err_msg = "Invalid size argument"; break; -#if defined(PMI_ERR_INVALID_KVS) - /* pmi.h calls this a valid return code 
but mpich doesn't define it (slurm does). wtf */ - case PMI_ERR_INVALID_KVS: err_msg = "Invalid kvs argument"; break; -#endif - case PMI_SUCCESS: err_msg = "Success"; break; - default: err_msg = "Unkown error"; - } - return err_msg; -} - static int setup_pmi(void) { int max_length, rc; +#if WANT_CRAY_PMI2_EXT + pmi_vallen_max = PMI2_MAX_VALLEN; +#else rc = PMI_KVS_Get_value_length_max(&pmi_vallen_max); if (PMI_SUCCESS != rc) { ORTE_PMI_ERROR(rc, "PMI_Get_value_length_max"); return ORTE_ERROR; } +#endif pmi_attr_val = malloc(pmi_vallen_max); if (NULL == pmi_attr_val) { return ORTE_ERR_OUT_OF_RESOURCE; } +#if WANT_CRAY_PMI2_EXT + /* TODO -- is this ok */ + max_length = 1024; +#else if (PMI_SUCCESS != (rc = PMI_KVS_Get_name_length_max(&max_length))) { ORTE_PMI_ERROR(rc, "PMI_KVS_Get_name_length_max"); return ORTE_ERROR; } +#endif pmi_kvs_name = malloc(max_length); if (NULL == pmi_kvs_name) { return ORTE_ERR_OUT_OF_RESOURCE; } + +#if WANT_CRAY_PMI2_EXT + rc = PMI2_Job_GetId(pmi_kvs_name, max_length); +#else rc = PMI_KVS_Get_my_name(pmi_kvs_name,max_length); +#endif if (PMI_SUCCESS != rc) { ORTE_PMI_ERROR(rc, "PMI_KVS_Get_my_name"); return ORTE_ERROR; } +#if WANT_CRAY_PMI2_EXT + pmi_keylen_max = PMI2_MAX_KEYLEN; +#else if (PMI_SUCCESS != (rc = PMI_KVS_Get_key_length_max(&pmi_keylen_max))) { ORTE_PMI_ERROR(rc, "PMI_KVS_Get_key_length_max"); return ORTE_ERROR; } +#endif pmi_kvs_key = malloc(pmi_keylen_max); return ORTE_SUCCESS;