From e03bc93fb7d8579e07c307285128951074b35a5d Mon Sep 17 00:00:00 2001 From: Samuel Gutierrez Date: Sun, 6 Nov 2011 17:28:40 +0000 Subject: [PATCH] only use pmi grpcomm and pubsub during the direct launch case. use PMI environment variable to setup vpid in ess alps on cray xe systems. add pmi test code. This commit was SVN r25447. --- ompi/mca/pubsub/pmi/pubsub_pmi_component.c | 3 +- orte/mca/ess/alps/configure.m4 | 20 ----- orte/mca/ess/alps/ess_alps_component.c | 26 +++++-- orte/mca/ess/alps/ess_alps_module.c | 32 ++++---- orte/mca/grpcomm/pmi/grpcomm_pmi_component.c | 4 +- orte/test/system/getenv_pmi.c | 81 ++++++++++++++++++++ 6 files changed, 123 insertions(+), 43 deletions(-) create mode 100644 orte/test/system/getenv_pmi.c diff --git a/ompi/mca/pubsub/pmi/pubsub_pmi_component.c b/ompi/mca/pubsub/pmi/pubsub_pmi_component.c index c36db09ee5..15a650d0df 100644 --- a/ompi/mca/pubsub/pmi/pubsub_pmi_component.c +++ b/ompi/mca/pubsub/pmi/pubsub_pmi_component.c @@ -115,7 +115,8 @@ static bool pmi_startup(void) static int pubsub_pmi_component_query(mca_base_module_t **module, int *priority) { /* for now, only use PMI when direct launched */ - if (ORTE_PROC_IS_MPI && + if (NULL == orte_process_info.my_hnp_uri && + ORTE_PROC_IS_MPI && pmi_startup()) { /* if PMI is available, use it */ *priority = my_priority; diff --git a/orte/mca/ess/alps/configure.m4 b/orte/mca/ess/alps/configure.m4 index e2662e4e6d..72bdc38255 100644 --- a/orte/mca/ess/alps/configure.m4 +++ b/orte/mca/ess/alps/configure.m4 @@ -48,35 +48,15 @@ AC_DEFUN([MCA_orte_ess_alps_CONFIG],[ [orte_mca_ess_alps_have_cnos=1], [orte_mca_ess_alps_have_cnos=0])]) - dnl now check for PMI support - ORTE_CHECK_PMI([ess_alps], - [orte_mca_ess_alps_have_pmi=1], - [orte_mca_ess_alps_have_pmi=0]) - dnl was ess alps requested? 
ORTE_CHECK_ALPS([ess_alps], [orte_mca_ess_alps_happy="yes"], [orte_mca_ess_alps_happy="no"]) - dnl cannot continue if we don't have CNOS or PMI - AS_IF([test "$orte_mca_ess_alps_happy" = "yes" -a "$orte_mca_ess_alps_have_cnos" = "0" -a "$orte_mca_ess_alps_have_pmi" = "0"], - [AC_MSG_WARN([Alps support requested (via --with-alps) but adequate support was not found.]) - AC_MSG_ERROR([Cannot continue.])]) - - dnl cannot continue if we have both CNOS and PMI. this will probably - dnl never happen, but it can't hurt to also check for this case. - AS_IF([test "$orte_mca_ess_alps_happy" = "yes" -a "$orte_mca_ess_alps_have_cnos" = "1" -a "$orte_mca_ess_alps_have_pmi" = "1"], - [AC_MSG_WARN([Alps support requested (via --with-alps) but CNOS and PMI support was found.]) - AC_MSG_ERROR([Cannot continue.])]) - AC_DEFINE_UNQUOTED([ORTE_MCA_ESS_ALPS_HAVE_CNOS], [$orte_mca_ess_alps_have_cnos], [Whether we have CNOS support in alps ess or not]) - AC_DEFINE_UNQUOTED([ORTE_MCA_ESS_ALPS_HAVE_PMI], - [$orte_mca_ess_alps_have_pmi], - [Whether we have PMI support in alps ess or not]) - AS_IF([test "$orte_mca_ess_alps_happy" = "yes"], [$1], [$2]) diff --git a/orte/mca/ess/alps/ess_alps_component.c b/orte/mca/ess/alps/ess_alps_component.c index 13dcd725b4..1a798c0428 100644 --- a/orte/mca/ess/alps/ess_alps_component.c +++ b/orte/mca/ess/alps/ess_alps_component.c @@ -5,14 +5,16 @@ * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ * * These symbols are in a file by themselves to provide nice linker @@ -59,22 +61,34 @@ orte_ess_base_component_t mca_ess_alps_component = { } }; - int orte_ess_alps_component_open(void) { return ORTE_SUCCESS; } - int orte_ess_alps_component_query(mca_base_module_t **module, int *priority) { +#if ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1 *priority = 35; *module = (mca_base_module_t *)&orte_ess_alps_module; return ORTE_SUCCESS; +#else + /* if i'm a daemon, then only i can safely select this component if + * PMI_GNI_LOC_ADDR exists */ + if (NULL != getenv("PMI_GNI_LOC_ADDR") && + ORTE_PROC_IS_DAEMON) { + *priority = 35; + *module = (mca_base_module_t *)&orte_ess_alps_module; + return ORTE_SUCCESS; + } + /* can't be selected, so disqualify myself */ + *priority = -1; + *module = NULL; + return ORTE_ERROR; +#endif /* ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1 */ } - int orte_ess_alps_component_close(void) { diff --git a/orte/mca/ess/alps/ess_alps_module.c b/orte/mca/ess/alps/ess_alps_module.c index 66184c2263..f109157acd 100644 --- a/orte/mca/ess/alps/ess_alps_module.c +++ b/orte/mca/ess/alps/ess_alps_module.c @@ -29,8 +29,6 @@ # elif defined(HAVE_CATAMOUNT_CNOS_MPI_OS_H) # include "catamount/cnos_mpi_os.h" # endif -#elif ORTE_MCA_ESS_ALPS_HAVE_PMI == 1 -# include "pmi.h" #endif #include "orte/util/show_help.h" @@ -47,6 +45,8 @@ #include "orte/mca/ess/base/base.h" #include "orte/mca/ess/alps/ess_alps.h" +#include + static int alps_set_name(void); static int rte_init(void); static int rte_finalize(void); @@ -78,23 +78,25 @@ get_vpid(orte_vpid_t *outvp, #if ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1 *outvp = (orte_vpid_t)cnos_get_rank() + start_vpid; return ORTE_SUCCESS; -#else /* using PMI */ - /* TODO SKG - PMI utility functions should be in a common area */ - int rank; - PMI_BOOL pmi_initialized; +#else + /* Cray XE6 Notes: + * using PMI_GNI_LOC_ADDR to set vpid. 
+ */ + int rank = 0; + char *env; - if (PMI_SUCCESS != PMI_Initialized(&pmi_initialized)) { + if (NULL == (env = getenv("PMI_GNI_LOC_ADDR"))) { + OPAL_OUTPUT_VERBOSE((0, orte_ess_base_output, + "PMI_GNI_LOC_ADDR not found, cannot continue\n")); ORTE_ERROR_LOG(ORTE_ERROR); return ORTE_ERROR; } - if (PMI_FALSE == pmi_initialized) { - int tmp; - if (PMI_SUCCESS != PMI_Init(&tmp)) { - ORTE_ERROR_LOG(ORTE_ERROR); - return ORTE_ERROR; - } - } - if (PMI_SUCCESS != PMI_Get_rank(&rank)) { + errno = 0; + rank = (int)strtol(env, (char **)NULL, 10); + if (0 != errno) { + OPAL_OUTPUT_VERBOSE((0, orte_ess_base_output, + "strtol error detected at %s:%d\n", __FILE__, + __LINE__)); ORTE_ERROR_LOG(ORTE_ERROR); return ORTE_ERROR; } diff --git a/orte/mca/grpcomm/pmi/grpcomm_pmi_component.c b/orte/mca/grpcomm/pmi/grpcomm_pmi_component.c index 17796b6f82..ed7d4b9c70 100644 --- a/orte/mca/grpcomm/pmi/grpcomm_pmi_component.c +++ b/orte/mca/grpcomm/pmi/grpcomm_pmi_component.c @@ -122,7 +122,9 @@ static bool pmi_startup(void) int orte_grpcomm_pmi_component_query(mca_base_module_t **module, int *priority) { - if (ORTE_PROC_IS_MPI && + /* only use PMI when direct launched */ + if (NULL == orte_process_info.my_hnp_uri && + ORTE_PROC_IS_MPI && pmi_startup()) { /* if PMI is available, make it available for use by MPI procs */ *priority = my_priority; diff --git a/orte/test/system/getenv_pmi.c b/orte/test/system/getenv_pmi.c new file mode 100644 index 0000000000..698863953e --- /dev/null +++ b/orte/test/system/getenv_pmi.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "pmi.h"
+
+/* PMI smoke test: initialize PMI, query size/rank/clique, and let the first
+ * clique rank dump its environment, one "===[rank]: " tagged line per entry.
+ * Returns EXIT_SUCCESS only if every PMI call (PMI_Finalize too) succeeds.
+ * Useful debug environment variables: PMI_DEBUG
+ */
+
+int main(int argc, char **argv, char **envp)
+{
+    int pmi_rank = -1;
+    int pmi_process_group_size = -1;
+    int num_local_procs = 0;
+    int *local_rank_ids = NULL;
+    int spawned = PMI_FALSE;
+    int rc = EXIT_FAILURE;
+    const char *err = NULL;
+    PMI_BOOL pmi_initialized = PMI_FALSE;
+
+    /* sanity: PMI must be queryable and must not be initialized already */
+    if (PMI_SUCCESS != PMI_Initialized(&pmi_initialized) ||
+        PMI_TRUE == pmi_initialized) {
+        fprintf(stderr, "=== ERROR: PMI sanity failure\n");
+        return EXIT_FAILURE;
+    }
+    if (PMI_SUCCESS != PMI_Init(&spawned)) {
+        err = "PMI_Init failure!";
+        goto done;
+    }
+    pmi_initialized = PMI_TRUE; /* record it so done: really finalizes PMI */
+    if (PMI_SUCCESS != PMI_Get_size(&pmi_process_group_size)) {
+        err = "PMI_Get_size failure!";
+        goto done;
+    }
+    if (PMI_SUCCESS != PMI_Get_rank(&pmi_rank)) {
+        err = "PMI_Get_rank failure!";
+        goto done;
+    }
+    if (PMI_SUCCESS != PMI_Get_clique_size(&num_local_procs)) {
+        err = "PMI_Get_clique_size failure!";
+        goto done;
+    }
+    if (NULL == (local_rank_ids = calloc(num_local_procs, sizeof *local_rank_ids))) {
+        err = "out of resources";
+        goto done;
+    }
+    if (PMI_SUCCESS != PMI_Get_clique_ranks(local_rank_ids, num_local_procs)) {
+        err = "PMI_Get_clique_ranks failure!";
+        goto done;
+    }
+    /* first listed local rank prints env info; never index an empty clique */
+    if (num_local_procs > 0 && pmi_rank == local_rank_ids[0]) {
+        for (; NULL != envp && NULL != *envp; ++envp) {
+            printf("===[%d]: %s\n", pmi_rank, *envp);
+        }
+    }
+    rc = EXIT_SUCCESS;
+
+done:
+    free(local_rank_ids); /* free(NULL) is a no-op on the early-exit paths */
+    if (PMI_TRUE == pmi_initialized) {
+        if (PMI_SUCCESS != PMI_Finalize()) {
+            err = "PMI_Finalize failure!";
+            rc = EXIT_FAILURE; /* a failed finalize is a failed test */
+        }
+    }
+    if (NULL != err) {
+        fprintf(stderr, "=== ERROR [rank:%d] %s\n", pmi_rank, err);
+    }
+    return rc;
+}