From 0b5c1f2ea893cd74c42ab6988aa889e6fe291104 Mon Sep 17 00:00:00 2001 From: Joshua Ladd Date: Fri, 21 Jun 2013 15:28:14 +0000 Subject: [PATCH] Add 'generic' support for PMI2 (previously, we checked for PMI2 only on Cray systems.) If your resource manager (e.g. SLURM) has support for PMI2, then the --with-pmi configure flag will enable its usage. If you don't have PMI2, then you will fall back to regular old PMI1. This patch was submitted by Ralph Castain and reviewed and pushed by Josh Ladd. This should be added to cmr:v1.7:reviewer=jladd This commit was SVN r28666. --- config/opal_check_pmi.m4 | 86 ++++++++++---------- contrib/platform/lanl/cray_xe6/cray-common | 1 - ompi/mca/pubsub/pmi/pubsub_pmi.c | 8 +- ompi/mca/pubsub/pmi/pubsub_pmi_component.c | 41 ++-------- opal/mca/common/pmi/common_pmi.c | 6 +- opal/mca/db/pmi/db_pmi.c | 20 ++--- orte/mca/ess/pmi/ess_pmi_component.c | 5 -- orte/mca/grpcomm/pmi/grpcomm_pmi_component.c | 5 -- orte/mca/grpcomm/pmi/grpcomm_pmi_module.c | 12 +-- 9 files changed, 73 insertions(+), 111 deletions(-) diff --git a/config/opal_check_pmi.m4 b/config/opal_check_pmi.m4 index d6c3d1b61b..e4d4120fc6 100644 --- a/config/opal_check_pmi.m4 +++ b/config/opal_check_pmi.m4 @@ -27,13 +27,11 @@ AC_DEFUN([OPAL_CHECK_PMI],[ [AC_HELP_STRING([--with-pmi], [Build PMI support (default: no)])], [], with_pmi=no) - AC_ARG_WITH([cray-pmi2-ext], - [AC_HELP_STRING([--with-cray-pmi-ext], - [Include Cray PMI2 extensions (default: no)])], - [], with_cray_pmi2_ext=no) opal_enable_pmi=0 - opal_use_cray_pmi2_ext=0 + opal_use_pmi2=0 + opal_pmi_rpath= + opal_have_slurm_pmi2=0 # save flags opal_check_pmi_$1_save_CPPFLAGS="$CPPFLAGS" @@ -48,42 +46,63 @@ AC_DEFUN([OPAL_CHECK_PMI],[ AC_MSG_CHECKING([if user requested PMI support]) AS_IF([test "$with_pmi" = "no"], [AC_MSG_RESULT([no]) - opal_use_cray_pmi2_ext=0 $3], [AC_MSG_RESULT([yes]) - AC_MSG_CHECKING([if PMI support installed]) + AC_MSG_CHECKING([if PMI or PMI2 support installed]) # cannot use OMPI_CHECK_PACKAGE as
its backend header # support appends "include" to the path, which won't # work with slurm :-( AS_IF([test ! -z "$with_pmi" -a "$with_pmi" != "yes"], [AS_IF([test -d "$with_pmi/lib64"], [opal_check_pmi_$1_LDFLAGS="-L$with_pmi/lib64" - opal_check_pmi_$1_LIBS="-lpmi -Wl,-rpath=$with_pmi/lib64"], + opal_pmi_rpath="$with_pmi/lib64"], [opal_check_pmi_$1_LDFLAGS="-L$with_pmi/lib" - opal_check_pmi_$1_LIBS="-lpmi -Wl,-rpath=$with_pmi/lib"]) - AS_IF([test -f "$with_pmi/include/pmi.h"], - [opal_check_pmi_$1_CPPFLAGS="-I$with_pmi/include"], - [AS_IF([test -f "$with_pmi/include/slurm/pmi.h"], - [opal_check_pmi_$1_CPPFLAGS="-I$with_pmi/include/slurm"], + opal_pmi_rpath="$with_pmi/lib"]) + # default to using PMI-2 if it is present + AS_IF([test -f "$with_pmi/include/pmi2.h" -o -f "$with_pmi/include/pmi.h"], + [opal_check_pmi_$1_CPPFLAGS="-I$with_pmi/include" + AS_IF([test -f "$with_pmi/include/pmi2.h"], + [opal_use_pmi2=1 + AC_MSG_RESULT([PMI2 support found])], + [opal_use_pmi2=0 + AC_MSG_RESULT([PMI support found])])], + [AS_IF([test -f "$with_pmi/include/slurm/pmi2.h" -o -f "$with_pmi/include/slurm/pmi.h"], + [opal_check_pmi_$1_CPPFLAGS="-I$with_pmi/include/slurm" + AS_IF([test -f "$with_pmi/include/slurm/pmi2.h"], + [opal_use_pmi2=1 + opal_have_slurm_pmi2=1 + AC_MSG_RESULT([Slurm PMI2 support found])], + [opal_use_pmi2=0 + AC_MSG_RESULT([Slurm PMI support found])])], [AC_MSG_RESULT([not found]) - AC_MSG_WARN([PMI support requested (via --with-pmi) but pmi.h]) - AC_MSG_WARN([not found under locations:]) + AC_MSG_WARN([PMI support requested (via --with-pmi) but neither pmi.h]) + AC_MSG_WARN([nor pmi2.h were found under locations:]) AC_MSG_WARN([ $with_pmi/include]) AC_MSG_WARN([ $with_pmi/include/slurm]) AC_MSG_WARN([Specified path: $with_pmi]) AC_MSG_ERROR([Aborting]) - $3])])], - [AS_IF([test -f "/usr/include/slurm/pmi.h"], - [opal_check_pmi_$1_CPPFLAGS="-I/usr/include/slurm"])]) + $3])])]) + + AS_IF([test $opal_use_pmi2 = 1], + [AS_IF([test $opal_have_slurm_pmi2 = 1], + [ # 
slurm puts pmi2 into a separate lib + opal_check_pmi_$1_LIBS="-lpmi2 -lpmi -Wl,-rpath=$opal_pmi_rpath"], + [opal_check_pmi_$1_LIBS="-lpmi -Wl,-rpath=$opal_pmi_rpath"])], + [opal_check_pmi_$1_LIBS="-lpmi -Wl,-rpath=$opal_pmi_rpath"]) LDFLAGS="$LDFLAGS $opal_check_pmi_$1_LDFLAGS" CPPFLAGS="$CPPFLAGS $opal_check_pmi_$1_CPPFLAGS" LIBS="$LIBS $opal_check_pmi_$1_LIBS" opal_have_pmi_support=no - AC_CHECK_HEADERS([pmi.h], - [AC_CHECK_LIB([pmi], [PMI_Init], - [opal_have_pmi_support=yes])]) + AS_IF([test "$opal_use_pmi2" = "1"], + [AC_CHECK_HEADERS([pmi2.h], + [AC_CHECK_LIB([pmi2], [PMI2_Init], + [opal_have_pmi_support=yes])])], + [AC_CHECK_HEADERS([pmi.h], + [AC_CHECK_LIB([pmi], [PMI_Init], + [opal_have_pmi_support=yes])])]) + AC_MSG_CHECKING([PMI2 or PMI support enabled]) AS_IF([test "$opal_have_pmi_support" = "yes"], [AC_MSG_RESULT([yes]) opal_enable_pmi=1 @@ -95,24 +114,7 @@ AC_DEFUN([OPAL_CHECK_PMI],[ AC_MSG_WARN([PMI support requested (via --with-pmi) but not found.]) AC_MSG_ERROR([Aborting.]) $3]) - - AC_MSG_CHECKING([if user requested Cray PMI2 extensions]) - AS_IF([test "$with_cray_pmi2_ext" = "no"], - [AC_MSG_RESULT([no]) - opal_use_pmi2_ext=0], - [AC_MSG_RESULT([yes]) - # check to see if pmi2.h header is present. if it is, then we - # will use some of the functions in it. 
- AC_MSG_CHECKING([if PMI2 extensions installed]) - AS_IF([test -f "$with_pmi/include/pmi2.h"], - [opal_use_pmi2_ext=1 - AC_MSG_RESULT(yes)], - [AC_MSG_RESULT([no]) - AC_MSG_WARN([PMI2 extensions requested (via --with-cray-pmi2-ext) but not found.]) - AC_MSG_ERROR([Aborting.]) - opal_use_pmi2_ext=0 - opal_enable_pmi=0 - $3])])]) + ]) # restore flags - have to add CPPFLAGS so base functions can find pmi.h CPPFLAGS="$opal_check_pmi_$1_save_CPPFLAGS $opal_check_pmi_$1_CPPFLAGS" @@ -122,8 +124,8 @@ AC_DEFUN([OPAL_CHECK_PMI],[ AC_DEFINE_UNQUOTED([WANT_PMI_SUPPORT], [$opal_enable_pmi], [Whether we want PMI support]) - AC_DEFINE_UNQUOTED([WANT_CRAY_PMI2_EXT], - [$opal_use_pmi2_ext], - [Whether we want to use Cray PMI2 extensions]) + AC_DEFINE_UNQUOTED([WANT_PMI2_SUPPORT], + [$opal_use_pmi2], + [Whether we want to use PMI2]) AM_CONDITIONAL(WANT_PMI_SUPPORT, [test "$opal_enable_pmi" = 1]) ]) diff --git a/contrib/platform/lanl/cray_xe6/cray-common b/contrib/platform/lanl/cray_xe6/cray-common index 6331233d77..e043aaf70c 100644 --- a/contrib/platform/lanl/cray_xe6/cray-common +++ b/contrib/platform/lanl/cray_xe6/cray-common @@ -9,7 +9,6 @@ with_xpmem=/opt/cray/xpmem/0.1-2.0400.30792.5.6.gem # enable Cray PMI support with_pmi=/opt/cray/pmi/2.1.4-1.0000.8596.8.9.gem -with_cray_pmi2_ext=yes # enable ugni btl with_ugni=/opt/cray/ugni/2.3-1.0400.4127.5.20.gem diff --git a/ompi/mca/pubsub/pmi/pubsub_pmi.c b/ompi/mca/pubsub/pmi/pubsub_pmi.c index 1b30dca261..b3ebd4d67d 100644 --- a/ompi/mca/pubsub/pmi/pubsub_pmi.c +++ b/ompi/mca/pubsub/pmi/pubsub_pmi.c @@ -13,7 +13,7 @@ #include "ompi/constants.h" #include -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT #include #endif @@ -37,7 +37,7 @@ static int publish ( char *service_name, ompi_info_t *info, char *port_name ) { int rc; -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT if (PMI_SUCCESS != (rc = PMI2_Nameserv_publish(service_name, NULL, port_name))) { OMPI_ERROR_LOG(rc); return OMPI_ERROR; @@ -56,7 +56,7 @@ static char* lookup ( char 
*service_name, ompi_info_t *info ) char *port=NULL; int rc; -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT port = (char*)malloc(1024*sizeof(char)); /* arbitrary size */ if (PMI_SUCCESS != (rc = PMI2_Nameserv_lookup(service_name, NULL, port, 1024))) { OMPI_ERROR_LOG(rc); @@ -78,7 +78,7 @@ static int unpublish ( char *service_name, ompi_info_t *info ) { int rc; -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT if (PMI_SUCCESS != (rc = PMI2_Nameserv_unpublish(service_name, NULL))) { OMPI_ERROR_LOG(rc); return OMPI_ERROR; diff --git a/ompi/mca/pubsub/pmi/pubsub_pmi_component.c b/ompi/mca/pubsub/pmi/pubsub_pmi_component.c index 5a55decbdb..eaf2e4b560 100644 --- a/ompi/mca/pubsub/pmi/pubsub_pmi_component.c +++ b/ompi/mca/pubsub/pmi/pubsub_pmi_component.c @@ -12,10 +12,7 @@ #include "ompi_config.h" -#include -#if WANT_CRAY_PMI2_EXT -#include -#endif +#include "opal/mca/common/pmi/common_pmi.h" #include "ompi/constants.h" #include "ompi/mca/rte/rte.h" @@ -74,39 +71,13 @@ static int pubsub_pmi_component_close(void) static int pubsub_pmi_component_query(mca_base_module_t **module, int *priority) { /* for now, only use PMI when direct launched */ - if (NULL == ompi_process_info.my_hnp_uri) { - goto cleanup; + if (NULL != ompi_process_info.my_hnp_uri && + mca_common_pmi_init ()) { + *priority = my_priority; + *module = (mca_base_module_t *)&ompi_pubsub_pmi_module; + return OMPI_SUCCESS; } - -#if WANT_CRAY_PMI2_EXT - { - int spawned, size, rank, appnum; - if (PMI2_Initialized ()) return OMPI_SUCCESS; - if (PMI_SUCCESS != PMI2_Init(&spawned, &size, &rank, &appnum)) { - goto cleanup; - } - } -#else - { - PMI_BOOL initialized; - - if (PMI_SUCCESS != PMI_Initialized(&initialized)) { - goto cleanup; - } - - if (PMI_TRUE != initialized && PMI_SUCCESS != PMI_Init(&initialized)) { - goto cleanup; - } - } -#endif - - /* if PMI is available, use it */ - *priority = my_priority; - *module = (mca_base_module_t *)&ompi_pubsub_pmi_module; - return OMPI_SUCCESS; - - cleanup: /* we can't run */ 
*priority = -1; *module = NULL; diff --git a/opal/mca/common/pmi/common_pmi.c b/opal/mca/common/pmi/common_pmi.c index 2ee0993c36..df709a461c 100644 --- a/opal/mca/common/pmi/common_pmi.c +++ b/opal/mca/common/pmi/common_pmi.c @@ -18,7 +18,7 @@ #include #include -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT #include #endif @@ -31,7 +31,7 @@ bool mca_common_pmi_init (void) { return true; } -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT { int spawned, size, rank, appnum; @@ -70,7 +70,7 @@ void mca_common_pmi_finalize (void) { } if (0 == --mca_common_pmi_init_count) { -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT PMI2_Finalize (); #else PMI_Finalize (); diff --git a/opal/mca/db/pmi/db_pmi.c b/opal/mca/db/pmi/db_pmi.c index 30620c3699..0e46b22de1 100644 --- a/opal/mca/db/pmi/db_pmi.c +++ b/opal/mca/db/pmi/db_pmi.c @@ -15,7 +15,7 @@ #include #include #include -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT #include #endif @@ -80,7 +80,7 @@ static int pmi_keylen_max = -1; */ static int kvs_put(const char *key, const char *value) { -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT return PMI2_KVS_Put(key, value); #else return PMI_KVS_Put(pmi_kvs_name, key, value); @@ -89,7 +89,7 @@ static int kvs_put(const char *key, const char *value) static int kvs_get(const char *key, char *value, int valuelen) { -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT int len; return PMI2_KVS_Get(pmi_kvs_name, PMI2_ID_NULL, key, value, valuelen, &len); @@ -98,7 +98,7 @@ static int kvs_get(const char *key, char *value, int valuelen) #endif } -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT static char escape_char = '$'; static char *illegal = "/;="; static char *sub = "012"; @@ -156,7 +156,7 @@ static int store(const opal_identifier_t *uid, switch (type) { case OPAL_STRING: -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT { /* the blasted Cray PMI implementation marked a number of common * ASCII characters as "illegal", so if we are on one of those @@ -391,7 +391,7 @@ static char* fetch_string(const 
char *key) /* cleanup */ free(tmp_val); -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT { /* the blasted Cray PMI implementation marked a number of common * ASCII characters as "illegal", so if we are on one of those @@ -555,7 +555,7 @@ static int setup_pmi(void) { int max_length, rc; -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT pmi_vallen_max = PMI2_MAX_VALLEN; #else rc = PMI_KVS_Get_value_length_max(&pmi_vallen_max); @@ -565,7 +565,7 @@ static int setup_pmi(void) } #endif -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT /* TODO -- is this ok */ max_length = 1024; #else @@ -579,7 +579,7 @@ static int setup_pmi(void) return OPAL_ERR_OUT_OF_RESOURCE; } -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT rc = PMI2_Job_GetId(pmi_kvs_name, max_length); #else rc = PMI_KVS_Get_my_name(pmi_kvs_name,max_length); @@ -589,7 +589,7 @@ static int setup_pmi(void) return OPAL_ERROR; } -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT pmi_keylen_max = PMI2_MAX_KEYLEN; #else if (PMI_SUCCESS != (rc = PMI_KVS_Get_key_length_max(&pmi_keylen_max))) { diff --git a/orte/mca/ess/pmi/ess_pmi_component.c b/orte/mca/ess/pmi/ess_pmi_component.c index e7ad5a90ee..2e0a78bbc8 100644 --- a/orte/mca/ess/pmi/ess_pmi_component.c +++ b/orte/mca/ess/pmi/ess_pmi_component.c @@ -18,11 +18,6 @@ #include "orte_config.h" #include "orte/constants.h" -#include -#if WANT_CRAY_PMI2_EXT -#include -#endif - #include "opal/mca/common/pmi/common_pmi.h" #include "orte/util/proc_info.h" diff --git a/orte/mca/grpcomm/pmi/grpcomm_pmi_component.c b/orte/mca/grpcomm/pmi/grpcomm_pmi_component.c index bc388aef32..274f39b92a 100644 --- a/orte/mca/grpcomm/pmi/grpcomm_pmi_component.c +++ b/orte/mca/grpcomm/pmi/grpcomm_pmi_component.c @@ -13,11 +13,6 @@ #include "orte_config.h" #include "orte/constants.h" -#include -#if WANT_CRAY_PMI2_EXT -#include -#endif - #include "opal/mca/mca.h" #include "opal/mca/common/pmi/common_pmi.h" diff --git a/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c b/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c index 
8847f2b121..1cebab59fd 100644 --- a/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c +++ b/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c @@ -18,7 +18,7 @@ #include #include -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT #include #endif @@ -65,7 +65,7 @@ static int init(void) { int max_length, rc; -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT /* TODO -- is this ok */ max_length = 1024; #else @@ -79,7 +79,7 @@ static int init(void) return ORTE_ERR_OUT_OF_RESOURCE; } -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT rc = PMI2_Job_GetId(pmi_kvs_name, max_length); #else rc = PMI_KVS_Get_my_name(pmi_kvs_name,max_length); @@ -136,8 +136,8 @@ static int pmi_barrier(orte_grpcomm_collective_t *coll) return ORTE_SUCCESS; } -#if WANT_CRAY_PMI2_EXT - /* Cray doesn't provide a barrier, so use the Fence function here */ +#if WANT_PMI2_SUPPORT + /* PMI2 doesn't provide a barrier, so use the Fence function here */ if (PMI_SUCCESS != (rc = PMI2_KVS_Fence())) { OPAL_PMI_ERROR(rc, "PMI2_KVS_Fence"); return ORTE_ERROR; @@ -187,7 +187,7 @@ static int modex(orte_grpcomm_collective_t *coll) /* our RTE data was constructed and pushed in the ESS pmi component */ /* commit our modex info */ -#if WANT_CRAY_PMI2_EXT +#if WANT_PMI2_SUPPORT PMI2_KVS_Fence(); #else {