From 95dacd208639f86c72acfe9eeae350ef1687d90f Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 23 Mar 2020 10:29:42 -0700 Subject: [PATCH] Fix singletons and ensure adequate PMIx version OMPI can only support PMIx v3 and above. PRRTE requires at least PMIx v4, so protect against the case where OMPI is built against an external PMIx v3. Fix check of PMIx_Init return code for singleton operations. Ensure that the PMIx framework gets properly opened. Signed-off-by: Ralph Castain --- config/ompi_setup_prrte.m4 | 11 ++++++++++- config/opal_check_pmi.m4 | 10 ++++++++++ ompi/interlib/interlib.c | 6 +++++- ompi/runtime/ompi_mpi_abort.c | 4 ++-- ompi/runtime/ompi_rte.c | 22 +++++++++++++++++++++- opal/runtime/opal_init.c | 3 ++- 6 files changed, 50 insertions(+), 6 deletions(-) diff --git a/config/ompi_setup_prrte.m4 b/config/ompi_setup_prrte.m4 index 3871440a0f..a83bd618f7 100644 --- a/config/ompi_setup_prrte.m4 +++ b/config/ompi_setup_prrte.m4 @@ -46,6 +46,15 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[ [AC_HELP_STRING([--enable-prte-prefix-by-default], [Make "mpirun ..." behave exactly the same as "mpirun --prefix \$prefix" (where \$prefix is the value given to --prefix in configure) (default:enabled)])]) + AS_IF([test "$opal_external_pmix_happy" = "yes" && test $opal_numerical_pmix_version -lt 4 && test "$enable_internal_rte" != "no"], + [AC_MSG_WARN([OMPI's internal runtime environment "PRRTE" does not support]) + AC_MSG_WARN([PMIx versions less than v4.x as they lack adequate tool]) + AC_MSG_WARN([support. 
You can, if desired, build OMPI against an earlier]) + AC_MSG_WARN([version of PMIx for strictly direct-launch purposes - e.g., using]) + AC_MSG_WARN([Slurm's srun to launch the job - by configuring with the]) + AC_MSG_WARN([--disable-internal-rte option.]) + AC_MSG_ERROR([Cannot continue])]) + AC_MSG_CHECKING([if RTE support is enabled]) if test "$enable_internal_rte" != "no"; then AC_MSG_RESULT([yes]) @@ -81,7 +90,7 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[ opal_prrte_prefix_arg= fi - opal_prrte_args="--prefix=$prefix --disable-dlopen $opal_prrte_prefix_arg $opal_prrte_libevent_arg $opal_prrte_hwloc_arg $opal_prrte_pmix_arg" + opal_prrte_args="--prefix=$prefix $opal_prrte_prefix_arg $opal_prrte_libevent_arg $opal_prrte_hwloc_arg $opal_prrte_pmix_arg" AS_IF([test "$enable_debug" = "yes"], [opal_prrte_args="--enable-debug $opal_prrte_args" CFLAGS="$OPAL_CFLAGS_BEFORE_PICKY $OPAL_VISIBILITY_CFLAGS -g"], diff --git a/config/opal_check_pmi.m4 b/config/opal_check_pmi.m4 index c5706f4bff..6f18c02cd1 100644 --- a/config/opal_check_pmi.m4 +++ b/config/opal_check_pmi.m4 @@ -125,6 +125,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[ ], [])], [AC_MSG_RESULT([found]) opal_external_pmix_version=4x + opal_numerical_pmix_version=4 opal_external_pmix_version_found=1 opal_external_pmix_happy=yes], [AC_MSG_RESULT([not found])])]) @@ -139,6 +140,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[ ], [])], [AC_MSG_RESULT([found]) opal_external_pmix_version=3x + opal_numerical_pmix_version=3 opal_external_pmix_version_found=1 opal_external_pmix_happy=yes], [AC_MSG_RESULT([not found])])]) @@ -153,6 +155,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[ ], [])], [AC_MSG_RESULT([found]) opal_external_pmix_version=2x + opal_numerical_pmix_version=2 opal_external_pmix_version_found=1 opal_external_pmix_happy=yes], [AC_MSG_RESULT([not found])])]) @@ -167,6 +170,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[ ], [])], [AC_MSG_RESULT([found]) opal_external_pmix_version=1x + opal_numerical_pmix_version=1 opal_external_pmix_version_found=1 
opal_external_have_pmix1=1 opal_external_pmix_happy=yes], @@ -179,6 +183,12 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[ opal_external_pmix_happy=no]) ]) + AS_IF([test "$opal_external_pmix_happy" = "yes" && test $opal_numerical_pmix_version -lt 3], + [AC_MSG_WARN([OMPI no longer supports PMIx versions prior to v3]) + AC_MSG_WARN([Please direct us to a more current PMIx release or]) + AC_MSG_WARN([use the internally provided one]) + AC_MSG_ERROR([Cannot continue])]) + AS_IF([test "$opal_external_pmix_happy" = "yes"], [$3 # add the new flags to our wrapper compilers diff --git a/ompi/interlib/interlib.c b/ompi/interlib/interlib.c index 69062c5de3..da18b69419 100644 --- a/ompi/interlib/interlib.c +++ b/ompi/interlib/interlib.c @@ -121,6 +121,10 @@ int ompi_interlib_declare(int threadlevel, char *version) PMIX_INFO_DESTRUCT(&info[3]); /* account for our refcount on pmix_init */ PMIx_Finalize(NULL, 0); - ret = opal_pmix_convert_status(rc); + if (ompi_singleton && PMIX_ERR_UNREACH == rc) { + ret = OMPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + } return ret; } diff --git a/ompi/runtime/ompi_mpi_abort.c b/ompi/runtime/ompi_mpi_abort.c index a550e7b9f6..a42109b5de 100644 --- a/ompi/runtime/ompi_mpi_abort.c +++ b/ompi/runtime/ompi_mpi_abort.c @@ -85,8 +85,8 @@ static void try_kill_peers(ompi_communicator_t *comm, procs = (ompi_process_name_t*) calloc(nprocs, sizeof(ompi_process_name_t)); if (NULL == procs) { - /* quick clean orte and get out */ - ompi_rte_abort(errno, "Abort: unable to alloc memory to kill procs"); + /* quick clean RTE and get out */ + ompi_rte_abort(errcode, "Abort: unable to alloc memory to kill procs"); } /* put all the local group procs in the abort list */ diff --git a/ompi/runtime/ompi_rte.c b/ompi/runtime/ompi_rte.c index 66a2db21ed..d459024231 100644 --- a/ompi/runtime/ompi_rte.c +++ b/ompi/runtime/ompi_rte.c @@ -61,7 +61,27 @@ opal_process_name_t pmix_name_wildcard = {UINT32_MAX-1, UINT32_MAX-1}; opal_process_name_t pmix_name_invalid = 
{UINT32_MAX, UINT32_MAX}; hwloc_cpuset_t ompi_proc_applied_binding = NULL; -pmix_process_info_t pmix_process_info = {0}; +pmix_process_info_t pmix_process_info = { + .my_name = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID}, + .nodename = NULL, + .pid = 0, + .top_session_dir = NULL, + .job_session_dir = NULL, + .proc_session_dir = NULL, + .my_local_rank = 0, + .my_node_rank = 0, + .num_local_peers = 0, + .num_procs = 0, + .app_num = 0, + .univ_size = 0, + .app_sizes = NULL, + .app_ldrs = NULL, + .cpuset = NULL, + .command = NULL, + .num_apps = 0, + .initial_wdir = NULL, + .reincarnation = 0 +}; bool pmix_proc_is_bound = false; bool ompi_singleton = false; diff --git a/opal/runtime/opal_init.c b/opal/runtime/opal_init.c index ab17e8e9bb..09e10a4fd5 100644 --- a/opal/runtime/opal_init.c +++ b/opal/runtime/opal_init.c @@ -54,6 +54,7 @@ #include "opal/mca/installdirs/base/base.h" #include "opal/mca/memory/base/base.h" #include "opal/mca/patcher/base/base.h" +#include "opal/mca/pmix/base/base.h" #include "opal/mca/memcpy/base/base.h" #include "opal/mca/hwloc/base/base.h" #include "opal/mca/reachable/base/base.h" @@ -630,7 +631,7 @@ opal_init_util(int* pargc, char*** pargv) static mca_base_framework_t *opal_init_frameworks[] = { &opal_hwloc_base_framework, &opal_memcpy_base_framework, &opal_memchecker_base_framework, &opal_backtrace_base_framework, &opal_timer_base_framework, &opal_event_base_framework, - &opal_shmem_base_framework, &opal_reachable_base_framework, + &opal_shmem_base_framework, &opal_reachable_base_framework, &opal_pmix_base_framework, NULL, };