From 95dacd208639f86c72acfe9eeae350ef1687d90f Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 23 Mar 2020 10:29:42 -0700 Subject: [PATCH 1/3] Fix singletons and ensure adequate PMIx version OMPI can only support PMIx v3 and above. PRRTE requires at least PMIx v4, so protect against the case where OMPI is built against an external PMIx v3. Fix check of PMIx_Init return code for singleton operations. Ensure that the PMIx framework gets properly opened. Signed-off-by: Ralph Castain --- config/ompi_setup_prrte.m4 | 11 ++++++++++- config/opal_check_pmi.m4 | 10 ++++++++++ ompi/interlib/interlib.c | 6 +++++- ompi/runtime/ompi_mpi_abort.c | 4 ++-- ompi/runtime/ompi_rte.c | 22 +++++++++++++++++++++- opal/runtime/opal_init.c | 3 ++- 6 files changed, 50 insertions(+), 6 deletions(-) diff --git a/config/ompi_setup_prrte.m4 b/config/ompi_setup_prrte.m4 index 3871440a0f..a83bd618f7 100644 --- a/config/ompi_setup_prrte.m4 +++ b/config/ompi_setup_prrte.m4 @@ -46,6 +46,15 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[ [AC_HELP_STRING([--enable-prte-prefix-by-default], [Make "mpirun ..." behave exactly the same as "mpirun --prefix \$prefix" (where \$prefix is the value given to --prefix in configure) (default:enabled)])]) + AS_IF([test "$opal_external_pmix_happy" = "yes" && test $opal_numerical_pmix_version -lt 4 && test "$enable_internal_rte" != "no"], + [AC_MSG_WARN([OMPI's internal runtime environment "PRRTE" does not support]) + AC_MSG_WARN([PMIx versions less than v4.x as they lack adequate tool]) + AC_MSG_WARN([support. 
You can, if desired, build OMPI against an earlier]) + AC_MSG_WARN([version of PMIx for strictly direct-launch purposes - e.g., using]) + AC_MSG_WARN([Slurm's srun to launch the job - by configuring with the]) + AC_MSG_WARN([--disable-internal-rte option.]) + AC_MSG_ERROR([Cannot continue])]) + AC_MSG_CHECKING([if RTE support is enabled]) if test "$enable_internal_rte" != "no"; then AC_MSG_RESULT([yes]) @@ -81,7 +90,7 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[ opal_prrte_prefix_arg= fi - opal_prrte_args="--prefix=$prefix --disable-dlopen $opal_prrte_prefix_arg $opal_prrte_libevent_arg $opal_prrte_hwloc_arg $opal_prrte_pmix_arg" + opal_prrte_args="--prefix=$prefix $opal_prrte_prefix_arg $opal_prrte_libevent_arg $opal_prrte_hwloc_arg $opal_prrte_pmix_arg" AS_IF([test "$enable_debug" = "yes"], [opal_prrte_args="--enable-debug $opal_prrte_args" CFLAGS="$OPAL_CFLAGS_BEFORE_PICKY $OPAL_VISIBILITY_CFLAGS -g"], diff --git a/config/opal_check_pmi.m4 b/config/opal_check_pmi.m4 index c5706f4bff..6f18c02cd1 100644 --- a/config/opal_check_pmi.m4 +++ b/config/opal_check_pmi.m4 @@ -125,6 +125,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[ ], [])], [AC_MSG_RESULT([found]) opal_external_pmix_version=4x + opal_numerical_pmix_version=4 opal_external_pmix_version_found=1 opal_external_pmix_happy=yes], [AC_MSG_RESULT([not found])])]) @@ -139,6 +140,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[ ], [])], [AC_MSG_RESULT([found]) opal_external_pmix_version=3x + opal_numerical_pmix_version=3 opal_external_pmix_version_found=1 opal_external_pmix_happy=yes], [AC_MSG_RESULT([not found])])]) @@ -153,6 +155,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[ ], [])], [AC_MSG_RESULT([found]) opal_external_pmix_version=2x + opal_numerical_pmix_version=2 opal_external_pmix_version_found=1 opal_external_pmix_happy=yes], [AC_MSG_RESULT([not found])])]) @@ -167,6 +170,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[ ], [])], [AC_MSG_RESULT([found]) opal_external_pmix_version=1x + opal_numerical_pmix_version=1 opal_external_pmix_version_found=1 
opal_external_have_pmix1=1 opal_external_pmix_happy=yes], @@ -179,6 +183,12 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[ opal_external_pmix_happy=no]) ]) + AS_IF([test "$opal_external_pmix_happy" = "yes" && test $opal_numerical_pmix_version -lt 3], + [AC_MSG_WARN([OMPI no longer supports PMIx versions prior to v3]) + AC_MSG_WARN([Please direct us to a more current PMIx release or]) + AC_MSG_WARN([use the internally provided one]) + AC_MSG_ERROR([Cannot continue])]) + AS_IF([test "$opal_external_pmix_happy" = "yes"], [$3 # add the new flags to our wrapper compilers diff --git a/ompi/interlib/interlib.c b/ompi/interlib/interlib.c index 69062c5de3..da18b69419 100644 --- a/ompi/interlib/interlib.c +++ b/ompi/interlib/interlib.c @@ -121,6 +121,10 @@ int ompi_interlib_declare(int threadlevel, char *version) PMIX_INFO_DESTRUCT(&info[3]); /* account for our refcount on pmix_init */ PMIx_Finalize(NULL, 0); - ret = opal_pmix_convert_status(rc); + if (ompi_singleton && PMIX_ERR_UNREACH == rc) { + ret = OMPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + } return ret; } diff --git a/ompi/runtime/ompi_mpi_abort.c b/ompi/runtime/ompi_mpi_abort.c index a550e7b9f6..a42109b5de 100644 --- a/ompi/runtime/ompi_mpi_abort.c +++ b/ompi/runtime/ompi_mpi_abort.c @@ -85,8 +85,8 @@ static void try_kill_peers(ompi_communicator_t *comm, procs = (ompi_process_name_t*) calloc(nprocs, sizeof(ompi_process_name_t)); if (NULL == procs) { - /* quick clean orte and get out */ - ompi_rte_abort(errno, "Abort: unable to alloc memory to kill procs"); + /* quick clean RTE and get out */ + ompi_rte_abort(errcode, "Abort: unable to alloc memory to kill procs"); } /* put all the local group procs in the abort list */ diff --git a/ompi/runtime/ompi_rte.c b/ompi/runtime/ompi_rte.c index 66a2db21ed..d459024231 100644 --- a/ompi/runtime/ompi_rte.c +++ b/ompi/runtime/ompi_rte.c @@ -61,7 +61,27 @@ opal_process_name_t pmix_name_wildcard = {UINT32_MAX-1, UINT32_MAX-1}; opal_process_name_t pmix_name_invalid = 
{UINT32_MAX, UINT32_MAX}; hwloc_cpuset_t ompi_proc_applied_binding = NULL; -pmix_process_info_t pmix_process_info = {0}; +pmix_process_info_t pmix_process_info = { + .my_name = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID}, + .nodename = NULL, + .pid = 0, + .top_session_dir = NULL, + .job_session_dir = NULL, + .proc_session_dir = NULL, + .my_local_rank = 0, + .my_node_rank = 0, + .num_local_peers = 0, + .num_procs = 0, + .app_num = 0, + .univ_size = 0, + .app_sizes = NULL, + .app_ldrs = NULL, + .cpuset = NULL, + .command = NULL, + .num_apps = 0, + .initial_wdir = NULL, + .reincarnation = 0 +}; bool pmix_proc_is_bound = false; bool ompi_singleton = false; diff --git a/opal/runtime/opal_init.c b/opal/runtime/opal_init.c index ab17e8e9bb..09e10a4fd5 100644 --- a/opal/runtime/opal_init.c +++ b/opal/runtime/opal_init.c @@ -54,6 +54,7 @@ #include "opal/mca/installdirs/base/base.h" #include "opal/mca/memory/base/base.h" #include "opal/mca/patcher/base/base.h" +#include "opal/mca/pmix/base/base.h" #include "opal/mca/memcpy/base/base.h" #include "opal/mca/hwloc/base/base.h" #include "opal/mca/reachable/base/base.h" @@ -630,7 +631,7 @@ opal_init_util(int* pargc, char*** pargv) static mca_base_framework_t *opal_init_frameworks[] = { &opal_hwloc_base_framework, &opal_memcpy_base_framework, &opal_memchecker_base_framework, &opal_backtrace_base_framework, &opal_timer_base_framework, &opal_event_base_framework, - &opal_shmem_base_framework, &opal_reachable_base_framework, + &opal_shmem_base_framework, &opal_reachable_base_framework, &opal_pmix_base_framework, NULL, }; From a608e053a6cf3a6dd6794cbe2f930d9cd1200321 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 23 Mar 2020 11:18:13 -0700 Subject: [PATCH 2/3] Silence compiler warning Signed-off-by: Ralph Castain --- opal/mca/rcache/grdma/rcache_grdma_module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opal/mca/rcache/grdma/rcache_grdma_module.c b/opal/mca/rcache/grdma/rcache_grdma_module.c index 
c4e72c0041..3f163c2d7f 100644 --- a/opal/mca/rcache/grdma/rcache_grdma_module.c +++ b/opal/mca/rcache/grdma/rcache_grdma_module.c @@ -187,7 +187,7 @@ static inline mca_rcache_base_registration_t *mca_rcache_grdma_remove_lru_head(m /* registration has been selected for removal and is no longer in the LRU. mark it * as such. */ new_flags = (old_flags & ~MCA_RCACHE_GRDMA_REG_FLAG_IN_LRU) | MCA_RCACHE_FLAGS_INVALID; - if (opal_atomic_compare_exchange_strong_32(&old_reg->flags, &old_flags, new_flags)) { + if (opal_atomic_compare_exchange_strong_32((opal_atomic_int32_t*)&old_reg->flags, &old_flags, new_flags)) { break; } } while (1); From 43f79be2e319fb6d894037129fe371a59294be56 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 23 Mar 2020 11:18:23 -0700 Subject: [PATCH 3/3] Update PMIx and PRRTE Fix singleton operations and ensure notification upon tool connection. Signed-off-by: Ralph Castain --- opal/mca/pmix/pmix4x/openpmix | 2 +- prrte | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/opal/mca/pmix/pmix4x/openpmix b/opal/mca/pmix/pmix4x/openpmix index 98d14d55f8..a18e531382 160000 --- a/opal/mca/pmix/pmix4x/openpmix +++ b/opal/mca/pmix/pmix4x/openpmix @@ -1 +1 @@ -Subproject commit 98d14d55f8d4bd27fe6eb1e508c336702e1fbf76 +Subproject commit a18e53138298d61a01fec4471518140304539e8c diff --git a/prrte b/prrte index 9add90bcfe..cdea523117 160000 --- a/prrte +++ b/prrte @@ -1 +1 @@ -Subproject commit 9add90bcfe88af1994914a78544d6236327be10e +Subproject commit cdea5231171b2fdea11269033de9e265fc7f3a63