1
1

Fix singletons and ensure adequate PMIx version

OMPI can only support PMIx v3 and above. PRRTE requires at least PMIx
v4, so protect against the case where OMPI is built against an external
PMIx v3.

Fix check of PMIx_Init return code for singleton operations.

Ensure that the PMIx framework gets properly opened.

Signed-off-by: Ralph Castain <rhc@pmix.org>
Этот коммит содержится в:
Ralph Castain 2020-03-23 10:29:42 -07:00
родитель 973d10159a
Коммит 95dacd2086
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B63B630167D26BB5
6 изменённых файлов: 50 добавлений и 6 удалений

Просмотреть файл

@ -46,6 +46,15 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[
[AC_HELP_STRING([--enable-prte-prefix-by-default], [AC_HELP_STRING([--enable-prte-prefix-by-default],
[Make "mpirun ..." behave exactly the same as "mpirun --prefix \$prefix" (where \$prefix is the value given to --prefix in configure) (default:enabled)])]) [Make "mpirun ..." behave exactly the same as "mpirun --prefix \$prefix" (where \$prefix is the value given to --prefix in configure) (default:enabled)])])
# PRRTE requires PMIx v4 or later. Abort configure when an external PMIx
# older than v4 was selected while the internal runtime is still enabled.
# The warnings point the user at --disable-internal-rte for builds that
# only need direct launch.
AS_IF([test "$opal_external_pmix_happy" = "yes" && test $opal_numerical_pmix_version -lt 4 && test "$enable_internal_rte" != "no"],
      [AC_MSG_WARN([OMPI's internal runtime environment "PRRTE" does not support])
       AC_MSG_WARN([PMIx versions less than v4.x as they lack adequate tool])
       AC_MSG_WARN([support. You can, if desired, build OMPI against an earlier])
       AC_MSG_WARN([version of PMIx for strictly direct-launch purposes - e.g., using])
       AC_MSG_WARN([Slurm's srun to launch the job - by configuring with the])
       AC_MSG_WARN([--disable-internal-rte option.])
       AC_MSG_ERROR([Cannot continue])])
AC_MSG_CHECKING([if RTE support is enabled]) AC_MSG_CHECKING([if RTE support is enabled])
if test "$enable_internal_rte" != "no"; then if test "$enable_internal_rte" != "no"; then
AC_MSG_RESULT([yes]) AC_MSG_RESULT([yes])
@ -81,7 +90,7 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[
opal_prrte_prefix_arg= opal_prrte_prefix_arg=
fi fi
opal_prrte_args="--prefix=$prefix --disable-dlopen $opal_prrte_prefix_arg $opal_prrte_libevent_arg $opal_prrte_hwloc_arg $opal_prrte_pmix_arg" opal_prrte_args="--prefix=$prefix $opal_prrte_prefix_arg $opal_prrte_libevent_arg $opal_prrte_hwloc_arg $opal_prrte_pmix_arg"
AS_IF([test "$enable_debug" = "yes"], AS_IF([test "$enable_debug" = "yes"],
[opal_prrte_args="--enable-debug $opal_prrte_args" [opal_prrte_args="--enable-debug $opal_prrte_args"
CFLAGS="$OPAL_CFLAGS_BEFORE_PICKY $OPAL_VISIBILITY_CFLAGS -g"], CFLAGS="$OPAL_CFLAGS_BEFORE_PICKY $OPAL_VISIBILITY_CFLAGS -g"],

Просмотреть файл

@ -125,6 +125,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[
], [])], ], [])],
[AC_MSG_RESULT([found]) [AC_MSG_RESULT([found])
opal_external_pmix_version=4x opal_external_pmix_version=4x
opal_numerical_pmix_version=4
opal_external_pmix_version_found=1 opal_external_pmix_version_found=1
opal_external_pmix_happy=yes], opal_external_pmix_happy=yes],
[AC_MSG_RESULT([not found])])]) [AC_MSG_RESULT([not found])])])
@ -139,6 +140,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[
], [])], ], [])],
[AC_MSG_RESULT([found]) [AC_MSG_RESULT([found])
opal_external_pmix_version=3x opal_external_pmix_version=3x
opal_numerical_pmix_version=3
opal_external_pmix_version_found=1 opal_external_pmix_version_found=1
opal_external_pmix_happy=yes], opal_external_pmix_happy=yes],
[AC_MSG_RESULT([not found])])]) [AC_MSG_RESULT([not found])])])
@ -153,6 +155,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[
], [])], ], [])],
[AC_MSG_RESULT([found]) [AC_MSG_RESULT([found])
opal_external_pmix_version=2x opal_external_pmix_version=2x
opal_numerical_pmix_version=2
opal_external_pmix_version_found=1 opal_external_pmix_version_found=1
opal_external_pmix_happy=yes], opal_external_pmix_happy=yes],
[AC_MSG_RESULT([not found])])]) [AC_MSG_RESULT([not found])])])
@ -167,6 +170,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[
], [])], ], [])],
[AC_MSG_RESULT([found]) [AC_MSG_RESULT([found])
opal_external_pmix_version=1x opal_external_pmix_version=1x
opal_numerical_pmix_version=1
opal_external_pmix_version_found=1 opal_external_pmix_version_found=1
opal_external_have_pmix1=1 opal_external_have_pmix1=1
opal_external_pmix_happy=yes], opal_external_pmix_happy=yes],
@ -179,6 +183,12 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[
opal_external_pmix_happy=no]) opal_external_pmix_happy=no])
]) ])
# OMPI requires PMIx v3 or later. If an external PMIx was found but its
# detected major version is below 3, warn and abort configure.
AS_IF([test "$opal_external_pmix_happy" = "yes" && test $opal_numerical_pmix_version -lt 3],
[AC_MSG_WARN([OMPI no longer supports PMIx versions prior to v3])
AC_MSG_WARN([Please direct us to a more current PMIx release or])
AC_MSG_WARN([use the internally provided one])
AC_MSG_ERROR([Cannot continue])])
AS_IF([test "$opal_external_pmix_happy" = "yes"], AS_IF([test "$opal_external_pmix_happy" = "yes"],
[$3 [$3
# add the new flags to our wrapper compilers # add the new flags to our wrapper compilers

Просмотреть файл

@ -121,6 +121,10 @@ int ompi_interlib_declare(int threadlevel, char *version)
PMIX_INFO_DESTRUCT(&info[3]); PMIX_INFO_DESTRUCT(&info[3]);
/* account for our refcount on pmix_init */ /* account for our refcount on pmix_init */
PMIx_Finalize(NULL, 0); PMIx_Finalize(NULL, 0);
ret = opal_pmix_convert_status(rc); if (ompi_singleton && PMIX_ERR_UNREACH == rc) {
ret = OMPI_SUCCESS;
} else {
ret = opal_pmix_convert_status(rc);
}
return ret; return ret;
} }

Просмотреть файл

@ -85,8 +85,8 @@ static void try_kill_peers(ompi_communicator_t *comm,
procs = (ompi_process_name_t*) calloc(nprocs, sizeof(ompi_process_name_t)); procs = (ompi_process_name_t*) calloc(nprocs, sizeof(ompi_process_name_t));
if (NULL == procs) { if (NULL == procs) {
/* quick clean orte and get out */ /* quick clean RTE and get out */
ompi_rte_abort(errno, "Abort: unable to alloc memory to kill procs"); ompi_rte_abort(errcode, "Abort: unable to alloc memory to kill procs");
} }
/* put all the local group procs in the abort list */ /* put all the local group procs in the abort list */

Просмотреть файл

@ -61,7 +61,27 @@
opal_process_name_t pmix_name_wildcard = {UINT32_MAX-1, UINT32_MAX-1}; opal_process_name_t pmix_name_wildcard = {UINT32_MAX-1, UINT32_MAX-1};
opal_process_name_t pmix_name_invalid = {UINT32_MAX, UINT32_MAX}; opal_process_name_t pmix_name_invalid = {UINT32_MAX, UINT32_MAX};
hwloc_cpuset_t ompi_proc_applied_binding = NULL; hwloc_cpuset_t ompi_proc_applied_binding = NULL;
pmix_process_info_t pmix_process_info = {0}; pmix_process_info_t pmix_process_info = {
.my_name = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID},
.nodename = NULL,
.pid = 0,
.top_session_dir = NULL,
.job_session_dir = NULL,
.proc_session_dir = NULL,
.my_local_rank = 0,
.my_node_rank = 0,
.num_local_peers = 0,
.num_procs = 0,
.app_num = 0,
.univ_size = 0,
.app_sizes = NULL,
.app_ldrs = NULL,
.cpuset = NULL,
.command = NULL,
.num_apps = 0,
.initial_wdir = NULL,
.reincarnation = 0
};
bool pmix_proc_is_bound = false; bool pmix_proc_is_bound = false;
bool ompi_singleton = false; bool ompi_singleton = false;

Просмотреть файл

@ -54,6 +54,7 @@
#include "opal/mca/installdirs/base/base.h" #include "opal/mca/installdirs/base/base.h"
#include "opal/mca/memory/base/base.h" #include "opal/mca/memory/base/base.h"
#include "opal/mca/patcher/base/base.h" #include "opal/mca/patcher/base/base.h"
#include "opal/mca/pmix/base/base.h"
#include "opal/mca/memcpy/base/base.h" #include "opal/mca/memcpy/base/base.h"
#include "opal/mca/hwloc/base/base.h" #include "opal/mca/hwloc/base/base.h"
#include "opal/mca/reachable/base/base.h" #include "opal/mca/reachable/base/base.h"
@ -630,7 +631,7 @@ opal_init_util(int* pargc, char*** pargv)
static mca_base_framework_t *opal_init_frameworks[] = { static mca_base_framework_t *opal_init_frameworks[] = {
&opal_hwloc_base_framework, &opal_memcpy_base_framework, &opal_memchecker_base_framework, &opal_hwloc_base_framework, &opal_memcpy_base_framework, &opal_memchecker_base_framework,
&opal_backtrace_base_framework, &opal_timer_base_framework, &opal_event_base_framework, &opal_backtrace_base_framework, &opal_timer_base_framework, &opal_event_base_framework,
&opal_shmem_base_framework, &opal_reachable_base_framework, &opal_shmem_base_framework, &opal_reachable_base_framework, &opal_pmix_base_framework,
NULL, NULL,
}; };