From 1cf972dcaf64d31e2892360cf8435bb88b19c566 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 25 Mar 2020 15:43:27 -0700 Subject: [PATCH 1/4] Update PMIx and PRRTE Deprecate --am and --amca options Avoid default param files on backend nodes Any parameters in the PRRTE default or user param files will have been picked up by prte and included in the environment sent to the prted, so don't open those files on the backend. Avoid picking up MCA param file info on backend Avoid the scaling problem at PRRTE startup by only reading the system and user param files on the frontend. Complete revisions to cmd line parser for OMPI Per specification, enforce the following precedence order: 1. system-level default parameter file 2. user-level default parameter file 3. Anything found in the environment 4. "--tune" files. Note that "--amca" goes away and becomes equivalent to "--tune". Okay if it is provided more than once on a cmd line (we will aggregate the list of files, retaining order), but an error if a parameter is referenced in more than one file with a different value 5. "--mca" options. Again, error if the same option appears more than once with a different value. Allowed to override a parameter referenced in a "tune" file 6. "-x" options. Allowed to overwrite options given in a "tune" file, but cannot conflict with an explicit "--mca" option 7. 
all other options Fix special handling of "-np" Get agreement on jobid across the layers Need all three pieces (PRRTE, PMIx, and OPAL) to agree on the nspace conversion to jobid method Ensure prte show_help messages get output Print abnormal termination messages Cleanup error reporting in persistent operations Signed-off-by: Ralph Castain Signed-off-by: Ralph Castain --- opal/mca/pmix/base/pmix_base_fns.c | 78 +++++++++++++++++++----------- opal/mca/pmix/pmix-internal.h | 6 +++ opal/mca/pmix/pmix4x/openpmix | 2 +- prrte | 2 +- 4 files changed, 58 insertions(+), 30 deletions(-) diff --git a/opal/mca/pmix/base/pmix_base_fns.c b/opal/mca/pmix/base/pmix_base_fns.c index 7eeac64e9b..fa9b446a8f 100644 --- a/opal/mca/pmix/base/pmix_base_fns.c +++ b/opal/mca/pmix/base/pmix_base_fns.c @@ -110,18 +110,14 @@ int opal_pmix_convert_jobid(pmix_nspace_t nspace, opal_jobid_t jobid) /* zero out the nspace */ PMIX_LOAD_NSPACE(nspace, NULL); - if (opal_process_info.nativelaunch) { - opal_snprintf_jobid(nspace, PMIX_MAX_NSLEN, jobid); - return OPAL_SUCCESS; - } else { - /* cycle across our list of known jobids */ - OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) { - if (jobid == nptr->jobid) { - PMIX_LOAD_NSPACE(nspace, nptr->nspace); - return OPAL_SUCCESS; - } + /* cycle across our list of known jobids */ + OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) { + if (jobid == nptr->jobid) { + PMIX_LOAD_NSPACE(nspace, nptr->nspace); + return OPAL_SUCCESS; } } + return OPAL_ERR_NOT_FOUND; } @@ -129,29 +125,55 @@ int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace) { opal_nptr_t *nptr; opal_jobid_t jid; + uint16_t jobfam; + uint32_t hash32, localjob = 0; + char *p = NULL; /* set a default */ *jobid = OPAL_JOBID_INVALID; - if (opal_process_info.nativelaunch) { - return opal_convert_string_to_jobid(jobid, nspace); - } else { - /* cycle across our list of known jobids */ - OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) { - if (PMIX_CHECK_NSPACE(nspace, 
nptr->nspace)) { - *jobid = nptr->jobid; - return OPAL_SUCCESS; - } - } - /* if we get here, we don't know this nspace */ - OPAL_HASH_STR(nspace, jid); - jid &= ~(0x8000); - *jobid = jid; - nptr = OBJ_NEW(opal_nptr_t); - nptr->jobid = jid; - PMIX_LOAD_NSPACE(nptr->nspace, nspace); - opal_list_append(&localnspaces, &nptr->super); + /* if the nspace is empty, there is nothing more to do */ + if (0 == strlen(nspace)) { + return OPAL_SUCCESS; } + if (NULL != strstr(nspace, "JOBID_WILDCARD")) { + *jobid = OPAL_JOBID_WILDCARD; + return OPAL_SUCCESS; + } + if (NULL != strstr(nspace, "JOBID_INVALID")) { + *jobid = OPAL_JOBID_INVALID; + return OPAL_SUCCESS; + } + + /* cycle across our list of known jobids */ + OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) { + if (PMIX_CHECK_NSPACE(nspace, nptr->nspace)) { + *jobid = nptr->jobid; + return OPAL_SUCCESS; + } + } + /* if we get here, we don't know this nspace */ + /* find the "." at the end that indicates the child job */ + if (NULL != (p = strrchr(nspace, '.'))) { + *p = '\0'; + } + OPAL_HASH_STR(nspace, hash32); + if (NULL != p) { + *p = '.'; + ++p; + localjob = strtoul(p, NULL, 10); + } + + /* now compress to 16-bits */ + jobfam = (uint16_t)(((0x0000ffff & (0xffff0000 & hash32) >> 16)) ^ (0x0000ffff & hash32)); + jid = (0xffff0000 & ((uint32_t)jobfam << 16)) | (0x0000ffff & localjob); + *jobid = jid; + /* save this jobid/nspace pair */ + nptr = OBJ_NEW(opal_nptr_t); + nptr->jobid = jid; + PMIX_LOAD_NSPACE(nptr->nspace, nspace); + opal_list_append(&localnspaces, &nptr->super); + return OPAL_SUCCESS; } diff --git a/opal/mca/pmix/pmix-internal.h b/opal/mca/pmix/pmix-internal.h index d8f8dd0cfb..12a7d670c4 100644 --- a/opal/mca/pmix/pmix-internal.h +++ b/opal/mca/pmix/pmix-internal.h @@ -595,9 +595,11 @@ OPAL_DECLSPEC int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t ns OPAL_DECLSPEC void opal_pmix_setup_nspace_tracker(void); OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void); +/* convert jobid to 
nspace */ #define OPAL_PMIX_CONVERT_JOBID(n, j) \ opal_pmix_convert_jobid((n), (j)) +/* convert vpid to rank */ #define OPAL_PMIX_CONVERT_VPID(r, v) \ do { \ if (OPAL_VPID_WILDCARD == (v)) { \ @@ -607,6 +609,7 @@ OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void); } \ } while(0) +/* convert opal_process_name_t to pmix_proc_t */ #define OPAL_PMIX_CONVERT_NAME(p, n) \ do { \ OPAL_PMIX_CONVERT_JOBID((p)->nspace, (n)->jobid); \ @@ -614,9 +617,11 @@ OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void); } while(0) +/* convert nspace to jobid */ #define OPAL_PMIX_CONVERT_NSPACE(r, j, n) \ (r) = opal_pmix_convert_nspace((j), (n)) +/* convert pmix rank to opal vpid */ #define OPAL_PMIX_CONVERT_RANK(v, r) \ do { \ if (PMIX_RANK_WILDCARD == (r)) { \ @@ -628,6 +633,7 @@ OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void); } \ } while(0) +/* convert pmix_proc_t to opal_process_name_t */ #define OPAL_PMIX_CONVERT_PROCT(r, n, p) \ do { \ OPAL_PMIX_CONVERT_NSPACE((r), &(n)->jobid, (p)->nspace); \ diff --git a/opal/mca/pmix/pmix4x/openpmix b/opal/mca/pmix/pmix4x/openpmix index a18e531382..4c62a26b31 160000 --- a/opal/mca/pmix/pmix4x/openpmix +++ b/opal/mca/pmix/pmix4x/openpmix @@ -1 +1 @@ -Subproject commit a18e53138298d61a01fec4471518140304539e8c +Subproject commit 4c62a26b319ba78feadc42679200e93041f611a2 diff --git a/prrte b/prrte index cdea523117..8d673047b3 160000 --- a/prrte +++ b/prrte @@ -1 +1 @@ -Subproject commit cdea5231171b2fdea11269033de9e265fc7f3a63 +Subproject commit 8d673047b325a148f55c65e049aab67f1de1d318 From f88f2710541a9fa089c3d2ed518014e083ecc89a Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 29 Mar 2020 11:58:43 -0700 Subject: [PATCH 2/4] Cleanup few errors associated with tool support Properly mark/detect that a daemon sourced the event broadcast to avoid reinjecting it into the PMIx server library. Correct the source field for the event notify call on launcher ready. 
Update event notification for tool support Deal with a variety of race conditions related to tool reconnection to a different server. Signed-off-by: Ralph Castain --- opal/mca/pmix/pmix4x/openpmix | 2 +- prrte | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/opal/mca/pmix/pmix4x/openpmix b/opal/mca/pmix/pmix4x/openpmix index 4c62a26b31..8c565209c2 160000 --- a/opal/mca/pmix/pmix4x/openpmix +++ b/opal/mca/pmix/pmix4x/openpmix @@ -1 +1 @@ -Subproject commit 4c62a26b319ba78feadc42679200e93041f611a2 +Subproject commit 8c565209c21f93d11e2156c0d53d73c3f6f9aaab diff --git a/prrte b/prrte index 8d673047b3..7f82facd41 160000 --- a/prrte +++ b/prrte @@ -1 +1 @@ -Subproject commit 8d673047b325a148f55c65e049aab67f1de1d318 +Subproject commit 7f82facd41f55f49a70dc7096c668b4f38497241 From 1aabbe456d5de8fc3c3d6f91eabb8851db7d63eb Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 30 Mar 2020 16:06:40 -0700 Subject: [PATCH 3/4] Add extra libs to PRRTE binaries for external deps libevent, hwloc, and pmix can be external and may require that their libs be explicitly linked into the PRRTE binaries Signed-off-by: Ralph Castain --- config/ompi_setup_prrte.m4 | 42 +++++++++++++++++++------------------- prrte | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/config/ompi_setup_prrte.m4 b/config/ompi_setup_prrte.m4 index a83bd618f7..b814bde1b1 100644 --- a/config/ompi_setup_prrte.m4 +++ b/config/ompi_setup_prrte.m4 @@ -24,7 +24,7 @@ # AC_DEFUN([OMPI_SETUP_PRRTE],[ - OPAL_VAR_SCOPE_PUSH([opal_prrte_save_CPPFLAGS opal_prrte_save_CFLAGS opal_prrte_save_LDFLAGS opal_prrte_save_LIBS opal_prrte_args opal_prrte_save_enable_dlopen opal_prrte_save_enable_mca_dso opal_prrte_save_enable_mca_static]) + OPAL_VAR_SCOPE_PUSH([opal_prrte_save_CPPFLAGS opal_prrte_save_CFLAGS opal_prrte_save_LDFLAGS opal_prrte_save_LIBS opal_prrte_args opal_prrte_save_enable_dlopen opal_prrte_save_enable_mca_dso opal_prrte_save_enable_mca_static opal_prrte_extra_libs 
opal_prrte_extra_ltlibs opal_prrte_extra_ldflags]) opal_prrte_save_CFLAGS=$CFLAGS opal_prrte_save_CPPFLAGS=$CPPFLAGS @@ -59,29 +59,29 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[ if test "$enable_internal_rte" != "no"; then AC_MSG_RESULT([yes]) ompi_want_prrte=yes - if test -z $with_libevent || test "$with_libevent" = "internal" || test "$with_libevent" = "yes"; then - opal_prrte_libevent_arg="--with-libevent-header=$OMPI_TOP_SRCDIR/opal/mca/event/event.h" - elif test "$with_libevent" = "external"; then - opal_prrte_libevent_arg="" - else - opal_prrte_libevent_arg="--with-libevent=$with_libevent" - fi + opal_prrte_extra_libs=$OMPI_TOP_BUILDDIR/opal/libopen-pal.la + opal_prrte_extra_ltlibs=$OMPI_TOP_BUILDDIR/opal/libopen-pal.la - if test -z $with_hwloc || test "$with_hwloc" = "internal" || test "$with_hwloc" = "yes"; then - opal_prrte_hwloc_arg="--with-hwloc-header=$OMPI_TOP_SRCDIR/opal/mca/hwloc/hwloc-internal.h" - elif test "$with_hwloc" = "external"; then - opal_prrte_hwloc_arg="" - else - opal_prrte_hwloc_arg="--with-hwloc=$with_hwloc" + if test "$opal_event_external_support" = "yes"; then + opal_prrte_extra_libs="$opal_prrte_extra_libs $opal_event_external_LIBS" + opal_prrte_extra_ltlibs="$opal_prrte_extra_ltlibs $opal_event_external_LIBS" fi + # specifying --with-libevent-header causes prrte to ignore the with_libevent and with_libevent_libdir options + opal_prrte_libevent_arg="--with-libevent-header=$OMPI_TOP_SRCDIR/opal/mca/event/event.h" - if test -z $with_pmix || test "$with_pmix" = "internal" || test "$with_pmix" = "yes"; then - opal_prrte_pmix_arg="--with-pmix-header=$OMPI_TOP_SRCDIR/opal/mca/pmix/pmix-internal.h" - elif test "$with_pmix" = "external"; then - opal_prrte_pmix_arg="" - else - opal_prrte_pmix_arg="--with-pmix=$with_pmix" + if test "$opal_hwloc_external_support" = "yes"; then + opal_prrte_extra_libs="$opal_prrte_extra_libs $opal_hwloc_external_LIBS" + opal_prrte_extra_ltlibs="$opal_prrte_extra_ltlibs $opal_hwloc_external_LIBS" fi + # specifying 
--with-hwloc-header causes prrte to ignore the with_hwloc and with_hwloc_libdir options + opal_prrte_hwloc_arg="--with-hwloc-header=$OMPI_TOP_SRCDIR/opal/mca/hwloc/hwloc-internal.h" + + if test "$opal_external_pmix_happy" = "yes"; then + opal_prrte_extra_libs="$opal_prrte_extra_libs $opal_pmix_external_LIBS" + opal_prrte_extra_ltlibs="$opal_prrte_extra_ltlibs $opal_pmix_external_LIBS" + fi + # specifying --with-pmix-header causes prrte to ignore the with_pmix and with_pmix_libdir options + opal_prrte_pmix_arg="--with-pmix-header=$OMPI_TOP_SRCDIR/opal/mca/pmix/pmix-internal.h" if test -z $enable_prte_prefix_by_default || test "$enable_prte_prefix_by_default" = "yes" || test "$enable_orterun_prefix_given" = "yes"; then @@ -102,7 +102,7 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[ opal_prrte_args="$opal_prrte_args --with-platform=$with_prrte_platform" fi # add the extra libs - opal_prrte_args="$opal_prrte_args --with-prrte-extra-lib=$OMPI_TOP_BUILDDIR/opal/libopen-pal.la --with-prrte-extra-ltlib=$OMPI_TOP_BUILDDIR/opal/libopen-pal.la" + opal_prrte_args="$opal_prrte_args --with-prrte-extra-lib=\"$opal_prrte_extra_libs\" --with-prrte-extra-ltlib=\"$opal_prrte_extra_ltlibs\"" AC_MSG_CHECKING([final prrte configure args]) AC_MSG_RESULT([$opal_prrte_args]) diff --git a/prrte b/prrte index 7f82facd41..6bab23ee55 160000 --- a/prrte +++ b/prrte @@ -1 +1 @@ -Subproject commit 7f82facd41f55f49a70dc7096c668b4f38497241 +Subproject commit 6bab23ee556e7d30586c951808a340cd0f787989 From 556b3fcc00bade0ed34044929f2aca954a2adf2d Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 31 Mar 2020 07:03:40 -0700 Subject: [PATCH 4/4] PRRTE: Return non-zero status on timeout Signed-off-by: Ralph Castain --- prrte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prrte b/prrte index 6bab23ee55..d879d56693 160000 --- a/prrte +++ b/prrte @@ -1 +1 @@ -Subproject commit 6bab23ee556e7d30586c951808a340cd0f787989 +Subproject commit d879d5669379ffbe093d60b76b46cc9b2aae20e1