From 829fd478b3916c142e73e23619b4f0d1c529e1b3 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 20 Feb 2020 12:50:20 -0800 Subject: [PATCH] Create a hack to protect against non-integer jobids If someone gives us a namespace that doesn't easily translate to an integer, we have to create a mechanism for working around the disconnect. PRRTE has been updated to give us a flag so we know we were "natively" launched. If we don't see it, then fall back to generating a hash of the nspace as our jobid. We then have to translate back/forth between nspace and jobid using a lookup table. Probably not the right long-term solution, but hopefully helps get us thru for a bit. Includes update of PRRTE pointer Signed-off-by: Ralph Castain --- ompi/runtime/ompi_rte.c | 18 +++++-- opal/mca/pmix/base/pmix_base_fns.c | 77 ++++++++++++++++++++++++++++++ opal/mca/pmix/pmix-internal.h | 27 +++++++---- opal/util/proc.c | 1 + opal/util/proc.h | 2 +- prrte | 2 +- 6 files changed, 111 insertions(+), 16 deletions(-) diff --git a/ompi/runtime/ompi_rte.c b/ompi/runtime/ompi_rte.c index dfdd4a6444..e1e95ba0b1 100644 --- a/ompi/runtime/ompi_rte.c +++ b/ompi/runtime/ompi_rte.c @@ -65,6 +65,8 @@ pmix_process_info_t pmix_process_info = {0}; bool pmix_proc_is_bound = false; bool ompi_singleton = false; +static pmix_proc_t myprocid; + static bool added_transport_keys = false; static bool added_num_procs = false; static bool added_app_ctx = false; @@ -498,7 +500,7 @@ int ompi_rte_init(int *pargc, char ***pargv) int ret; char *error = NULL; opal_process_name_t pname; - pmix_proc_t myproc, rproc; + pmix_proc_t rproc; int u32, *u32ptr; uint16_t u16, *u16ptr; char **peers=NULL; @@ -530,8 +532,11 @@ int ompi_rte_init(int *pargc, char ***pargv) goto error; } + /* setup our internal nspace hack */ + opal_pmix_setup_nspace_tracker(); + /* initialize the selected module */ - if (!PMIx_Initialized() && (PMIX_SUCCESS != (ret = PMIx_Init(&myproc, NULL, 0)))) { + if (!PMIx_Initialized() && (PMIX_SUCCESS != (ret 
= PMIx_Init(&myprocid, NULL, 0)))) { /* we cannot run - this could be due to being direct launched * without the required PMI support being built, so print * out a help message indicating it */ @@ -539,8 +544,8 @@ int ompi_rte_init(int *pargc, char ***pargv) return OPAL_ERR_SILENT; } - /* setup the process name fields */ - OPAL_PMIX_CONVERT_PROCT(rc, &pname, &myproc); + /* setup the process name fields - also registers the new nspace */ + OPAL_PMIX_CONVERT_PROCT(rc, &pname, &myprocid); if (OPAL_SUCCESS != rc) { return rc; } @@ -548,6 +553,7 @@ int ompi_rte_init(int *pargc, char ***pargv) OPAL_PROC_MY_NAME.vpid = pname.vpid; pmix_process_info.my_name.jobid = OPAL_PROC_MY_NAME.jobid; pmix_process_info.my_name.vpid = OPAL_PROC_MY_NAME.vpid; + /* set our hostname */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &OPAL_PROC_MY_NAME, (char**)&ev1, PMIX_STRING); @@ -828,6 +834,10 @@ int ompi_rte_finalize(void) free (pmix_process_info.cpuset); pmix_process_info.cpuset = NULL; + /* cleanup our internal nspace hack */ + opal_pmix_finalize_nspace_tracker(); + + return OMPI_SUCCESS; } diff --git a/opal/mca/pmix/base/pmix_base_fns.c b/opal/mca/pmix/base/pmix_base_fns.c index ab49962ede..8d26993342 100644 --- a/opal/mca/pmix/base/pmix_base_fns.c +++ b/opal/mca/pmix/base/pmix_base_fns.c @@ -77,6 +77,83 @@ int opal_pmix_base_exchange(pmix_info_t *indat, return opal_pmix_convert_status(rc); } +typedef struct { + opal_list_item_t super; + pmix_nspace_t nspace; + opal_jobid_t jobid; +} opal_nptr_t; +static OBJ_CLASS_INSTANCE(opal_nptr_t, + opal_list_item_t, + NULL, NULL); + +static opal_list_t localnspaces; + +void opal_pmix_setup_nspace_tracker(void) +{ + /* check if we were launched by PRRTE */ + if (NULL != getenv("PRRTE_LAUNCHED")) { + opal_process_info.nativelaunch = true; + } + + OBJ_CONSTRUCT(&localnspaces, opal_list_t); +} + +void opal_pmix_finalize_nspace_tracker(void) +{ + OPAL_LIST_DESTRUCT(&localnspaces); +} + +int opal_pmix_convert_jobid(pmix_nspace_t nspace, 
opal_jobid_t jobid) +{ + opal_nptr_t *nptr; + + /* zero out the nspace */ + PMIX_LOAD_NSPACE(nspace, NULL); + + if (opal_process_info.nativelaunch) { + opal_snprintf_jobid(nspace, PMIX_MAX_NSLEN, jobid); + return OPAL_SUCCESS; + } else { + /* cycle across our list of known jobids */ + OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) { + if (jobid == nptr->jobid) { + PMIX_LOAD_NSPACE(nspace, nptr->nspace); + return OPAL_SUCCESS; + } + } + } + return OPAL_ERR_NOT_FOUND; +} + +int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace) +{ + opal_nptr_t *nptr; + opal_jobid_t jid; + + /* set a default */ + *jobid = OPAL_JOBID_INVALID; + + if (opal_process_info.nativelaunch) { + return opal_convert_string_to_jobid(jobid, nspace); + } else { + /* cycle across our list of known jobids */ + OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) { + if (PMIX_CHECK_NSPACE(nspace, nptr->nspace)) { + *jobid = nptr->jobid; + return OPAL_SUCCESS; + } + } + /* if we get here, we don't know this nspace */ + OPAL_HASH_STR(nspace, jid); + *jobid = jid; + nptr = OBJ_NEW(opal_nptr_t); + nptr->jobid = jid; + PMIX_LOAD_NSPACE(nptr->nspace, nspace); + opal_list_append(&localnspaces, &nptr->super); + } + return OPAL_SUCCESS; +} + pmix_status_t opal_pmix_convert_rc(int rc) { switch (rc) { diff --git a/opal/mca/pmix/pmix-internal.h b/opal/mca/pmix/pmix-internal.h index 3094c4d8aa..20c4d22af2 100644 --- a/opal/mca/pmix/pmix-internal.h +++ b/opal/mca/pmix/pmix-internal.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. 
* Copyright (c) 2019 Research Organization for Information Science @@ -442,9 +442,13 @@ OPAL_DECLSPEC pmix_proc_state_t opal_pmix_convert_state(int state); OPAL_DECLSPEC int opal_pmix_convert_pstate(pmix_proc_state_t); OPAL_DECLSPEC pmix_status_t opal_pmix_convert_rc(int rc); OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status); +OPAL_DECLSPEC int opal_pmix_convert_jobid(pmix_nspace_t nspace, opal_jobid_t jobid); +OPAL_DECLSPEC int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace); +OPAL_DECLSPEC void opal_pmix_setup_nspace_tracker(void); +OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void); #define OPAL_PMIX_CONVERT_JOBID(n, j) \ - (void)opal_snprintf_jobid((n), PMIX_MAX_NSLEN, (j)) + opal_pmix_convert_jobid((n), (j)) #define OPAL_PMIX_CONVERT_VPID(r, v) \ do { \ @@ -454,6 +458,7 @@ OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status); (r) = (v); \ } \ } while(0) + #define OPAL_PMIX_CONVERT_NAME(p, n) \ do { \ OPAL_PMIX_CONVERT_JOBID((p)->nspace, (n)->jobid); \ @@ -462,15 +467,17 @@ OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status); #define OPAL_PMIX_CONVERT_NSPACE(r, j, n) \ - (r) = opal_convert_string_to_jobid((j), (n)) + (r) = opal_pmix_convert_nspace((j), (n)) -#define OPAL_PMIX_CONVERT_RANK(v, r) \ - do { \ - if (PMIX_RANK_WILDCARD == (r)) { \ - (v) = OPAL_VPID_WILDCARD; \ - } else { \ - (v) = (r); \ - } \ +#define OPAL_PMIX_CONVERT_RANK(v, r) \ + do { \ + if (PMIX_RANK_WILDCARD == (r)) { \ + (v) = OPAL_VPID_WILDCARD; \ + } else if (PMIX_RANK_INVALID == (r)) { \ + (v) = OPAL_VPID_INVALID; \ + } else { \ + (v) = (r); \ + } \ } while(0) #define OPAL_PMIX_CONVERT_PROCT(r, n, p) \ diff --git a/opal/util/proc.c b/opal/util/proc.c index e94d24ee75..d1f4d84aa5 100644 --- a/opal/util/proc.c +++ b/opal/util/proc.c @@ -28,6 +28,7 @@ opal_process_name_t opal_name_wildcard = {OPAL_JOBID_WILDCARD, OPAL_VPID_WILDCAR opal_process_name_t opal_name_invalid = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID}; 
opal_process_info_t opal_process_info = { + .nativelaunch = false, .nodename = NULL, .top_session_dir = NULL, .job_session_dir = NULL, diff --git a/opal/util/proc.h b/opal/util/proc.h index 6384a7f304..bd54dcb7ec 100644 --- a/opal/util/proc.h +++ b/opal/util/proc.h @@ -23,7 +23,6 @@ #include "opal/types.h" #include "opal/dss/dss.h" - #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT #include <arpa/inet.h> #endif @@ -105,6 +104,7 @@ typedef struct { OBJ_CLASS_DECLARATION(opal_namelist_t); typedef struct opal_process_info_t { + bool nativelaunch; /**< launched by mpirun */ char *nodename; /**< string name for this node */ char *top_session_dir; /**< Top-level session directory */ char *job_session_dir; /**< Session directory for job */ diff --git a/prrte b/prrte index 53296629f9..c2e2231cc4 160000 --- a/prrte +++ b/prrte @@ -1 +1 @@ -Subproject commit 53296629f9aae70a6cd2586c77306a499e5b573a +Subproject commit c2e2231cc47c3df0fb3e40c130a9fecd1ca5cacf