1
1

Create a hack to protect against non-integer jobids

If someone gives us a namespace that doesn't easily translate to an
integer, we have to create a mechanism for working around the
disconnect. PRRTE has been updated to give us a flag so we know when we
were "natively" launched. If we don't see that flag, we fall back to
generating a hash of the nspace as our jobid. We then have to translate
back and forth between nspace and jobid using a lookup table.

This is probably not the right long-term solution, but hopefully it
helps get us through for a bit.

Includes update of PRRTE pointer

Signed-off-by: Ralph Castain <rhc@pmix.org>
Этот коммит содержится в:
Ralph Castain 2020-02-20 12:50:20 -08:00
родитель dd5991f513
Коммит 829fd478b3
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B63B630167D26BB5
6 изменённых файлов: 111 добавлений и 16 удалений

Просмотреть файл

@ -65,6 +65,8 @@ pmix_process_info_t pmix_process_info = {0};
bool pmix_proc_is_bound = false; bool pmix_proc_is_bound = false;
bool ompi_singleton = false; bool ompi_singleton = false;
static pmix_proc_t myprocid;
static bool added_transport_keys = false; static bool added_transport_keys = false;
static bool added_num_procs = false; static bool added_num_procs = false;
static bool added_app_ctx = false; static bool added_app_ctx = false;
@ -498,7 +500,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
int ret; int ret;
char *error = NULL; char *error = NULL;
opal_process_name_t pname; opal_process_name_t pname;
pmix_proc_t myproc, rproc; pmix_proc_t rproc;
int u32, *u32ptr; int u32, *u32ptr;
uint16_t u16, *u16ptr; uint16_t u16, *u16ptr;
char **peers=NULL; char **peers=NULL;
@ -530,8 +532,11 @@ int ompi_rte_init(int *pargc, char ***pargv)
goto error; goto error;
} }
/* setup our internal nspace hack */
opal_pmix_setup_nspace_tracker();
/* initialize the selected module */ /* initialize the selected module */
if (!PMIx_Initialized() && (PMIX_SUCCESS != (ret = PMIx_Init(&myproc, NULL, 0)))) { if (!PMIx_Initialized() && (PMIX_SUCCESS != (ret = PMIx_Init(&myprocid, NULL, 0)))) {
/* we cannot run - this could be due to being direct launched /* we cannot run - this could be due to being direct launched
* without the required PMI support being built, so print * without the required PMI support being built, so print
* out a help message indicating it */ * out a help message indicating it */
@ -539,8 +544,8 @@ int ompi_rte_init(int *pargc, char ***pargv)
return OPAL_ERR_SILENT; return OPAL_ERR_SILENT;
} }
/* setup the process name fields */ /* setup the process name fields - also registers the new nspace */
OPAL_PMIX_CONVERT_PROCT(rc, &pname, &myproc); OPAL_PMIX_CONVERT_PROCT(rc, &pname, &myprocid);
if (OPAL_SUCCESS != rc) { if (OPAL_SUCCESS != rc) {
return rc; return rc;
} }
@ -548,6 +553,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
OPAL_PROC_MY_NAME.vpid = pname.vpid; OPAL_PROC_MY_NAME.vpid = pname.vpid;
pmix_process_info.my_name.jobid = OPAL_PROC_MY_NAME.jobid; pmix_process_info.my_name.jobid = OPAL_PROC_MY_NAME.jobid;
pmix_process_info.my_name.vpid = OPAL_PROC_MY_NAME.vpid; pmix_process_info.my_name.vpid = OPAL_PROC_MY_NAME.vpid;
/* set our hostname */ /* set our hostname */
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &OPAL_PROC_MY_NAME, OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &OPAL_PROC_MY_NAME,
(char**)&ev1, PMIX_STRING); (char**)&ev1, PMIX_STRING);
@ -828,6 +834,10 @@ int ompi_rte_finalize(void)
free (pmix_process_info.cpuset); free (pmix_process_info.cpuset);
pmix_process_info.cpuset = NULL; pmix_process_info.cpuset = NULL;
/* cleanup our internal nspace hack */
opal_pmix_finalize_nspace_tracker();
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }

Просмотреть файл

@ -77,6 +77,83 @@ int opal_pmix_base_exchange(pmix_info_t *indat,
return opal_pmix_convert_status(rc); return opal_pmix_convert_status(rc);
} }
/* One entry of the local nspace <-> jobid lookup table: pairs a PMIx
 * namespace string with the OPAL jobid this process uses for it. */
typedef struct {
opal_list_item_t super;   /* list linkage; entries are appended via &nptr->super */
pmix_nspace_t nspace;     /* namespace string as handed to opal_pmix_convert_nspace() */
opal_jobid_t jobid;       /* jobid assigned to that namespace (hash of the string) */
} opal_nptr_t;
/* Class definition for opal_nptr_t, derived from opal_list_item_t.
 * NOTE(review): the two NULL arguments are presumably the constructor and
 * destructor slots (none needed here) - confirm against the OPAL object system. */
static OBJ_CLASS_INSTANCE(opal_nptr_t,
opal_list_item_t,
NULL, NULL);
/* File-local table of known nspace/jobid pairs. Constructed in
 * opal_pmix_setup_nspace_tracker(), torn down in
 * opal_pmix_finalize_nspace_tracker(), and only searched/extended when
 * opal_process_info.nativelaunch is false. */
static opal_list_t localnspaces;
/**
 * Prepare the nspace <-> jobid translation machinery.
 *
 * Constructs the (initially empty) localnspaces list and records whether
 * the PRRTE_LAUNCHED environment variable is present. The nativelaunch
 * flag is only ever raised here, never cleared, so a value of true set
 * earlier is left untouched when the variable is absent.
 */
void opal_pmix_setup_nspace_tracker(void)
{
    const char *prrte_flag = getenv("PRRTE_LAUNCHED");

    OBJ_CONSTRUCT(&localnspaces, opal_list_t);

    /* only the presence of the variable matters, not its value */
    if (NULL != prrte_flag) {
        opal_process_info.nativelaunch = true;
    }
}
/* Tear down the nspace/jobid lookup table; counterpart of
 * opal_pmix_setup_nspace_tracker().
 * NOTE(review): OPAL_LIST_DESTRUCT is expected to release every entry that
 * opal_pmix_convert_nspace() appended before destructing the list itself -
 * confirm against opal_list.h. */
void opal_pmix_finalize_nspace_tracker(void)
{
OPAL_LIST_DESTRUCT(&localnspaces);
}
/**
 * Translate an OPAL jobid into the PMIx namespace string it stands for.
 *
 * @param nspace  [out] receives the namespace; it is cleared first, so it
 *                stays cleared when the lookup fails
 * @param jobid   [in]  jobid to translate
 *
 * @return OPAL_SUCCESS when nspace was filled in, OPAL_ERR_NOT_FOUND when
 *         we were not natively launched and the jobid is not in the local
 *         lookup table.
 */
int opal_pmix_convert_jobid(pmix_nspace_t nspace, opal_jobid_t jobid)
{
    opal_nptr_t *entry;

    /* start from a zeroed-out nspace */
    PMIX_LOAD_NSPACE(nspace, NULL);

    /* native launch: the nspace is just the printed form of the jobid */
    if (opal_process_info.nativelaunch) {
        opal_snprintf_jobid(nspace, PMIX_MAX_NSLEN, jobid);
        return OPAL_SUCCESS;
    }

    /* otherwise search the pairs recorded by opal_pmix_convert_nspace() */
    OPAL_LIST_FOREACH(entry, &localnspaces, opal_nptr_t) {
        if (entry->jobid != jobid) {
            continue;
        }
        PMIX_LOAD_NSPACE(nspace, entry->nspace);
        return OPAL_SUCCESS;
    }

    return OPAL_ERR_NOT_FOUND;
}
/**
 * Translate a PMIx namespace string into an OPAL jobid.
 *
 * When natively launched, the conversion is delegated to
 * opal_convert_string_to_jobid(). Otherwise the local lookup table is
 * consulted; an nspace that is not yet known is hashed, and the resulting
 * nspace/jobid pair is appended to the table so later calls (and
 * opal_pmix_convert_jobid()) can find it.
 *
 * @param jobid   [out] receives the jobid; preset to OPAL_JOBID_INVALID
 * @param nspace  [in]  namespace to translate
 *
 * @return the result of opal_convert_string_to_jobid() on the native
 *         path, OPAL_SUCCESS otherwise.
 *
 * NOTE(review): the OBJ_NEW() result is used without a NULL check, and no
 * check is made for two different nspaces hashing to the same jobid.
 */
int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace)
{
    opal_nptr_t *entry;
    opal_jobid_t hashed;

    /* default answer until proven otherwise */
    *jobid = OPAL_JOBID_INVALID;

    if (opal_process_info.nativelaunch) {
        return opal_convert_string_to_jobid(jobid, nspace);
    }

    /* have we seen this nspace before? */
    OPAL_LIST_FOREACH(entry, &localnspaces, opal_nptr_t) {
        if (PMIX_CHECK_NSPACE(nspace, entry->nspace)) {
            *jobid = entry->jobid;
            return OPAL_SUCCESS;
        }
    }

    /* unknown nspace: derive a jobid from its hash and remember the pair */
    OPAL_HASH_STR(nspace, hashed);

    entry = OBJ_NEW(opal_nptr_t);
    entry->jobid = hashed;
    PMIX_LOAD_NSPACE(entry->nspace, nspace);
    opal_list_append(&localnspaces, &entry->super);

    *jobid = hashed;
    return OPAL_SUCCESS;
}
pmix_status_t opal_pmix_convert_rc(int rc) pmix_status_t opal_pmix_convert_rc(int rc)
{ {
switch (rc) { switch (rc) {

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2019 Research Organization for Information Science * Copyright (c) 2019 Research Organization for Information Science
@ -442,9 +442,13 @@ OPAL_DECLSPEC pmix_proc_state_t opal_pmix_convert_state(int state);
OPAL_DECLSPEC int opal_pmix_convert_pstate(pmix_proc_state_t); OPAL_DECLSPEC int opal_pmix_convert_pstate(pmix_proc_state_t);
OPAL_DECLSPEC pmix_status_t opal_pmix_convert_rc(int rc); OPAL_DECLSPEC pmix_status_t opal_pmix_convert_rc(int rc);
OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status); OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status);
OPAL_DECLSPEC int opal_pmix_convert_jobid(pmix_nspace_t nspace, opal_jobid_t jobid);
OPAL_DECLSPEC int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace);
OPAL_DECLSPEC void opal_pmix_setup_nspace_tracker(void);
OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void);
#define OPAL_PMIX_CONVERT_JOBID(n, j) \ #define OPAL_PMIX_CONVERT_JOBID(n, j) \
(void)opal_snprintf_jobid((n), PMIX_MAX_NSLEN, (j)) opal_pmix_convert_jobid((n), (j))
#define OPAL_PMIX_CONVERT_VPID(r, v) \ #define OPAL_PMIX_CONVERT_VPID(r, v) \
do { \ do { \
@ -454,6 +458,7 @@ OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status);
(r) = (v); \ (r) = (v); \
} \ } \
} while(0) } while(0)
#define OPAL_PMIX_CONVERT_NAME(p, n) \ #define OPAL_PMIX_CONVERT_NAME(p, n) \
do { \ do { \
OPAL_PMIX_CONVERT_JOBID((p)->nspace, (n)->jobid); \ OPAL_PMIX_CONVERT_JOBID((p)->nspace, (n)->jobid); \
@ -462,15 +467,17 @@ OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status);
#define OPAL_PMIX_CONVERT_NSPACE(r, j, n) \ #define OPAL_PMIX_CONVERT_NSPACE(r, j, n) \
(r) = opal_convert_string_to_jobid((j), (n)) (r) = opal_pmix_convert_nspace((j), (n))
#define OPAL_PMIX_CONVERT_RANK(v, r) \ #define OPAL_PMIX_CONVERT_RANK(v, r) \
do { \ do { \
if (PMIX_RANK_WILDCARD == (r)) { \ if (PMIX_RANK_WILDCARD == (r)) { \
(v) = OPAL_VPID_WILDCARD; \ (v) = OPAL_VPID_WILDCARD; \
} else { \ } else if (PMIX_RANK_INVALID == (r)) { \
(v) = (r); \ (v) = OPAL_VPID_INVALID; \
} \ } else { \
(v) = (r); \
} \
} while(0) } while(0)
#define OPAL_PMIX_CONVERT_PROCT(r, n, p) \ #define OPAL_PMIX_CONVERT_PROCT(r, n, p) \

Просмотреть файл

@ -28,6 +28,7 @@ opal_process_name_t opal_name_wildcard = {OPAL_JOBID_WILDCARD, OPAL_VPID_WILDCAR
opal_process_name_t opal_name_invalid = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID}; opal_process_name_t opal_name_invalid = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID};
opal_process_info_t opal_process_info = { opal_process_info_t opal_process_info = {
.nativelaunch = false,
.nodename = NULL, .nodename = NULL,
.top_session_dir = NULL, .top_session_dir = NULL,
.job_session_dir = NULL, .job_session_dir = NULL,

Просмотреть файл

@ -23,7 +23,6 @@
#include "opal/types.h" #include "opal/types.h"
#include "opal/dss/dss.h" #include "opal/dss/dss.h"
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
#include <arpa/inet.h> #include <arpa/inet.h>
#endif #endif
@ -105,6 +104,7 @@ typedef struct {
OBJ_CLASS_DECLARATION(opal_namelist_t); OBJ_CLASS_DECLARATION(opal_namelist_t);
typedef struct opal_process_info_t { typedef struct opal_process_info_t {
bool nativelaunch; /**< launched by mpirun */
char *nodename; /**< string name for this node */ char *nodename; /**< string name for this node */
char *top_session_dir; /**< Top-level session directory */ char *top_session_dir; /**< Top-level session directory */
char *job_session_dir; /**< Session directory for job */ char *job_session_dir; /**< Session directory for job */

2
prrte

@ -1 +1 @@
Subproject commit 53296629f9aae70a6cd2586c77306a499e5b573a Subproject commit c2e2231cc47c3df0fb3e40c130a9fecd1ca5cacf