Create a hack to protect against non-integer jobids
If someone gives us a namespace that doesn't easily translate to an integer, we have to create a mechanism for working around the disconnect. PRRTE has been updated to give us a flag so we know we were "natively" launched. If we don't see that flag, we fall back to generating a hash of the nspace and using it as our jobid. We then have to translate back and forth between nspace and jobid using a lookup table. This is probably not the right long-term solution, but it hopefully helps get us through for a bit. Includes an update of the PRRTE submodule pointer. Signed-off-by: Ralph Castain <rhc@pmix.org>
Этот коммит содержится в:
родитель
dd5991f513
Коммит
829fd478b3
@ -65,6 +65,8 @@ pmix_process_info_t pmix_process_info = {0};
|
|||||||
bool pmix_proc_is_bound = false;
|
bool pmix_proc_is_bound = false;
|
||||||
bool ompi_singleton = false;
|
bool ompi_singleton = false;
|
||||||
|
|
||||||
|
static pmix_proc_t myprocid;
|
||||||
|
|
||||||
static bool added_transport_keys = false;
|
static bool added_transport_keys = false;
|
||||||
static bool added_num_procs = false;
|
static bool added_num_procs = false;
|
||||||
static bool added_app_ctx = false;
|
static bool added_app_ctx = false;
|
||||||
@ -498,7 +500,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
|||||||
int ret;
|
int ret;
|
||||||
char *error = NULL;
|
char *error = NULL;
|
||||||
opal_process_name_t pname;
|
opal_process_name_t pname;
|
||||||
pmix_proc_t myproc, rproc;
|
pmix_proc_t rproc;
|
||||||
int u32, *u32ptr;
|
int u32, *u32ptr;
|
||||||
uint16_t u16, *u16ptr;
|
uint16_t u16, *u16ptr;
|
||||||
char **peers=NULL;
|
char **peers=NULL;
|
||||||
@ -530,8 +532,11 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* setup our internal nspace hack */
|
||||||
|
opal_pmix_setup_nspace_tracker();
|
||||||
|
|
||||||
/* initialize the selected module */
|
/* initialize the selected module */
|
||||||
if (!PMIx_Initialized() && (PMIX_SUCCESS != (ret = PMIx_Init(&myproc, NULL, 0)))) {
|
if (!PMIx_Initialized() && (PMIX_SUCCESS != (ret = PMIx_Init(&myprocid, NULL, 0)))) {
|
||||||
/* we cannot run - this could be due to being direct launched
|
/* we cannot run - this could be due to being direct launched
|
||||||
* without the required PMI support being built, so print
|
* without the required PMI support being built, so print
|
||||||
* out a help message indicating it */
|
* out a help message indicating it */
|
||||||
@ -539,8 +544,8 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
|||||||
return OPAL_ERR_SILENT;
|
return OPAL_ERR_SILENT;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* setup the process name fields */
|
/* setup the process name fields - also registers the new nspace */
|
||||||
OPAL_PMIX_CONVERT_PROCT(rc, &pname, &myproc);
|
OPAL_PMIX_CONVERT_PROCT(rc, &pname, &myprocid);
|
||||||
if (OPAL_SUCCESS != rc) {
|
if (OPAL_SUCCESS != rc) {
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
@ -548,6 +553,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
|||||||
OPAL_PROC_MY_NAME.vpid = pname.vpid;
|
OPAL_PROC_MY_NAME.vpid = pname.vpid;
|
||||||
pmix_process_info.my_name.jobid = OPAL_PROC_MY_NAME.jobid;
|
pmix_process_info.my_name.jobid = OPAL_PROC_MY_NAME.jobid;
|
||||||
pmix_process_info.my_name.vpid = OPAL_PROC_MY_NAME.vpid;
|
pmix_process_info.my_name.vpid = OPAL_PROC_MY_NAME.vpid;
|
||||||
|
|
||||||
/* set our hostname */
|
/* set our hostname */
|
||||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &OPAL_PROC_MY_NAME,
|
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &OPAL_PROC_MY_NAME,
|
||||||
(char**)&ev1, PMIX_STRING);
|
(char**)&ev1, PMIX_STRING);
|
||||||
@ -828,6 +834,10 @@ int ompi_rte_finalize(void)
|
|||||||
free (pmix_process_info.cpuset);
|
free (pmix_process_info.cpuset);
|
||||||
pmix_process_info.cpuset = NULL;
|
pmix_process_info.cpuset = NULL;
|
||||||
|
|
||||||
|
/* cleanup our internal nspace hack */
|
||||||
|
opal_pmix_finalize_nspace_tracker();
|
||||||
|
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -77,6 +77,83 @@ int opal_pmix_base_exchange(pmix_info_t *indat,
|
|||||||
return opal_pmix_convert_status(rc);
|
return opal_pmix_convert_status(rc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* One entry of the local nspace <-> jobid lookup table: pairs a PMIx
 * namespace with the OPAL jobid used to represent it. */
typedef struct {
|
||||||
|
opal_list_item_t super;  /* list linkage; entries are appended to localnspaces via &entry->super */
|
||||||
|
pmix_nspace_t nspace;  /* namespace as given to us by PMIx */
|
||||||
|
opal_jobid_t jobid;  /* jobid associated with that namespace */
|
||||||
|
} opal_nptr_t;
|
||||||
|
/* NOTE(review): the two NULL arguments are presumably the (absent)
 * constructor and destructor - confirm against OBJ_CLASS_INSTANCE. */
static OBJ_CLASS_INSTANCE(opal_nptr_t,
|
||||||
|
opal_list_item_t,
|
||||||
|
NULL, NULL);
|
||||||
|
|
||||||
|
/* Known nspace/jobid pairs; constructed in opal_pmix_setup_nspace_tracker()
 * and destructed in opal_pmix_finalize_nspace_tracker(). */
static opal_list_t localnspaces;
|
||||||
|
|
||||||
|
void opal_pmix_setup_nspace_tracker(void)
|
||||||
|
{
|
||||||
|
/* check if we were launched by PRRTE */
|
||||||
|
if (NULL != getenv("PRRTE_LAUNCHED")) {
|
||||||
|
opal_process_info.nativelaunch = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
OBJ_CONSTRUCT(&localnspaces, opal_list_t);
|
||||||
|
}
|
||||||
|
|
||||||
|
void opal_pmix_finalize_nspace_tracker(void)
|
||||||
|
{
|
||||||
|
OPAL_LIST_DESTRUCT(&localnspaces);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Translate an OPAL jobid into the corresponding PMIx nspace.
 *
 * @param nspace  output buffer; always cleared first, then filled on success
 * @param jobid   jobid to translate
 *
 * @return OPAL_SUCCESS when the nspace was filled in, OPAL_ERR_NOT_FOUND
 *         when we were not natively launched and the jobid is not in the
 *         local lookup list.
 */
int opal_pmix_convert_jobid(pmix_nspace_t nspace, opal_jobid_t jobid)
{
    opal_nptr_t *entry;

    /* start from a zeroed nspace so a failed lookup leaves nothing stale */
    PMIX_LOAD_NSPACE(nspace, NULL);

    /* native launch: the nspace is simply the printed form of the jobid */
    if (opal_process_info.nativelaunch) {
        opal_snprintf_jobid(nspace, PMIX_MAX_NSLEN, jobid);
        return OPAL_SUCCESS;
    }

    /* otherwise search the table of nspaces we have already seen */
    OPAL_LIST_FOREACH(entry, &localnspaces, opal_nptr_t) {
        if (entry->jobid == jobid) {
            PMIX_LOAD_NSPACE(nspace, entry->nspace);
            return OPAL_SUCCESS;
        }
    }

    return OPAL_ERR_NOT_FOUND;
}
|
||||||
|
|
||||||
|
/*
 * Translate a PMIx nspace into an OPAL jobid.
 *
 * When natively launched the nspace is parsed directly as a jobid string.
 * Otherwise the local lookup list is searched; an nspace seen for the first
 * time is hashed to produce a jobid and the pair is recorded so that later
 * conversions in either direction agree.
 *
 * @param jobid   output; set to OPAL_JOBID_INVALID on entry and overwritten
 *                only when a translation is produced
 * @param nspace  namespace to translate
 *
 * @return OPAL_SUCCESS on success, the result of
 *         opal_convert_string_to_jobid() in the native case, or
 *         OPAL_ERR_OUT_OF_RESOURCE if a new table entry cannot be allocated.
 */
int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace)
{
    opal_nptr_t *nptr;
    opal_jobid_t jid;

    /* set a default */
    *jobid = OPAL_JOBID_INVALID;

    if (opal_process_info.nativelaunch) {
        return opal_convert_string_to_jobid(jobid, nspace);
    }

    /* cycle across our list of known nspaces */
    OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) {
        if (PMIX_CHECK_NSPACE(nspace, nptr->nspace)) {
            *jobid = nptr->jobid;
            return OPAL_SUCCESS;
        }
    }

    /* if we get here, we don't know this nspace - record it */
    nptr = OBJ_NEW(opal_nptr_t);
    if (NULL == nptr) {
        /* allocation failed: report it instead of dereferencing NULL;
         * *jobid keeps its OPAL_JOBID_INVALID default */
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* NOTE(review): two different nspaces could hash to the same jobid, in
     * which case opal_pmix_convert_jobid() returns whichever was recorded
     * first - collisions are not detected here. */
    OPAL_HASH_STR(nspace, jid);
    nptr->jobid = jid;
    PMIX_LOAD_NSPACE(nptr->nspace, nspace);
    opal_list_append(&localnspaces, &nptr->super);

    *jobid = jid;
    return OPAL_SUCCESS;
}
|
||||||
|
|
||||||
pmix_status_t opal_pmix_convert_rc(int rc)
|
pmix_status_t opal_pmix_convert_rc(int rc)
|
||||||
{
|
{
|
||||||
switch (rc) {
|
switch (rc) {
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2014-2019 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2019 Research Organization for Information Science
|
* Copyright (c) 2019 Research Organization for Information Science
|
||||||
@ -442,9 +442,13 @@ OPAL_DECLSPEC pmix_proc_state_t opal_pmix_convert_state(int state);
|
|||||||
OPAL_DECLSPEC int opal_pmix_convert_pstate(pmix_proc_state_t);
|
OPAL_DECLSPEC int opal_pmix_convert_pstate(pmix_proc_state_t);
|
||||||
OPAL_DECLSPEC pmix_status_t opal_pmix_convert_rc(int rc);
|
OPAL_DECLSPEC pmix_status_t opal_pmix_convert_rc(int rc);
|
||||||
OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status);
|
OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status);
|
||||||
|
OPAL_DECLSPEC int opal_pmix_convert_jobid(pmix_nspace_t nspace, opal_jobid_t jobid);
|
||||||
|
OPAL_DECLSPEC int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace);
|
||||||
|
OPAL_DECLSPEC void opal_pmix_setup_nspace_tracker(void);
|
||||||
|
OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void);
|
||||||
|
|
||||||
#define OPAL_PMIX_CONVERT_JOBID(n, j) \
|
#define OPAL_PMIX_CONVERT_JOBID(n, j) \
|
||||||
(void)opal_snprintf_jobid((n), PMIX_MAX_NSLEN, (j))
|
opal_pmix_convert_jobid((n), (j))
|
||||||
|
|
||||||
#define OPAL_PMIX_CONVERT_VPID(r, v) \
|
#define OPAL_PMIX_CONVERT_VPID(r, v) \
|
||||||
do { \
|
do { \
|
||||||
@ -454,6 +458,7 @@ OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status);
|
|||||||
(r) = (v); \
|
(r) = (v); \
|
||||||
} \
|
} \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define OPAL_PMIX_CONVERT_NAME(p, n) \
|
#define OPAL_PMIX_CONVERT_NAME(p, n) \
|
||||||
do { \
|
do { \
|
||||||
OPAL_PMIX_CONVERT_JOBID((p)->nspace, (n)->jobid); \
|
OPAL_PMIX_CONVERT_JOBID((p)->nspace, (n)->jobid); \
|
||||||
@ -462,15 +467,17 @@ OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status);
|
|||||||
|
|
||||||
|
|
||||||
#define OPAL_PMIX_CONVERT_NSPACE(r, j, n) \
|
#define OPAL_PMIX_CONVERT_NSPACE(r, j, n) \
|
||||||
(r) = opal_convert_string_to_jobid((j), (n))
|
(r) = opal_pmix_convert_nspace((j), (n))
|
||||||
|
|
||||||
#define OPAL_PMIX_CONVERT_RANK(v, r) \
|
#define OPAL_PMIX_CONVERT_RANK(v, r) \
|
||||||
do { \
|
do { \
|
||||||
if (PMIX_RANK_WILDCARD == (r)) { \
|
if (PMIX_RANK_WILDCARD == (r)) { \
|
||||||
(v) = OPAL_VPID_WILDCARD; \
|
(v) = OPAL_VPID_WILDCARD; \
|
||||||
} else { \
|
} else if (PMIX_RANK_INVALID == (r)) { \
|
||||||
(v) = (r); \
|
(v) = OPAL_VPID_INVALID; \
|
||||||
} \
|
} else { \
|
||||||
|
(v) = (r); \
|
||||||
|
} \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define OPAL_PMIX_CONVERT_PROCT(r, n, p) \
|
#define OPAL_PMIX_CONVERT_PROCT(r, n, p) \
|
||||||
|
@ -28,6 +28,7 @@ opal_process_name_t opal_name_wildcard = {OPAL_JOBID_WILDCARD, OPAL_VPID_WILDCAR
|
|||||||
opal_process_name_t opal_name_invalid = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID};
|
opal_process_name_t opal_name_invalid = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID};
|
||||||
|
|
||||||
opal_process_info_t opal_process_info = {
|
opal_process_info_t opal_process_info = {
|
||||||
|
.nativelaunch = false,
|
||||||
.nodename = NULL,
|
.nodename = NULL,
|
||||||
.top_session_dir = NULL,
|
.top_session_dir = NULL,
|
||||||
.job_session_dir = NULL,
|
.job_session_dir = NULL,
|
||||||
|
@ -23,7 +23,6 @@
|
|||||||
#include "opal/types.h"
|
#include "opal/types.h"
|
||||||
#include "opal/dss/dss.h"
|
#include "opal/dss/dss.h"
|
||||||
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||||
#include <arpa/inet.h>
|
#include <arpa/inet.h>
|
||||||
#endif
|
#endif
|
||||||
@ -105,6 +104,7 @@ typedef struct {
|
|||||||
OBJ_CLASS_DECLARATION(opal_namelist_t);
|
OBJ_CLASS_DECLARATION(opal_namelist_t);
|
||||||
|
|
||||||
typedef struct opal_process_info_t {
|
typedef struct opal_process_info_t {
|
||||||
|
bool nativelaunch; /**< launched by mpirun */
|
||||||
char *nodename; /**< string name for this node */
|
char *nodename; /**< string name for this node */
|
||||||
char *top_session_dir; /**< Top-level session directory */
|
char *top_session_dir; /**< Top-level session directory */
|
||||||
char *job_session_dir; /**< Session directory for job */
|
char *job_session_dir; /**< Session directory for job */
|
||||||
|
2
prrte
2
prrte
@ -1 +1 @@
|
|||||||
Subproject commit 53296629f9aae70a6cd2586c77306a499e5b573a
|
Subproject commit c2e2231cc47c3df0fb3e40c130a9fecd1ca5cacf
|
Загрузка…
x
Ссылка в новой задаче
Block a user