1
1

Merge pull request #7441 from rhc54/topic/hack

Create a hack to protect against non-integer jobids
Этот коммит содержится в:
Ralph Castain 2020-02-21 11:28:51 -08:00 коммит произвёл GitHub
родитель 6d34b064be 829fd478b3
Коммит f9643b84b9
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
6 изменённых файлов: 111 добавлений и 16 удалений

Просмотреть файл

@ -65,6 +65,8 @@ pmix_process_info_t pmix_process_info = {0};
bool pmix_proc_is_bound = false;
bool ompi_singleton = false;
static pmix_proc_t myprocid;
static bool added_transport_keys = false;
static bool added_num_procs = false;
static bool added_app_ctx = false;
@ -498,7 +500,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
int ret;
char *error = NULL;
opal_process_name_t pname;
pmix_proc_t myproc, rproc;
pmix_proc_t rproc;
int u32, *u32ptr;
uint16_t u16, *u16ptr;
char **peers=NULL;
@ -530,8 +532,11 @@ int ompi_rte_init(int *pargc, char ***pargv)
goto error;
}
/* setup our internal nspace hack */
opal_pmix_setup_nspace_tracker();
/* initialize the selected module */
if (!PMIx_Initialized() && (PMIX_SUCCESS != (ret = PMIx_Init(&myproc, NULL, 0)))) {
if (!PMIx_Initialized() && (PMIX_SUCCESS != (ret = PMIx_Init(&myprocid, NULL, 0)))) {
/* we cannot run - this could be due to being direct launched
* without the required PMI support being built, so print
* out a help message indicating it */
@ -539,8 +544,8 @@ int ompi_rte_init(int *pargc, char ***pargv)
return OPAL_ERR_SILENT;
}
/* setup the process name fields */
OPAL_PMIX_CONVERT_PROCT(rc, &pname, &myproc);
/* setup the process name fields - also registers the new nspace */
OPAL_PMIX_CONVERT_PROCT(rc, &pname, &myprocid);
if (OPAL_SUCCESS != rc) {
return rc;
}
@ -548,6 +553,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
OPAL_PROC_MY_NAME.vpid = pname.vpid;
pmix_process_info.my_name.jobid = OPAL_PROC_MY_NAME.jobid;
pmix_process_info.my_name.vpid = OPAL_PROC_MY_NAME.vpid;
/* set our hostname */
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &OPAL_PROC_MY_NAME,
(char**)&ev1, PMIX_STRING);
@ -828,6 +834,10 @@ int ompi_rte_finalize(void)
free (pmix_process_info.cpuset);
pmix_process_info.cpuset = NULL;
/* cleanup our internal nspace hack */
opal_pmix_finalize_nspace_tracker();
return OMPI_SUCCESS;
}

Просмотреть файл

@ -77,6 +77,83 @@ int opal_pmix_base_exchange(pmix_info_t *indat,
return opal_pmix_convert_status(rc);
}
/*
 * One entry in the local nspace tracker: associates a PMIx namespace
 * string with the OPAL jobid used for it inside this process.
 * Entries are kept on the file-scope "localnspaces" list below.
 */
typedef struct {
/* list linkage - lets the entry be appended to an opal_list_t */
opal_list_item_t super;
/* PMIx namespace string for the job */
pmix_nspace_t nspace;
/* OPAL jobid that stands in for that namespace */
opal_jobid_t jobid;
} opal_nptr_t;
/* NOTE(review): the two NULL arguments are presumably the constructor and
 * destructor hooks of the OPAL class system (none supplied) - confirm
 * against the OBJ_CLASS_INSTANCE definition */
static OBJ_CLASS_INSTANCE(opal_nptr_t,
opal_list_item_t,
NULL, NULL);
/* all nspace/jobid pairs recorded so far; constructed in
 * opal_pmix_setup_nspace_tracker() and destructed in
 * opal_pmix_finalize_nspace_tracker() */
static opal_list_t localnspaces;
/*
 * Prepare the nspace <-> jobid tracker for use.
 *
 * Constructs the file-scope list of known namespaces and raises the
 * opal_process_info.nativelaunch flag when the PRRTE_LAUNCHED
 * environment variable is present. The flag is only ever set here,
 * never cleared.
 */
void opal_pmix_setup_nspace_tracker(void)
{
    const char *prrte_marker = getenv("PRRTE_LAUNCHED");

    OBJ_CONSTRUCT(&localnspaces, opal_list_t);

    /* only the presence of the variable matters, not its value */
    if (NULL != prrte_marker) {
        opal_process_info.nativelaunch = true;
    }
}
/*
 * Tear down the nspace <-> jobid tracker by destructing the
 * file-scope "localnspaces" list.
 *
 * NOTE(review): OPAL_LIST_DESTRUCT presumably also releases every
 * entry still on the list - confirm against its definition.
 */
void opal_pmix_finalize_nspace_tracker(void)
{
OPAL_LIST_DESTRUCT(&localnspaces);
}
/*
 * Translate an OPAL jobid into a PMIx namespace string.
 *
 * nspace - output buffer; it is always cleared before anything else
 * jobid  - the jobid to translate
 *
 * Returns OPAL_SUCCESS when nspace has been filled in. Returns
 * OPAL_ERR_NOT_FOUND when the process was not natively launched and
 * no tracker entry carries this jobid; nspace is left cleared then.
 */
int opal_pmix_convert_jobid(pmix_nspace_t nspace, opal_jobid_t jobid)
{
    opal_nptr_t *entry;

    /* start from an empty nspace so a failed lookup hands back a clean buffer */
    PMIX_LOAD_NSPACE(nspace, NULL);

    /* native launch: let opal_snprintf_jobid format the jobid straight
     * into the nspace buffer */
    if (opal_process_info.nativelaunch) {
        opal_snprintf_jobid(nspace, PMIX_MAX_NSLEN, jobid);
        return OPAL_SUCCESS;
    }

    /* otherwise the pairing must already be on our list of known jobids */
    OPAL_LIST_FOREACH(entry, &localnspaces, opal_nptr_t) {
        if (entry->jobid != jobid) {
            continue;
        }
        PMIX_LOAD_NSPACE(nspace, entry->nspace);
        return OPAL_SUCCESS;
    }

    return OPAL_ERR_NOT_FOUND;
}
/*
 * Translate a PMIx namespace string into an OPAL jobid.
 *
 * jobid  - output; set to OPAL_JOBID_INVALID up front and only
 *          overwritten once a translation is available
 * nspace - the namespace to translate
 *
 * When natively launched, the conversion is delegated to
 * opal_convert_string_to_jobid() and its return code is passed through.
 * Otherwise the tracker list is searched; an nspace that has not been
 * seen before gets a jobid derived by hashing the string, and the new
 * pairing is appended to the list so later lookups (in either
 * direction) find it.
 *
 * Returns OPAL_SUCCESS on success, OPAL_ERR_OUT_OF_RESOURCE if a new
 * tracker entry could not be allocated (jobid stays
 * OPAL_JOBID_INVALID), or whatever opal_convert_string_to_jobid()
 * returns in the native-launch case.
 */
int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace)
{
    opal_nptr_t *entry;
    opal_jobid_t hashed;

    /* set a default */
    *jobid = OPAL_JOBID_INVALID;

    if (opal_process_info.nativelaunch) {
        return opal_convert_string_to_jobid(jobid, nspace);
    }

    /* cycle across our list of known nspaces */
    OPAL_LIST_FOREACH(entry, &localnspaces, opal_nptr_t) {
        if (PMIX_CHECK_NSPACE(nspace, entry->nspace)) {
            *jobid = entry->jobid;
            return OPAL_SUCCESS;
        }
    }

    /* if we get here, we don't know this nspace - create a tracker
     * entry for it, guarding against allocation failure before the
     * object is touched */
    entry = OBJ_NEW(opal_nptr_t);
    if (NULL == entry) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    OPAL_HASH_STR(nspace, hashed);
    entry->jobid = hashed;
    PMIX_LOAD_NSPACE(entry->nspace, nspace);
    opal_list_append(&localnspaces, &entry->super);

    /* publish the jobid only once the pairing has been recorded */
    *jobid = hashed;
    return OPAL_SUCCESS;
}
pmix_status_t opal_pmix_convert_rc(int rc)
{
switch (rc) {

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2019 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2019 Research Organization for Information Science
@ -442,9 +442,13 @@ OPAL_DECLSPEC pmix_proc_state_t opal_pmix_convert_state(int state);
OPAL_DECLSPEC int opal_pmix_convert_pstate(pmix_proc_state_t);
OPAL_DECLSPEC pmix_status_t opal_pmix_convert_rc(int rc);
OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status);
OPAL_DECLSPEC int opal_pmix_convert_jobid(pmix_nspace_t nspace, opal_jobid_t jobid);
OPAL_DECLSPEC int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace);
OPAL_DECLSPEC void opal_pmix_setup_nspace_tracker(void);
OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void);
#define OPAL_PMIX_CONVERT_JOBID(n, j) \
(void)opal_snprintf_jobid((n), PMIX_MAX_NSLEN, (j))
opal_pmix_convert_jobid((n), (j))
#define OPAL_PMIX_CONVERT_VPID(r, v) \
do { \
@ -454,6 +458,7 @@ OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status);
(r) = (v); \
} \
} while(0)
#define OPAL_PMIX_CONVERT_NAME(p, n) \
do { \
OPAL_PMIX_CONVERT_JOBID((p)->nspace, (n)->jobid); \
@ -462,15 +467,17 @@ OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status);
#define OPAL_PMIX_CONVERT_NSPACE(r, j, n) \
(r) = opal_convert_string_to_jobid((j), (n))
(r) = opal_pmix_convert_nspace((j), (n))
#define OPAL_PMIX_CONVERT_RANK(v, r) \
do { \
if (PMIX_RANK_WILDCARD == (r)) { \
(v) = OPAL_VPID_WILDCARD; \
} else { \
(v) = (r); \
} \
#define OPAL_PMIX_CONVERT_RANK(v, r) \
do { \
if (PMIX_RANK_WILDCARD == (r)) { \
(v) = OPAL_VPID_WILDCARD; \
} else if (PMIX_RANK_INVALID == (r)) { \
(v) = OPAL_VPID_INVALID; \
} else { \
(v) = (r); \
} \
} while(0)
#define OPAL_PMIX_CONVERT_PROCT(r, n, p) \

Просмотреть файл

@ -28,6 +28,7 @@ opal_process_name_t opal_name_wildcard = {OPAL_JOBID_WILDCARD, OPAL_VPID_WILDCAR
opal_process_name_t opal_name_invalid = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID};
opal_process_info_t opal_process_info = {
.nativelaunch = false,
.nodename = NULL,
.top_session_dir = NULL,
.job_session_dir = NULL,

Просмотреть файл

@ -23,7 +23,6 @@
#include "opal/types.h"
#include "opal/dss/dss.h"
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
#include <arpa/inet.h>
#endif
@ -105,6 +104,7 @@ typedef struct {
OBJ_CLASS_DECLARATION(opal_namelist_t);
typedef struct opal_process_info_t {
bool nativelaunch; /**< launched by mpirun */
char *nodename; /**< string name for this node */
char *top_session_dir; /**< Top-level session directory */
char *job_session_dir; /**< Session directory for job */

2
prrte

@ -1 +1 @@
Subproject commit 53296629f9aae70a6cd2586c77306a499e5b573a
Subproject commit c2e2231cc47c3df0fb3e40c130a9fecd1ca5cacf