1
1
Deprecate --am and --amca options

Avoid default param files on backend nodes
Any parameters in the PRRTE default or user param files will have been
picked up by prte and included in the environment sent to the prted, so
don't open those files on the backend.

Avoid picking up MCA param file info on backend
Avoid the scaling problem at PRRTE startup by only reading the system
and user param files on the frontend.

Complete revisions to cmd line parser for OMPI
Per the specification, enforce the following precedence order:

1. system-level default parameter file
2. user-level default parameter file
3. Anything found in the environment
4. "--tune" files. Note that "--amca" goes away and becomes equivalent to "--tune". It is okay if it is provided more than once on a cmd line (we will aggregate the list of files, retaining order), but it is an error if a parameter is referenced in more than one file with a different value.
5. "--mca" options. Again, it is an error if the same option appears more than once with a different value. Allowed to override a parameter referenced in a "tune" file.
6. "-x" options. Allowed to overwrite options given in a "tune" file, but cannot conflict with an explicit "--mca" option.
7. all other options

Fix special handling of "-np"

Get agreement on jobid across the layers
Need all three pieces (PRRTE, PMIx, and OPAL) to agree on the method
used to convert an nspace to a jobid

Ensure prte show_help messages get output
Print abnormal termination messages
Cleanup error reporting in persistent operations

Signed-off-by: Ralph Castain <rhc@pmix.org>

dd

Signed-off-by: Ralph Castain <rhc@pmix.org>
Этот коммит содержится в:
Ralph Castain 2020-03-25 15:43:27 -07:00
родитель f9575ed026
Коммит 1cf972dcaf
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B63B630167D26BB5
4 изменённых файлов: 58 добавлений и 30 удалений

Просмотреть файл

@ -110,18 +110,14 @@ int opal_pmix_convert_jobid(pmix_nspace_t nspace, opal_jobid_t jobid)
/* zero out the nspace */
PMIX_LOAD_NSPACE(nspace, NULL);
if (opal_process_info.nativelaunch) {
opal_snprintf_jobid(nspace, PMIX_MAX_NSLEN, jobid);
return OPAL_SUCCESS;
} else {
/* cycle across our list of known jobids */
OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) {
if (jobid == nptr->jobid) {
PMIX_LOAD_NSPACE(nspace, nptr->nspace);
return OPAL_SUCCESS;
}
/* cycle across our list of known jobids */
OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) {
if (jobid == nptr->jobid) {
PMIX_LOAD_NSPACE(nspace, nptr->nspace);
return OPAL_SUCCESS;
}
}
return OPAL_ERR_NOT_FOUND;
}
@ -129,29 +125,55 @@ int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace)
{
opal_nptr_t *nptr;
opal_jobid_t jid;
uint16_t jobfam;
uint32_t hash32, localjob = 0;
char *p = NULL;
/* set a default */
*jobid = OPAL_JOBID_INVALID;
if (opal_process_info.nativelaunch) {
return opal_convert_string_to_jobid(jobid, nspace);
} else {
/* cycle across our list of known jobids */
OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) {
if (PMIX_CHECK_NSPACE(nspace, nptr->nspace)) {
*jobid = nptr->jobid;
return OPAL_SUCCESS;
}
}
/* if we get here, we don't know this nspace */
OPAL_HASH_STR(nspace, jid);
jid &= ~(0x8000);
*jobid = jid;
nptr = OBJ_NEW(opal_nptr_t);
nptr->jobid = jid;
PMIX_LOAD_NSPACE(nptr->nspace, nspace);
opal_list_append(&localnspaces, &nptr->super);
/* if the nspace is empty, there is nothing more to do */
if (0 == strlen(nspace)) {
return OPAL_SUCCESS;
}
if (NULL != strstr(nspace, "JOBID_WILDCARD")) {
*jobid = OPAL_JOBID_WILDCARD;
return OPAL_SUCCESS;
}
if (NULL != strstr(nspace, "JOBID_INVALID")) {
*jobid = OPAL_JOBID_INVALID;
return OPAL_SUCCESS;
}
/* cycle across our list of known jobids */
OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) {
if (PMIX_CHECK_NSPACE(nspace, nptr->nspace)) {
*jobid = nptr->jobid;
return OPAL_SUCCESS;
}
}
/* if we get here, we don't know this nspace */
/* find the "." at the end that indicates the child job */
if (NULL != (p = strrchr(nspace, '.'))) {
*p = '\0';
}
OPAL_HASH_STR(nspace, hash32);
if (NULL != p) {
*p = '.';
++p;
localjob = strtoul(p, NULL, 10);
}
/* now compress to 16-bits */
jobfam = (uint16_t)(((0x0000ffff & (0xffff0000 & hash32) >> 16)) ^ (0x0000ffff & hash32));
jid = (0xffff0000 & ((uint32_t)jobfam << 16)) | (0x0000ffff & localjob);
*jobid = jid;
/* save this jobid/nspace pair */
nptr = OBJ_NEW(opal_nptr_t);
nptr->jobid = jid;
PMIX_LOAD_NSPACE(nptr->nspace, nspace);
opal_list_append(&localnspaces, &nptr->super);
return OPAL_SUCCESS;
}

Просмотреть файл

@ -595,9 +595,11 @@ OPAL_DECLSPEC int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t ns
OPAL_DECLSPEC void opal_pmix_setup_nspace_tracker(void);
OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void);
/* Convert a jobid to an nspace.
 *   n - forwarded as the first (nspace) argument of opal_pmix_convert_jobid()
 *   j - forwarded as the second (jobid) argument
 * The macro is a plain call expression, so it evaluates to whatever
 * opal_pmix_convert_jobid() returns. */
#define OPAL_PMIX_CONVERT_JOBID(n, j) \
opal_pmix_convert_jobid((n), (j))
/* convert vpid to rank */
#define OPAL_PMIX_CONVERT_VPID(r, v) \
do { \
if (OPAL_VPID_WILDCARD == (v)) { \
@ -607,6 +609,7 @@ OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void);
} \
} while(0)
/* convert opal_process_name_t to pmix_proc_t */
#define OPAL_PMIX_CONVERT_NAME(p, n) \
do { \
OPAL_PMIX_CONVERT_JOBID((p)->nspace, (n)->jobid); \
@ -614,9 +617,11 @@ OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void);
} while(0)
/* Convert an nspace to a jobid.
 *   r - lvalue that is assigned the return value of opal_pmix_convert_nspace()
 *   j - forwarded as the first argument (an opal_jobid_t * per the prototype)
 *   n - forwarded as the second (nspace) argument
 * Note this expands to an assignment expression, not a do { } while(0) block. */
#define OPAL_PMIX_CONVERT_NSPACE(r, j, n) \
(r) = opal_pmix_convert_nspace((j), (n))
/* convert pmix rank to opal vpid */
#define OPAL_PMIX_CONVERT_RANK(v, r) \
do { \
if (PMIX_RANK_WILDCARD == (r)) { \
@ -628,6 +633,7 @@ OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void);
} \
} while(0)
/* convert pmix_proc_t to opal_process_name_t */
#define OPAL_PMIX_CONVERT_PROCT(r, n, p) \
do { \
OPAL_PMIX_CONVERT_NSPACE((r), &(n)->jobid, (p)->nspace); \

@ -1 +1 @@
Subproject commit a18e53138298d61a01fec4471518140304539e8c
Subproject commit 4c62a26b319ba78feadc42679200e93041f611a2

2
prrte

@ -1 +1 @@
Subproject commit cdea5231171b2fdea11269033de9e265fc7f3a63
Subproject commit 8d673047b325a148f55c65e049aab67f1de1d318