Resolve the naming issue when direct-launched by PMIx-enabled RMs using a minimal-impact approach. Detect if we were launched via ORTE - if so, then use our standard methods for computing the jobid. If not, then just hash the nspace to create the jobid, and track the jobid <-> nspace correspondece down in the opal/mca/pmix/pmix1xx component. We then do the translation any time a function that passes process names is invoked.
Этот коммит содержится в:
родитель
d8ba1cf426
Коммит
fad5638596
@ -23,6 +23,7 @@
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include "opal/hash_string.h"
|
||||
#include "opal/util/proc.h"
|
||||
|
||||
#include "opal/mca/pmix/base/base.h"
|
||||
@ -30,8 +31,19 @@
|
||||
#include "opal/mca/pmix/pmix1xx/pmix/include/pmix.h"
|
||||
#include "opal/mca/pmix/pmix1xx/pmix/src/buffer_ops/buffer_ops.h"
|
||||
|
||||
static pmix_proc_t myproc;
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
opal_jobid_t jobid;
|
||||
char nspace[PMIX_MAX_NSLEN + 1];
|
||||
} opal_pmix1_jobid_trkr_t;
|
||||
static OBJ_CLASS_INSTANCE(opal_pmix1_jobid_trkr_t,
|
||||
opal_list_item_t,
|
||||
NULL, NULL);
|
||||
|
||||
static pmix_proc_t my_proc;
|
||||
static char *dbgvalue=NULL;
|
||||
static opal_list_t jobids;
|
||||
static bool native_launch = false;
|
||||
|
||||
static void myerr(pmix_status_t status,
|
||||
pmix_proc_t procs[], size_t nprocs,
|
||||
@ -79,18 +91,29 @@ int pmix1_client_init(void)
|
||||
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
|
||||
"PMIx_client init");
|
||||
|
||||
OBJ_CONSTRUCT(&jobids, opal_list_t);
|
||||
|
||||
if (0 < (dbg = opal_output_get_verbosity(opal_pmix_base_framework.framework_output))) {
|
||||
asprintf(&dbgvalue, "PMIX_DEBUG=%d", dbg);
|
||||
putenv(dbgvalue);
|
||||
}
|
||||
rc = PMIx_Init(&myproc);
|
||||
rc = PMIx_Init(&my_proc);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
return pmix1_convert_rc(rc);
|
||||
}
|
||||
|
||||
/* store our jobid and rank */
|
||||
opal_convert_string_to_jobid(&pname.jobid, myproc.nspace);
|
||||
pname.vpid = myproc.rank;
|
||||
if (NULL != getenv(OPAL_MCA_PREFIX"orte_launch")) {
|
||||
/* if we were launched by the OMPI RTE, then
|
||||
* the jobid is in a special format - so get it */
|
||||
native_launch = true;
|
||||
opal_convert_string_to_jobid(&pname.jobid, my_proc.nspace);
|
||||
} else {
|
||||
/* we were launched by someone else, so make the
|
||||
* jobid just be the hash of the nspace */
|
||||
OPAL_HASH_STR(my_proc.nspace, pname.jobid);
|
||||
}
|
||||
pname.vpid = my_proc.rank;
|
||||
opal_proc_set_name(&pname);
|
||||
|
||||
/* register the errhandler */
|
||||
@ -110,6 +133,9 @@ int pmix1_client_finalize(void)
|
||||
PMIx_Deregister_errhandler();
|
||||
|
||||
rc = PMIx_Finalize();
|
||||
|
||||
OPAL_LIST_DESTRUCT(&jobids);
|
||||
|
||||
return pmix1_convert_rc(rc);
|
||||
}
|
||||
|
||||
@ -131,7 +157,7 @@ int pmix1_abort(int flag, const char *msg,
|
||||
pmix_proc_t *parray=NULL;
|
||||
size_t n, cnt=0;
|
||||
opal_namelist_t *ptr;
|
||||
char *tmp;
|
||||
opal_pmix1_jobid_trkr_t *job;
|
||||
|
||||
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
|
||||
"PMIx_client abort");
|
||||
@ -142,9 +168,20 @@ int pmix1_abort(int flag, const char *msg,
|
||||
PMIX_PROC_CREATE(parray, cnt);
|
||||
n=0;
|
||||
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
|
||||
tmp = opal_convert_jobid_to_string(ptr->name.jobid);
|
||||
(void)strncpy(parray[n].nspace, tmp, PMIX_MAX_NSLEN);
|
||||
free(tmp);
|
||||
/* if the jobid is my own, then we can just use
|
||||
* my namespace */
|
||||
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
|
||||
(void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
|
||||
} else {
|
||||
/* look thru our list of jobids and find the
|
||||
* corresponding nspace */
|
||||
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
|
||||
if (job->jobid == ptr->name.jobid) {
|
||||
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
parray[n].rank = ptr->name.vpid;
|
||||
++n;
|
||||
}
|
||||
@ -184,7 +221,7 @@ int pmix1_fence(opal_list_t *procs, int collect_data)
|
||||
size_t n, cnt=0;
|
||||
opal_namelist_t *ptr;
|
||||
pmix_info_t info, *iptr;
|
||||
char *tmp;
|
||||
opal_pmix1_jobid_trkr_t *job;
|
||||
|
||||
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
|
||||
"PMIx_client fence");
|
||||
@ -195,9 +232,20 @@ int pmix1_fence(opal_list_t *procs, int collect_data)
|
||||
PMIX_PROC_CREATE(parray, cnt);
|
||||
n=0;
|
||||
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
|
||||
tmp = opal_convert_jobid_to_string(ptr->name.jobid);
|
||||
(void)strncpy(parray[n].nspace, tmp, PMIX_MAX_NSLEN);
|
||||
free(tmp);
|
||||
/* if the jobid is my own, then we can just use
|
||||
* my namespace */
|
||||
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
|
||||
(void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
|
||||
} else {
|
||||
/* look thru our list of jobids and find the
|
||||
* corresponding nspace */
|
||||
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
|
||||
if (job->jobid == ptr->name.jobid) {
|
||||
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
parray[n].rank = ptr->name.vpid;
|
||||
++n;
|
||||
}
|
||||
@ -236,7 +284,7 @@ int pmix1_fencenb(opal_list_t *procs, int collect_data,
|
||||
opal_namelist_t *ptr;
|
||||
pmix1_opcaddy_t *op;
|
||||
pmix_info_t info, *iptr;
|
||||
char *tmp;
|
||||
opal_pmix1_jobid_trkr_t *job;
|
||||
|
||||
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
|
||||
"PMIx_client fence_nb");
|
||||
@ -247,9 +295,20 @@ int pmix1_fencenb(opal_list_t *procs, int collect_data,
|
||||
PMIX_PROC_CREATE(parray, cnt);
|
||||
n=0;
|
||||
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
|
||||
tmp = opal_convert_jobid_to_string(ptr->name.jobid);
|
||||
(void)strncpy(parray[n].nspace, tmp, PMIX_MAX_NSLEN);
|
||||
free(tmp);
|
||||
/* if the jobid is my own, then we can just use
|
||||
* my namespace */
|
||||
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
|
||||
(void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
|
||||
} else {
|
||||
/* look thru our list of jobids and find the
|
||||
* corresponding nspace */
|
||||
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
|
||||
if (job->jobid == ptr->name.jobid) {
|
||||
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
parray[n].rank = ptr->name.vpid;
|
||||
++n;
|
||||
}
|
||||
@ -309,7 +368,7 @@ int pmix1_get(const opal_process_name_t *proc, const char *key,
|
||||
size_t ninfo, n;
|
||||
pmix_info_t *pinfo;
|
||||
opal_value_t *ival;
|
||||
char *tmp;
|
||||
opal_pmix1_jobid_trkr_t *job;
|
||||
|
||||
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
|
||||
"%s PMIx_client get on proc %s key %s",
|
||||
@ -319,23 +378,33 @@ int pmix1_get(const opal_process_name_t *proc, const char *key,
|
||||
/* prep default response */
|
||||
*val = NULL;
|
||||
if (NULL != proc) {
|
||||
/* convert the process jobid */
|
||||
tmp = opal_convert_jobid_to_string(proc->jobid);
|
||||
(void)strncpy(p.nspace, tmp, PMIX_MAX_NSLEN);
|
||||
free(tmp);
|
||||
/* if the jobid is my own, then we can just use
|
||||
* my namespace */
|
||||
if (OPAL_PROC_MY_NAME.jobid == proc->jobid) {
|
||||
(void)strncpy(p.nspace, my_proc.nspace, PMIX_MAX_NSLEN);
|
||||
} else {
|
||||
/* look thru our list of jobids and find the
|
||||
* corresponding nspace */
|
||||
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
|
||||
if (job->jobid == proc->jobid) {
|
||||
(void)strncpy(p.nspace, job->nspace, PMIX_MAX_NSLEN);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
p.rank = proc->vpid;
|
||||
pptr = &p;
|
||||
} else {
|
||||
/* if they are asking for our jobid, then return our nspace */
|
||||
/* if they are asking for our jobid, then return it */
|
||||
if (0 == strcmp(key, OPAL_PMIX_JOBID)) {
|
||||
(*val) = OBJ_NEW(opal_value_t);
|
||||
(*val)->type = OPAL_STRING;
|
||||
(*val)->data.string = strdup(myproc.nspace);
|
||||
(*val)->type = OPAL_UINT32;
|
||||
(*val)->data.uint32 = OPAL_PROC_MY_NAME.jobid;
|
||||
return OPAL_SUCCESS;
|
||||
} else if (0 == strcmp(key, OPAL_PMIX_RANK)) {
|
||||
(*val) = OBJ_NEW(opal_value_t);
|
||||
(*val)->type = OPAL_INT;
|
||||
(*val)->data.integer = myproc.rank;
|
||||
(*val)->data.integer = my_proc.rank;
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
pptr = NULL;
|
||||
@ -400,9 +469,9 @@ int pmix1_getnb(const opal_process_name_t *proc, const char *key,
|
||||
{
|
||||
pmix1_opcaddy_t *op;
|
||||
pmix_status_t rc;
|
||||
char *tmp;
|
||||
size_t n;
|
||||
opal_value_t *ival;
|
||||
opal_pmix1_jobid_trkr_t *job;
|
||||
|
||||
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
|
||||
"%s PMIx_client get_nb on proc %s key %s",
|
||||
@ -415,13 +484,23 @@ int pmix1_getnb(const opal_process_name_t *proc, const char *key,
|
||||
op->cbdata = cbdata;
|
||||
|
||||
if (NULL != proc) {
|
||||
/* convert the process jobid */
|
||||
tmp = strdup(opal_convert_jobid_to_string(proc->jobid));
|
||||
(void)strncpy(op->p.nspace, tmp, PMIX_MAX_NSLEN);
|
||||
free(tmp);
|
||||
/* if the jobid is my own, then we can just use
|
||||
* my namespace */
|
||||
if (OPAL_PROC_MY_NAME.jobid == proc->jobid) {
|
||||
(void)strncpy(op->p.nspace, my_proc.nspace, PMIX_MAX_NSLEN);
|
||||
} else {
|
||||
/* look thru our list of jobids and find the
|
||||
* corresponding nspace */
|
||||
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
|
||||
if (job->jobid == proc->jobid) {
|
||||
(void)strncpy(op->p.nspace, job->nspace, PMIX_MAX_NSLEN);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
op->p.rank = proc->vpid;
|
||||
} else {
|
||||
(void)strncpy(op->p.nspace, myproc.nspace, PMIX_MAX_NSLEN);
|
||||
(void)strncpy(op->p.nspace, my_proc.nspace, PMIX_MAX_NSLEN);
|
||||
op->p.rank = PMIX_RANK_WILDCARD;
|
||||
}
|
||||
|
||||
@ -559,11 +638,14 @@ int pmix1_lookup(opal_list_t *data, opal_list_t *info)
|
||||
/* transfer the data back */
|
||||
n=0;
|
||||
OPAL_LIST_FOREACH(d, data, opal_pmix_pdata_t) {
|
||||
rc = opal_convert_string_to_jobid(&d->proc.jobid, pdata[n].proc.nspace);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
OPAL_ERROR_LOG(rc);
|
||||
PMIX_PDATA_FREE(pdata, sz);
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
if (native_launch) {
|
||||
/* if we were launched by the OMPI RTE, then
|
||||
* the jobid is in a special format - so get it */
|
||||
opal_convert_string_to_jobid(&d->proc.jobid, pdata[n].proc.nspace);
|
||||
} else {
|
||||
/* we were launched by someone else, so make the
|
||||
* jobid just be the hash of the nspace */
|
||||
OPAL_HASH_STR(pdata[n].proc.nspace, d->proc.jobid);
|
||||
}
|
||||
if (PMIX_RANK_WILDCARD == pdata[n].proc.rank) {
|
||||
d->proc.vpid = OPAL_VPID_WILDCARD;
|
||||
@ -604,11 +686,14 @@ static void lk_cbfunc(pmix_status_t status,
|
||||
for (n=0; n < ndata; n++) {
|
||||
d = OBJ_NEW(opal_pmix_pdata_t);
|
||||
opal_list_append(&results, &d->super);
|
||||
rc = opal_convert_string_to_jobid(&d->proc.jobid, data[n].proc.nspace);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
rc = OPAL_ERR_BAD_PARAM;
|
||||
OPAL_ERROR_LOG(rc);
|
||||
goto release;
|
||||
if (native_launch) {
|
||||
/* if we were launched by the OMPI RTE, then
|
||||
* the jobid is in a special format - so get it */
|
||||
opal_convert_string_to_jobid(&d->proc.jobid, data[n].proc.nspace);
|
||||
} else {
|
||||
/* we were launched by someone else, so make the
|
||||
* jobid just be the hash of the nspace */
|
||||
OPAL_HASH_STR(data[n].proc.nspace, d->proc.jobid);
|
||||
}
|
||||
if (PMIX_RANK_WILDCARD == data[n].proc.rank) {
|
||||
d->proc.vpid = OPAL_VPID_WILDCARD;
|
||||
@ -740,6 +825,7 @@ int pmix1_spawn(opal_list_t *job_info, opal_list_t *apps, opal_jobid_t *jobid)
|
||||
char nspace[PMIX_MAX_NSLEN+1];
|
||||
opal_value_t *info;
|
||||
opal_pmix_app_t *app;
|
||||
opal_pmix1_jobid_trkr_t *job;
|
||||
|
||||
if (NULL != job_info && 0 < (ninfo = opal_list_get_size(job_info))) {
|
||||
PMIX_INFO_CREATE(pinfo, ninfo);
|
||||
@ -774,7 +860,20 @@ int pmix1_spawn(opal_list_t *job_info, opal_list_t *apps, opal_jobid_t *jobid)
|
||||
|
||||
ret = PMIx_Spawn(pinfo, ninfo, papps, napps, nspace);
|
||||
if (PMIX_SUCCESS == ret) {
|
||||
(void)opal_convert_string_to_jobid(jobid, nspace);
|
||||
if (native_launch) {
|
||||
/* if we were launched by the OMPI RTE, then
|
||||
* the jobid is in a special format - so get it */
|
||||
opal_convert_string_to_jobid(jobid, nspace);
|
||||
} else {
|
||||
/* we were launched by someone else, so make the
|
||||
* jobid just be the hash of the nspace */
|
||||
OPAL_HASH_STR(nspace, *jobid);
|
||||
/* add this to our jobid tracker */
|
||||
job = OBJ_NEW(opal_pmix1_jobid_trkr_t);
|
||||
(void)strncpy(job->nspace, nspace, PMIX_MAX_NSLEN);
|
||||
job->jobid = *jobid;
|
||||
opal_list_append(&jobids, &job->super);
|
||||
}
|
||||
}
|
||||
PMIX_APP_FREE(papps, napps);
|
||||
|
||||
@ -787,10 +886,24 @@ static void spcbfunc(pmix_status_t status,
|
||||
pmix1_opcaddy_t *op = (pmix1_opcaddy_t*)cbdata;
|
||||
int rc;
|
||||
opal_jobid_t jobid=OPAL_JOBID_INVALID;
|
||||
opal_pmix1_jobid_trkr_t *job;
|
||||
|
||||
rc = pmix1_convert_rc(status);
|
||||
if (PMIX_SUCCESS == status) {
|
||||
rc = opal_convert_string_to_jobid(&jobid, nspace);
|
||||
if (native_launch) {
|
||||
/* if we were launched by the OMPI RTE, then
|
||||
* the jobid is in a special format - so get it */
|
||||
opal_convert_string_to_jobid(&jobid, nspace);
|
||||
} else {
|
||||
/* we were launched by someone else, so make the
|
||||
* jobid just be the hash of the nspace */
|
||||
OPAL_HASH_STR(nspace, jobid);
|
||||
/* add this to our jobid tracker */
|
||||
job = OBJ_NEW(opal_pmix1_jobid_trkr_t);
|
||||
(void)strncpy(job->nspace, nspace, PMIX_MAX_NSLEN);
|
||||
job->jobid = jobid;
|
||||
opal_list_append(&jobids, &job->super);
|
||||
}
|
||||
}
|
||||
|
||||
op->spcbfunc(rc, jobid, op->cbdata);
|
||||
@ -853,7 +966,7 @@ int pmix1_connect(opal_list_t *procs)
|
||||
pmix_proc_t *parray=NULL;
|
||||
size_t n, cnt=0;
|
||||
opal_namelist_t *ptr;
|
||||
char *strname;
|
||||
opal_pmix1_jobid_trkr_t *job;
|
||||
|
||||
/* protect against bozo error */
|
||||
if (NULL == procs || 0 == (cnt = opal_list_get_size(procs))) {
|
||||
@ -865,9 +978,20 @@ int pmix1_connect(opal_list_t *procs)
|
||||
PMIX_PROC_CREATE(parray, cnt);
|
||||
n=0;
|
||||
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
|
||||
strname = opal_convert_jobid_to_string(ptr->name.jobid);
|
||||
(void)strncpy(parray[n].nspace, strname, PMIX_MAX_NSLEN);
|
||||
free(strname);
|
||||
/* if the jobid is my own, then we can just use
|
||||
* my namespace */
|
||||
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
|
||||
(void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
|
||||
} else {
|
||||
/* look thru our list of jobids and find the
|
||||
* corresponding nspace */
|
||||
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
|
||||
if (job->jobid == ptr->name.jobid) {
|
||||
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (OPAL_VPID_WILDCARD == ptr->name.vpid) {
|
||||
parray[n].rank = PMIX_RANK_WILDCARD;
|
||||
} else {
|
||||
@ -890,7 +1014,7 @@ int pmix1_connectnb(opal_list_t *procs,
|
||||
size_t n, cnt=0;
|
||||
opal_namelist_t *ptr;
|
||||
pmix1_opcaddy_t *op;
|
||||
char *strname;
|
||||
opal_pmix1_jobid_trkr_t *job;
|
||||
|
||||
/* protect against bozo error */
|
||||
if (NULL == procs || 0 == (cnt = opal_list_get_size(procs))) {
|
||||
@ -908,9 +1032,20 @@ int pmix1_connectnb(opal_list_t *procs,
|
||||
PMIX_PROC_CREATE(op->procs, op->nprocs);
|
||||
n=0;
|
||||
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
|
||||
strname = opal_convert_jobid_to_string(ptr->name.jobid);
|
||||
(void)strncpy(op->procs[n].nspace, strname, PMIX_MAX_NSLEN);
|
||||
free(strname);
|
||||
/* if the jobid is my own, then we can just use
|
||||
* my namespace */
|
||||
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
|
||||
(void)strncpy(op->procs[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
|
||||
} else {
|
||||
/* look thru our list of jobids and find the
|
||||
* corresponding nspace */
|
||||
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
|
||||
if (job->jobid == ptr->name.jobid) {
|
||||
(void)strncpy(op->procs[n].nspace, job->nspace, PMIX_MAX_NSLEN);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (OPAL_VPID_WILDCARD == ptr->name.vpid) {
|
||||
op->procs[n].rank = PMIX_RANK_WILDCARD;
|
||||
} else {
|
||||
@ -930,7 +1065,7 @@ int pmix1_disconnect(opal_list_t *procs)
|
||||
pmix_proc_t *parray=NULL;
|
||||
size_t n, cnt=0;
|
||||
opal_namelist_t *ptr;
|
||||
char *tmp;
|
||||
opal_pmix1_jobid_trkr_t *job;
|
||||
|
||||
/* protect against bozo error */
|
||||
if (NULL == procs || 0 == (cnt = opal_list_get_size(procs))) {
|
||||
@ -942,9 +1077,20 @@ int pmix1_disconnect(opal_list_t *procs)
|
||||
PMIX_PROC_CREATE(parray, cnt);
|
||||
n=0;
|
||||
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
|
||||
tmp = opal_convert_jobid_to_string(ptr->name.jobid);
|
||||
(void)strncpy(parray[n].nspace, tmp, PMIX_MAX_NSLEN);
|
||||
free(tmp);
|
||||
/* if the jobid is my own, then we can just use
|
||||
* my namespace */
|
||||
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
|
||||
(void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
|
||||
} else {
|
||||
/* look thru our list of jobids and find the
|
||||
* corresponding nspace */
|
||||
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
|
||||
if (job->jobid == ptr->name.jobid) {
|
||||
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (OPAL_VPID_WILDCARD == ptr->name.vpid) {
|
||||
parray[n].rank = PMIX_RANK_WILDCARD;
|
||||
} else {
|
||||
@ -967,7 +1113,7 @@ int pmix1_disconnectnb(opal_list_t *procs,
|
||||
size_t n, cnt=0;
|
||||
opal_namelist_t *ptr;
|
||||
pmix1_opcaddy_t *op;
|
||||
char *strname;
|
||||
opal_pmix1_jobid_trkr_t *job;
|
||||
|
||||
/* protect against bozo error */
|
||||
if (NULL == procs || 0 == (cnt = opal_list_get_size(procs))) {
|
||||
@ -985,9 +1131,20 @@ int pmix1_disconnectnb(opal_list_t *procs,
|
||||
PMIX_PROC_CREATE(op->procs, op->nprocs);
|
||||
n=0;
|
||||
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
|
||||
strname = opal_convert_jobid_to_string(ptr->name.jobid);
|
||||
(void)strncpy(op->procs[n].nspace, strname, PMIX_MAX_NSLEN);
|
||||
free(strname);
|
||||
/* if the jobid is my own, then we can just use
|
||||
* my namespace */
|
||||
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
|
||||
(void)strncpy(op->procs[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
|
||||
} else {
|
||||
/* look thru our list of jobids and find the
|
||||
* corresponding nspace */
|
||||
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
|
||||
if (job->jobid == ptr->name.jobid) {
|
||||
(void)strncpy(op->procs[n].nspace, job->nspace, PMIX_MAX_NSLEN);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (OPAL_VPID_WILDCARD == ptr->name.vpid) {
|
||||
op->procs[n].rank = PMIX_RANK_WILDCARD;
|
||||
} else {
|
||||
@ -1028,10 +1185,14 @@ int pmix1_resolve_peers(const char *nodename, opal_jobid_t jobid,
|
||||
for (n=0; n < nprocs; n++) {
|
||||
nm = OBJ_NEW(opal_namelist_t);
|
||||
opal_list_append(procs, &nm->super);
|
||||
rc = opal_convert_string_to_jobid(&nm->name.jobid, array[n].nspace);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
PMIX_PROC_FREE(array, nprocs);
|
||||
return rc;
|
||||
if (native_launch) {
|
||||
/* if we were launched by the OMPI RTE, then
|
||||
* the jobid is in a special format - so get it */
|
||||
opal_convert_string_to_jobid(&nm->name.jobid, array[n].nspace);
|
||||
} else {
|
||||
/* we were launched by someone else, so make the
|
||||
* jobid just be the hash of the nspace */
|
||||
OPAL_HASH_STR(array[n].nspace, nm->name.jobid);
|
||||
}
|
||||
nm->name.vpid = array[n].rank;
|
||||
}
|
||||
@ -1044,12 +1205,24 @@ int pmix1_resolve_peers(const char *nodename, opal_jobid_t jobid,
|
||||
int pmix1_resolve_nodes(opal_jobid_t jobid, char **nodelist)
|
||||
{
|
||||
pmix_status_t ret;
|
||||
char *nspace;
|
||||
char *nspace=NULL;
|
||||
opal_pmix1_jobid_trkr_t *job;
|
||||
|
||||
if (OPAL_JOBID_WILDCARD == jobid) {
|
||||
nspace = NULL;
|
||||
} else {
|
||||
nspace = opal_convert_jobid_to_string(jobid);
|
||||
if (OPAL_JOBID_WILDCARD != jobid) {
|
||||
/* if the jobid is my own, then we can just use
|
||||
* my namespace */
|
||||
if (OPAL_PROC_MY_NAME.jobid == jobid) {
|
||||
nspace = strdup(my_proc.nspace);
|
||||
} else {
|
||||
/* look thru our list of jobids and find the
|
||||
* corresponding nspace */
|
||||
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
|
||||
if (job->jobid == jobid) {
|
||||
nspace = strdup(job->nspace);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ret = PMIx_Resolve_nodes(nspace, nodelist);
|
||||
|
@ -44,7 +44,7 @@ BEGIN_C_DECLS
|
||||
#define OPAL_PMIX_PROCDIR "pmix.pdir" // (char*) sub-nsdir assigned to proc
|
||||
|
||||
/* information about relative ranks as assigned by the RM */
|
||||
#define OPAL_PMIX_JOBID "pmix.jobid" // (char*) jobid assigned by scheduler
|
||||
#define OPAL_PMIX_JOBID "pmix.jobid" // (uint32_t) jobid assigned by scheduler
|
||||
#define OPAL_PMIX_APPNUM "pmix.appnum" // (uint32_t) app number within the job
|
||||
#define OPAL_PMIX_RANK "pmix.rank" // (uint32_t) process rank within the job
|
||||
#define OPAL_PMIX_GLOBAL_RANK "pmix.grank" // (uint32_t) rank spanning across all jobs in this session
|
||||
|
@ -233,8 +233,8 @@ static int s1_init(void)
|
||||
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(OPAL_PMIX_JOBID);
|
||||
kv.type = OPAL_STRING;
|
||||
kv.data.string = pmix_id;
|
||||
kv.type = OPAL_UINT32;
|
||||
kv.data.uint32 = s1_pname.jobid;
|
||||
if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) {
|
||||
OPAL_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
|
@ -242,8 +242,8 @@ static int s2_init(void)
|
||||
*/
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(OPAL_PMIX_JOBID);
|
||||
kv.type = OPAL_STRING;
|
||||
kv.data.string = pmix_kvs_name;
|
||||
kv.type = OPAL_UINT32;
|
||||
kv.data.uint32 = s2_pname.jobid;
|
||||
if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) {
|
||||
OPAL_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
|
@ -861,6 +861,8 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
continue;
|
||||
}
|
||||
/* tell the child that it is being launched via ORTE */
|
||||
opal_setenv(OPAL_MCA_PREFIX"orte_launch", "1", true, &app->env);
|
||||
|
||||
/* ensure we clear any prior info regarding state or exit status in
|
||||
* case this is a restart
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user