1
1

Clean up the code a bit by simply adding our own nspace to the top of the list of jobid <-> nspace correlations. Add two new APIs to opal_pmix: one for registering new jobid/nspace pairs and one for retrieving the nspace of a given jobid - these are required to support connect/accept. No impact on the PMIx library.

Этот коммит содержится в:
Ralph Castain 2015-09-28 00:39:14 -07:00
родитель f713e71d51
Коммит a4a3dfd480
9 изменённых файлов: 349 добавлений и 199 удалений

Просмотреть файл

@ -139,7 +139,13 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
opal_argv_append_nosize(&members, nstring);
free(nstring);
/* have to add the number of procs in the job so the remote side
* can correctly add the procs by computing their names */
* can correctly add the procs by computing their names, and our nspace
* so they can update their records */
if (NULL == (nstring = (char*)opal_pmix.get_nspace(OMPI_PROC_MY_NAME->jobid))) {
opal_argv_free(members);
return OMPI_ERR_NOT_SUPPORTED;
}
opal_argv_append_nosize(&members, nstring);
(void)asprintf(&nstring, "%d", size);
opal_argv_append_nosize(&members, nstring);
free(nstring);
@ -171,6 +177,11 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
}
opal_argv_append_nosize(&members, nstring);
free(nstring);
if (NULL == (nstring = (char*)opal_pmix.get_nspace(proc_list[i]->super.proc_name.jobid))) {
opal_argv_free(members);
return OMPI_ERR_NOT_SUPPORTED;
}
opal_argv_append_nosize(&members, nstring);
}
if (!dense) {
free(proc_list);
@ -246,6 +257,17 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
OPAL_LIST_DESTRUCT(&mlist);
goto exit;
}
/* step over the nspace */
++i;
if (NULL == members[i]) {
/* this shouldn't happen and is an error */
OMPI_ERROR_LOG(OMPI_ERR_BAD_PARAM);
OPAL_LIST_DESTRUCT(&mlist);
opal_argv_free(members);
free(rport);
rc = OMPI_ERR_BAD_PARAM;
goto exit;
}
/* if the rank is wildcard, then we need to add all procs
* in that job to the list */
if (OPAL_VPID_WILDCARD == nm->name.vpid) {
@ -295,6 +317,16 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
OPAL_LIST_DESTRUCT(&rlist);
goto exit;
}
/* next entry is the nspace - register it */
++i;
if (NULL == members[i]) {
OMPI_ERROR_LOG(OMPI_ERR_NOT_SUPPORTED);
opal_argv_free(members);
OPAL_LIST_DESTRUCT(&ilist);
OPAL_LIST_DESTRUCT(&rlist);
goto exit;
}
opal_pmix.register_jobid(nm->name.jobid, members[i]);
if (OPAL_VPID_WILDCARD == nm->name.vpid) {
jobid = nm->name.jobid;
OBJ_RELEASE(nm);

Просмотреть файл

@ -78,6 +78,9 @@ static int cray_unpublish_nb(char **keys, opal_list_t *info,
static const char *cray_get_version(void);
static int cray_store_local(const opal_process_name_t *proc,
opal_value_t *val);
static const char *cray_get_nspace(opal_jobid_t jobid);
static void cray_register_jobid(opal_jobid_t jobid, const char *nspace);
#if 0
static bool cray_get_attr(const char *attr, opal_value_t **kv);
#endif
@ -109,7 +112,9 @@ const opal_pmix_base_module_t opal_pmix_cray_module = {
.get_version = cray_get_version,
.register_errhandler = opal_pmix_base_register_handler,
.deregister_errhandler = opal_pmix_base_deregister_handler,
.store_local = cray_store_local
.store_local = cray_store_local,
.get_nspace = cray_get_nspace,
.register_jobid = cray_register_jobid
};
// usage accounting
@ -814,6 +819,16 @@ static int cray_store_local(const opal_process_name_t *proc,
return OPAL_SUCCESS;
}
/*
 * Look up the nspace string associated with a jobid.
 *
 * Stub implementation: the jobid is ignored and NULL is always
 * returned.
 */
static const char *cray_get_nspace(opal_jobid_t jobid)
{
    const char *nspace = NULL;

    (void)jobid;
    return nspace;
}
/*
 * Register a jobid/nspace pair.
 *
 * Stub implementation: both arguments are ignored and nothing is
 * recorded.
 */
static void cray_register_jobid(opal_jobid_t jobid, const char *nspace)
{
    (void)jobid;
    (void)nspace;
}
static char* pmix_error(int pmix_err)
{
char * err_msg;

Просмотреть файл

@ -701,6 +701,12 @@ typedef void (*opal_pmix_base_module_deregister_fn_t)(void);
typedef int (*opal_pmix_base_module_store_fn_t)(const opal_process_name_t *proc,
opal_value_t *val);
/* Retrieve the nspace corresponding to a given jobid.
 *
 * @param jobid  the jobid whose nspace is requested
 * @return the nspace as a const string, or NULL when the module has
 *         no nspace to report for that jobid
 */
typedef const char* (*opal_pmix_base_module_get_nspace_fn_t)(opal_jobid_t jobid);
/* Register a jobid-to-nspace pair.
 *
 * @param jobid   the jobid being registered
 * @param nspace  the nspace string that corresponds to that jobid
 *
 * No status is returned, so callers cannot detect a failed
 * registration through this interface.
 */
typedef void (*opal_pmix_base_module_register_jobid_fn_t)(opal_jobid_t jobid, const char *nspace);
/*
* the standard public API data structure
*/
@ -745,6 +751,8 @@ typedef struct {
opal_pmix_base_module_register_fn_t register_errhandler;
opal_pmix_base_module_deregister_fn_t deregister_errhandler;
opal_pmix_base_module_store_fn_t store_local;
opal_pmix_base_module_get_nspace_fn_t get_nspace;
opal_pmix_base_module_register_jobid_fn_t register_jobid;
} opal_pmix_base_module_t;
typedef struct {

Просмотреть файл

@ -30,11 +30,24 @@
BEGIN_C_DECLS
OPAL_DECLSPEC extern opal_pmix_base_component_t mca_pmix_pmix1_component;
/* Component structure for the pmix1xx component: the base PMIx
 * component plus state shared by the component's client and
 * common code. */
typedef struct {
/* the base PMIx component */
opal_pmix_base_component_t super;
/* list of opal_pmix1_jobid_trkr_t items, one per known
 * jobid <-> nspace correlation */
opal_list_t jobids;
/* true when the jobid is obtained by converting the nspace string
 * directly; false when the jobid is a hash of the nspace */
bool native_launch;
} mca_pmix_pmix1_component_t;
OPAL_DECLSPEC extern mca_pmix_pmix1_component_t mca_pmix_pmix1xx_component;
OPAL_DECLSPEC extern const opal_pmix_base_module_t opal_pmix_pmix1xx_module;
/**** INTERNAL OBJECTS ****/
/* Tracker for a single jobid <-> nspace correlation; instances are
 * kept on the component's jobids list. */
typedef struct {
/* list linkage */
opal_list_item_t super;
/* the OPAL jobid */
opal_jobid_t jobid;
/* the PMIx nspace string; the buffer holds PMIX_MAX_NSLEN
 * characters plus one extra byte */
char nspace[PMIX_MAX_NSLEN + 1];
} opal_pmix1_jobid_trkr_t;
OBJ_CLASS_DECLARATION(opal_pmix1_jobid_trkr_t);
typedef struct {
opal_object_t super;
pmix_proc_t p;

Просмотреть файл

@ -31,19 +31,8 @@
#include "opal/mca/pmix/pmix1xx/pmix/include/pmix.h"
#include "opal/mca/pmix/pmix1xx/pmix/src/buffer_ops/buffer_ops.h"
typedef struct {
opal_list_item_t super;
opal_jobid_t jobid;
char nspace[PMIX_MAX_NSLEN + 1];
} opal_pmix1_jobid_trkr_t;
static OBJ_CLASS_INSTANCE(opal_pmix1_jobid_trkr_t,
opal_list_item_t,
NULL, NULL);
static pmix_proc_t my_proc;
static char *dbgvalue=NULL;
static opal_list_t jobids;
static bool native_launch = false;
static void myerr(pmix_status_t status,
pmix_proc_t procs[], size_t nprocs,
@ -87,12 +76,11 @@ int pmix1_client_init(void)
opal_process_name_t pname;
pmix_status_t rc;
int dbg;
opal_pmix1_jobid_trkr_t *job;
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
"PMIx_client init");
OBJ_CONSTRUCT(&jobids, opal_list_t);
if (0 < (dbg = opal_output_get_verbosity(opal_pmix_base_framework.framework_output))) {
asprintf(&dbgvalue, "PMIX_DEBUG=%d", dbg);
putenv(dbgvalue);
@ -106,13 +94,20 @@ int pmix1_client_init(void)
if (NULL != getenv(OPAL_MCA_PREFIX"orte_launch")) {
/* if we were launched by the OMPI RTE, then
* the jobid is in a special format - so get it */
native_launch = true;
mca_pmix_pmix1xx_component.native_launch = true;
opal_convert_string_to_jobid(&pname.jobid, my_proc.nspace);
} else {
/* we were launched by someone else, so make the
* jobid just be the hash of the nspace */
OPAL_HASH_STR(my_proc.nspace, pname.jobid);
}
/* insert this into our list of jobids - it will be the
* first, and so we'll check it first */
job = OBJ_NEW(opal_pmix1_jobid_trkr_t);
(void)strncpy(job->nspace, my_proc.nspace, PMIX_MAX_NSLEN);
job->jobid = pname.jobid;
opal_list_append(&mca_pmix_pmix1xx_component.jobids, &job->super);
pname.vpid = my_proc.rank;
opal_proc_set_name(&pname);
@ -134,8 +129,6 @@ int pmix1_client_finalize(void)
rc = PMIx_Finalize();
OPAL_LIST_DESTRUCT(&jobids);
return pmix1_convert_rc(rc);
}
@ -157,7 +150,7 @@ int pmix1_abort(int flag, const char *msg,
pmix_proc_t *parray=NULL;
size_t n, cnt=0;
opal_namelist_t *ptr;
opal_pmix1_jobid_trkr_t *job;
opal_pmix1_jobid_trkr_t *job, *jptr;
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
"PMIx_client abort");
@ -168,20 +161,19 @@ int pmix1_abort(int flag, const char *msg,
PMIX_PROC_CREATE(parray, cnt);
n=0;
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
/* if the jobid is my own, then we can just use
* my namespace */
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
(void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
} else {
/* look thru our list of jobids and find the
* corresponding nspace */
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
if (job->jobid == ptr->name.jobid) {
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
job = NULL;
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (jptr->jobid == ptr->name.jobid) {
job = jptr;
break;
}
}
if (NULL == job) {
return OPAL_ERR_NOT_FOUND;
}
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
parray[n].rank = ptr->name.vpid;
++n;
}
@ -201,23 +193,22 @@ int pmix1_store_local(const opal_process_name_t *proc, opal_value_t *val)
pmix_value_t kv;
pmix_status_t rc;
pmix_proc_t p;
opal_pmix1_jobid_trkr_t *job;
opal_pmix1_jobid_trkr_t *job, *jptr;
if (NULL != proc) {
/* if the jobid is my own, then we can just use
* my namespace */
if (OPAL_PROC_MY_NAME.jobid == proc->jobid) {
(void)strncpy(p.nspace, my_proc.nspace, PMIX_MAX_NSLEN);
} else {
/* look thru our list of jobids and find the
* corresponding nspace */
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
if (job->jobid == proc->jobid) {
(void)strncpy(p.nspace, job->nspace, PMIX_MAX_NSLEN);
job = NULL;
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (jptr->jobid == proc->jobid) {
job = jptr;
break;
}
}
if (NULL == job) {
return OPAL_ERR_NOT_FOUND;
}
(void)strncpy(p.nspace, job->nspace, PMIX_MAX_NSLEN);
p.rank = proc->vpid;
} else {
/* use our name */
@ -259,7 +250,7 @@ int pmix1_fence(opal_list_t *procs, int collect_data)
size_t n, cnt=0;
opal_namelist_t *ptr;
pmix_info_t info, *iptr;
opal_pmix1_jobid_trkr_t *job;
opal_pmix1_jobid_trkr_t *job, *jptr;
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
"PMIx_client fence");
@ -270,20 +261,19 @@ int pmix1_fence(opal_list_t *procs, int collect_data)
PMIX_PROC_CREATE(parray, cnt);
n=0;
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
/* if the jobid is my own, then we can just use
* my namespace */
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
(void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
} else {
/* look thru our list of jobids and find the
* corresponding nspace */
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
if (job->jobid == ptr->name.jobid) {
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
job = NULL;
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (jptr->jobid == ptr->name.jobid) {
job = jptr;
break;
}
}
if (NULL == job) {
return OPAL_ERR_NOT_FOUND;
}
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
parray[n].rank = ptr->name.vpid;
++n;
}
@ -322,7 +312,7 @@ int pmix1_fencenb(opal_list_t *procs, int collect_data,
opal_namelist_t *ptr;
pmix1_opcaddy_t *op;
pmix_info_t info, *iptr;
opal_pmix1_jobid_trkr_t *job;
opal_pmix1_jobid_trkr_t *job, *jptr;
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
"PMIx_client fence_nb");
@ -333,20 +323,19 @@ int pmix1_fencenb(opal_list_t *procs, int collect_data,
PMIX_PROC_CREATE(parray, cnt);
n=0;
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
/* if the jobid is my own, then we can just use
* my namespace */
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
(void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
} else {
/* look thru our list of jobids and find the
* corresponding nspace */
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
if (job->jobid == ptr->name.jobid) {
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
job = NULL;
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (jptr->jobid == ptr->name.jobid) {
job = jptr;
break;
}
}
if (NULL == job) {
return OPAL_ERR_NOT_FOUND;
}
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
parray[n].rank = ptr->name.vpid;
++n;
}
@ -406,7 +395,7 @@ int pmix1_get(const opal_process_name_t *proc, const char *key,
size_t ninfo, n;
pmix_info_t *pinfo;
opal_value_t *ival;
opal_pmix1_jobid_trkr_t *job;
opal_pmix1_jobid_trkr_t *job, *jptr;
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
"%s PMIx_client get on proc %s key %s",
@ -416,20 +405,19 @@ int pmix1_get(const opal_process_name_t *proc, const char *key,
/* prep default response */
*val = NULL;
if (NULL != proc) {
/* if the jobid is my own, then we can just use
* my namespace */
if (OPAL_PROC_MY_NAME.jobid == proc->jobid) {
(void)strncpy(p.nspace, my_proc.nspace, PMIX_MAX_NSLEN);
} else {
/* look thru our list of jobids and find the
* corresponding nspace */
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
if (job->jobid == proc->jobid) {
(void)strncpy(p.nspace, job->nspace, PMIX_MAX_NSLEN);
job = NULL;
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (jptr->jobid == proc->jobid) {
job = jptr;
break;
}
}
if (NULL == job) {
return OPAL_ERR_NOT_FOUND;
}
(void)strncpy(p.nspace, job->nspace, PMIX_MAX_NSLEN);
p.rank = proc->vpid;
pptr = &p;
} else {
@ -509,7 +497,7 @@ int pmix1_getnb(const opal_process_name_t *proc, const char *key,
pmix_status_t rc;
size_t n;
opal_value_t *ival;
opal_pmix1_jobid_trkr_t *job;
opal_pmix1_jobid_trkr_t *job, *jptr;
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
"%s PMIx_client get_nb on proc %s key %s",
@ -522,20 +510,19 @@ int pmix1_getnb(const opal_process_name_t *proc, const char *key,
op->cbdata = cbdata;
if (NULL != proc) {
/* if the jobid is my own, then we can just use
* my namespace */
if (OPAL_PROC_MY_NAME.jobid == proc->jobid) {
(void)strncpy(op->p.nspace, my_proc.nspace, PMIX_MAX_NSLEN);
} else {
/* look thru our list of jobids and find the
* corresponding nspace */
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
if (job->jobid == proc->jobid) {
(void)strncpy(op->p.nspace, job->nspace, PMIX_MAX_NSLEN);
job = NULL;
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (jptr->jobid == proc->jobid) {
job = jptr;
break;
}
}
if (NULL == job) {
return OPAL_ERR_NOT_FOUND;
}
(void)strncpy(op->p.nspace, job->nspace, PMIX_MAX_NSLEN);
op->p.rank = proc->vpid;
} else {
(void)strncpy(op->p.nspace, my_proc.nspace, PMIX_MAX_NSLEN);
@ -640,6 +627,7 @@ int pmix1_lookup(opal_list_t *data, opal_list_t *info)
pmix_status_t ret;
opal_pmix_pdata_t *d;
opal_value_t *iptr;
opal_pmix1_jobid_trkr_t *job, *jptr;
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
"PMIx_client lookup");
@ -676,7 +664,7 @@ int pmix1_lookup(opal_list_t *data, opal_list_t *info)
/* transfer the data back */
n=0;
OPAL_LIST_FOREACH(d, data, opal_pmix_pdata_t) {
if (native_launch) {
if (mca_pmix_pmix1xx_component.native_launch) {
/* if we were launched by the OMPI RTE, then
* the jobid is in a special format - so get it */
opal_convert_string_to_jobid(&d->proc.jobid, pdata[n].proc.nspace);
@ -685,6 +673,20 @@ int pmix1_lookup(opal_list_t *data, opal_list_t *info)
* jobid just be the hash of the nspace */
OPAL_HASH_STR(pdata[n].proc.nspace, d->proc.jobid);
}
/* if we don't already have it, add this to our jobid tracker */
job = NULL;
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (jptr->jobid == d->proc.jobid) {
job = jptr;
break;
}
}
if (NULL == job) {
job = OBJ_NEW(opal_pmix1_jobid_trkr_t);
(void)strncpy(job->nspace, pdata[n].proc.nspace, PMIX_MAX_NSLEN);
job->jobid = d->proc.jobid;
opal_list_append(&mca_pmix_pmix1xx_component.jobids, &job->super);
}
if (PMIX_RANK_WILDCARD == pdata[n].proc.rank) {
d->proc.vpid = OPAL_VPID_WILDCARD;
} else {
@ -712,6 +714,7 @@ static void lk_cbfunc(pmix_status_t status,
opal_list_t results, *r;
int rc;
size_t n;
opal_pmix1_jobid_trkr_t *job, *jptr;
if (NULL == op->lkcbfunc) {
OBJ_RELEASE(op);
@ -724,7 +727,7 @@ static void lk_cbfunc(pmix_status_t status,
for (n=0; n < ndata; n++) {
d = OBJ_NEW(opal_pmix_pdata_t);
opal_list_append(&results, &d->super);
if (native_launch) {
if (mca_pmix_pmix1xx_component.native_launch) {
/* if we were launched by the OMPI RTE, then
* the jobid is in a special format - so get it */
opal_convert_string_to_jobid(&d->proc.jobid, data[n].proc.nspace);
@ -733,6 +736,20 @@ static void lk_cbfunc(pmix_status_t status,
* jobid just be the hash of the nspace */
OPAL_HASH_STR(data[n].proc.nspace, d->proc.jobid);
}
/* if we don't already have it, add this to our jobid tracker */
job = NULL;
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (jptr->jobid == d->proc.jobid) {
job = jptr;
break;
}
}
if (NULL == job) {
job = OBJ_NEW(opal_pmix1_jobid_trkr_t);
(void)strncpy(job->nspace, data[n].proc.nspace, PMIX_MAX_NSLEN);
job->jobid = d->proc.jobid;
opal_list_append(&mca_pmix_pmix1xx_component.jobids, &job->super);
}
if (PMIX_RANK_WILDCARD == data[n].proc.rank) {
d->proc.vpid = OPAL_VPID_WILDCARD;
} else {
@ -898,7 +915,7 @@ int pmix1_spawn(opal_list_t *job_info, opal_list_t *apps, opal_jobid_t *jobid)
ret = PMIx_Spawn(pinfo, ninfo, papps, napps, nspace);
if (PMIX_SUCCESS == ret) {
if (native_launch) {
if (mca_pmix_pmix1xx_component.native_launch) {
/* if we were launched by the OMPI RTE, then
* the jobid is in a special format - so get it */
opal_convert_string_to_jobid(jobid, nspace);
@ -906,12 +923,12 @@ int pmix1_spawn(opal_list_t *job_info, opal_list_t *apps, opal_jobid_t *jobid)
/* we were launched by someone else, so make the
* jobid just be the hash of the nspace */
OPAL_HASH_STR(nspace, *jobid);
}
/* add this to our jobid tracker */
job = OBJ_NEW(opal_pmix1_jobid_trkr_t);
(void)strncpy(job->nspace, nspace, PMIX_MAX_NSLEN);
job->jobid = *jobid;
opal_list_append(&jobids, &job->super);
}
opal_list_append(&mca_pmix_pmix1xx_component.jobids, &job->super);
}
PMIX_APP_FREE(papps, napps);
@ -928,7 +945,7 @@ static void spcbfunc(pmix_status_t status,
rc = pmix1_convert_rc(status);
if (PMIX_SUCCESS == status) {
if (native_launch) {
if (mca_pmix_pmix1xx_component.native_launch) {
/* if we were launched by the OMPI RTE, then
* the jobid is in a special format - so get it */
opal_convert_string_to_jobid(&jobid, nspace);
@ -936,12 +953,12 @@ static void spcbfunc(pmix_status_t status,
/* we were launched by someone else, so make the
* jobid just be the hash of the nspace */
OPAL_HASH_STR(nspace, jobid);
}
/* add this to our jobid tracker */
job = OBJ_NEW(opal_pmix1_jobid_trkr_t);
(void)strncpy(job->nspace, nspace, PMIX_MAX_NSLEN);
job->jobid = jobid;
opal_list_append(&jobids, &job->super);
}
opal_list_append(&mca_pmix_pmix1xx_component.jobids, &job->super);
}
op->spcbfunc(rc, jobid, op->cbdata);
@ -1004,7 +1021,7 @@ int pmix1_connect(opal_list_t *procs)
pmix_proc_t *parray=NULL;
size_t n, cnt=0;
opal_namelist_t *ptr;
opal_pmix1_jobid_trkr_t *job;
opal_pmix1_jobid_trkr_t *job, *jptr;
/* protect against bozo error */
if (NULL == procs || 0 == (cnt = opal_list_get_size(procs))) {
@ -1016,20 +1033,20 @@ int pmix1_connect(opal_list_t *procs)
PMIX_PROC_CREATE(parray, cnt);
n=0;
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
/* if the jobid is my own, then we can just use
* my namespace */
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
(void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
} else {
/* look thru our list of jobids and find the
* corresponding nspace */
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
if (job->jobid == ptr->name.jobid) {
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
job = NULL;
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (jptr->jobid == ptr->name.jobid) {
job = jptr;
break;
}
}
if (NULL == job) {
OPAL_ERROR_LOG(OPAL_ERR_NOT_FOUND);
return OPAL_ERR_NOT_FOUND;
}
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
if (OPAL_VPID_WILDCARD == ptr->name.vpid) {
parray[n].rank = PMIX_RANK_WILDCARD;
} else {
@ -1070,20 +1087,14 @@ int pmix1_connectnb(opal_list_t *procs,
PMIX_PROC_CREATE(op->procs, op->nprocs);
n=0;
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
/* if the jobid is my own, then we can just use
* my namespace */
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
(void)strncpy(op->procs[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
} else {
/* look thru our list of jobids and find the
* corresponding nspace */
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
OPAL_LIST_FOREACH(job, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (job->jobid == ptr->name.jobid) {
(void)strncpy(op->procs[n].nspace, job->nspace, PMIX_MAX_NSLEN);
break;
}
}
}
if (OPAL_VPID_WILDCARD == ptr->name.vpid) {
op->procs[n].rank = PMIX_RANK_WILDCARD;
} else {
@ -1115,20 +1126,14 @@ int pmix1_disconnect(opal_list_t *procs)
PMIX_PROC_CREATE(parray, cnt);
n=0;
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
/* if the jobid is my own, then we can just use
* my namespace */
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
(void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
} else {
/* look thru our list of jobids and find the
* corresponding nspace */
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
OPAL_LIST_FOREACH(job, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (job->jobid == ptr->name.jobid) {
(void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN);
break;
}
}
}
if (OPAL_VPID_WILDCARD == ptr->name.vpid) {
parray[n].rank = PMIX_RANK_WILDCARD;
} else {
@ -1169,20 +1174,14 @@ int pmix1_disconnectnb(opal_list_t *procs,
PMIX_PROC_CREATE(op->procs, op->nprocs);
n=0;
OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
/* if the jobid is my own, then we can just use
* my namespace */
if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) {
(void)strncpy(op->procs[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN);
} else {
/* look thru our list of jobids and find the
* corresponding nspace */
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
OPAL_LIST_FOREACH(job, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (job->jobid == ptr->name.jobid) {
(void)strncpy(op->procs[n].nspace, job->nspace, PMIX_MAX_NSLEN);
break;
}
}
}
if (OPAL_VPID_WILDCARD == ptr->name.vpid) {
op->procs[n].rank = PMIX_RANK_WILDCARD;
} else {
@ -1206,24 +1205,32 @@ int pmix1_resolve_peers(const char *nodename, opal_jobid_t jobid,
opal_namelist_t *nm;
int rc;
pmix_status_t ret;
opal_pmix1_jobid_trkr_t *job, *jptr;
if (OPAL_JOBID_WILDCARD == jobid) {
nspace = NULL;
} else {
nspace = opal_convert_jobid_to_string(jobid);
job = NULL;
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (jptr->jobid == jobid) {
job = jptr;
break;
}
}
if (NULL == job) {
return OPAL_ERR_NOT_FOUND;
}
nspace = job->nspace;
}
ret = PMIx_Resolve_peers(nodename, nspace, &array, &nprocs);
if (NULL != nspace) {
free(nspace);
}
rc = pmix1_convert_rc(ret);
if (NULL != array && 0 < nprocs) {
for (n=0; n < nprocs; n++) {
nm = OBJ_NEW(opal_namelist_t);
opal_list_append(procs, &nm->super);
if (native_launch) {
if (mca_pmix_pmix1xx_component.native_launch) {
/* if we were launched by the OMPI RTE, then
* the jobid is in a special format - so get it */
opal_convert_string_to_jobid(&nm->name.jobid, array[n].nspace);
@ -1232,6 +1239,20 @@ int pmix1_resolve_peers(const char *nodename, opal_jobid_t jobid,
* jobid just be the hash of the nspace */
OPAL_HASH_STR(array[n].nspace, nm->name.jobid);
}
/* if we don't already have it, add this to our jobid tracker */
job = NULL;
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (jptr->jobid == nm->name.jobid) {
job = jptr;
break;
}
}
if (NULL == job) {
job = OBJ_NEW(opal_pmix1_jobid_trkr_t);
(void)strncpy(job->nspace, nspace, PMIX_MAX_NSLEN);
job->jobid = jobid;
opal_list_append(&mca_pmix_pmix1xx_component.jobids, &job->super);
}
nm->name.vpid = array[n].rank;
}
}
@ -1244,29 +1265,25 @@ int pmix1_resolve_nodes(opal_jobid_t jobid, char **nodelist)
{
pmix_status_t ret;
char *nspace=NULL;
opal_pmix1_jobid_trkr_t *job;
opal_pmix1_jobid_trkr_t *job, *jptr;
if (OPAL_JOBID_WILDCARD != jobid) {
/* if the jobid is my own, then we can just use
* my namespace */
if (OPAL_PROC_MY_NAME.jobid == jobid) {
nspace = strdup(my_proc.nspace);
} else {
/* look thru our list of jobids and find the
* corresponding nspace */
OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) {
if (job->jobid == jobid) {
nspace = strdup(job->nspace);
job = NULL;
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
if (jptr->jobid == jobid) {
job = jptr;
break;
}
}
if (NULL == job) {
return OPAL_ERR_NOT_FOUND;
}
nspace = job->nspace;
}
ret = PMIx_Resolve_nodes(nspace, nodelist);
if (NULL != nspace) {
free(nspace);
}
return pmix1_convert_rc(ret);;
}

Просмотреть файл

@ -44,6 +44,8 @@
/* These are functions used by both client and server to
* access common functions in the embedded PMIx library */
static const char *pmix1_get_nspace(opal_jobid_t jobid);
static void pmix1_register_jobid(opal_jobid_t jobid, const char *nspace);
const opal_pmix_base_module_t opal_pmix_pmix1xx_module = {
/* client APIs */
@ -85,9 +87,39 @@ const opal_pmix_base_module_t opal_pmix_pmix1xx_module = {
PMIx_Get_version,
opal_pmix_base_register_handler,
opal_pmix_base_deregister_handler,
pmix1_store_local
pmix1_store_local,
pmix1_get_nspace,
pmix1_register_jobid
};
/*
 * Return the nspace recorded for the given jobid.
 *
 * Walks the component's jobid tracker list and stops at the first
 * entry whose jobid matches.  The returned pointer refers to the
 * tracker's own nspace buffer.  NULL is returned when no entry
 * matches.
 */
static const char *pmix1_get_nspace(opal_jobid_t jobid)
{
    opal_pmix1_jobid_trkr_t *trk;
    const char *found = NULL;

    OPAL_LIST_FOREACH(trk, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
        if (jobid == trk->jobid) {
            found = trk->nspace;
            break;
        }
    }

    return found;
}
/*
 * Record a jobid <-> nspace correlation in the component's tracker
 * list.
 *
 * If an entry for the jobid already exists, the list is left
 * untouched (the existing nspace is NOT overwritten).  Otherwise a new
 * tracker is appended to the end of the list.
 *
 * @param jobid   the jobid to register
 * @param nspace  the corresponding nspace string; at most
 *                PMIX_MAX_NSLEN characters are stored.  A NULL
 *                nspace is ignored.
 */
static void pmix1_register_jobid(opal_jobid_t jobid, const char *nspace)
{
    opal_pmix1_jobid_trkr_t *jptr;

    /* nothing sensible to record without an nspace, and strncpy
     * must not be handed a NULL source */
    if (NULL == nspace) {
        return;
    }

    /* if we already have it, there is nothing to do */
    OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) {
        if (jptr->jobid == jobid) {
            return;
        }
    }

    /* add this to our jobid tracker */
    jptr = OBJ_NEW(opal_pmix1_jobid_trkr_t);
    if (NULL == jptr) {
        /* allocation failed - the interface has no way to report it */
        return;
    }
    (void)strncpy(jptr->nspace, nspace, PMIX_MAX_NSLEN);
    /* strncpy does not terminate the destination when the source is
     * PMIX_MAX_NSLEN characters or longer, so terminate explicitly
     * using the extra byte in the buffer */
    jptr->nspace[PMIX_MAX_NSLEN] = '\0';
    jptr->jobid = jobid;
    opal_list_append(&mca_pmix_pmix1xx_component.jobids, &jptr->super);
}
pmix_status_t pmix1_convert_opalrc(int rc)
{
switch (rc) {
@ -436,6 +468,10 @@ int pmix1_value_unload(opal_value_t *kv,
/**** INSTANTIATE INTERNAL CLASSES ****/
OBJ_CLASS_INSTANCE(opal_pmix1_jobid_trkr_t,
opal_list_item_t,
NULL, NULL);
static void opcon(pmix1_opcaddy_t *p)
{
memset(&p->p, 0, sizeof(pmix_proc_t));

Просмотреть файл

@ -18,6 +18,7 @@
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/class/opal_list.h"
#include "opal/util/proc.h"
#include "opal/mca/pmix/pmix.h"
#include "pmix1.h"
@ -41,7 +42,8 @@ static int pmix1xx_component_query(mca_base_module_t **module, int *priority);
* and pointers to our public functions in it
*/
opal_pmix_base_component_t mca_pmix_pmix1xx_component = {
mca_pmix_pmix1_component_t mca_pmix_pmix1xx_component = {
{
/* First, the mca_component_t struct containing meta information
about the component itself */
@ -68,16 +70,19 @@ opal_pmix_base_component_t mca_pmix_pmix1xx_component = {
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
},
.native_launch = false
};
static int pmix1xx_open(void)
{
OBJ_CONSTRUCT(&mca_pmix_pmix1xx_component.jobids, opal_list_t);
return OPAL_SUCCESS;
}
static int pmix1xx_close(void)
{
OPAL_LIST_DESTRUCT(&mca_pmix_pmix1xx_component.jobids);
return OPAL_SUCCESS;
}

Просмотреть файл

@ -49,6 +49,8 @@ static int s1_job_connect(opal_list_t *procs);
static int s1_job_disconnect(opal_list_t *procs);
static int s1_store_local(const opal_process_name_t *proc,
opal_value_t *val);
static const char *s1_get_nspace(opal_jobid_t jobid);
static void s1_register_jobid(opal_jobid_t jobid, const char *nspace);
const opal_pmix_base_module_t opal_pmix_s1_module = {
s1_init,
@ -89,7 +91,9 @@ const opal_pmix_base_module_t opal_pmix_s1_module = {
NULL,
opal_pmix_base_register_handler,
opal_pmix_base_deregister_handler,
s1_store_local
s1_store_local,
s1_get_nspace,
s1_register_jobid
};
// usage accounting
@ -644,6 +648,14 @@ static int s1_store_local(const opal_process_name_t *proc,
return OPAL_SUCCESS;
}
/*
 * Look up the nspace string associated with a jobid.
 *
 * Stub implementation: the jobid is ignored and NULL is always
 * returned.
 */
static const char *s1_get_nspace(opal_jobid_t jobid)
{
    const char *nspace = NULL;

    (void)jobid;
    return nspace;
}
/*
 * Register a jobid/nspace pair.
 *
 * Stub implementation: both arguments are ignored and nothing is
 * recorded.
 */
static void s1_register_jobid(opal_jobid_t jobid, const char *nspace)
{
    (void)jobid;
    (void)nspace;
}
static char* pmix_error(int pmix_err)
{

Просмотреть файл

@ -56,6 +56,8 @@ static int s2_job_connect(opal_list_t *procs);
static int s2_job_disconnect(opal_list_t *procs);
static int s2_store_local(const opal_process_name_t *proc,
opal_value_t *val);
static const char *s2_get_nspace(opal_jobid_t jobid);
static void s2_register_jobid(opal_jobid_t jobid, const char *nspace);
const opal_pmix_base_module_t opal_pmix_s2_module = {
s2_init,
@ -96,7 +98,9 @@ const opal_pmix_base_module_t opal_pmix_s2_module = {
NULL,
opal_pmix_base_register_handler,
opal_pmix_base_deregister_handler,
s2_store_local
s2_store_local,
s2_get_nspace,
s2_register_jobid
};
// usage accounting
@ -663,6 +667,14 @@ static int s2_store_local(const opal_process_name_t *proc,
return OPAL_SUCCESS;
}
/*
 * Look up the nspace string associated with a jobid.
 *
 * Stub implementation: the jobid is ignored and NULL is always
 * returned.
 */
static const char *s2_get_nspace(opal_jobid_t jobid)
{
    const char *nspace = NULL;

    (void)jobid;
    return nspace;
}
/*
 * Register a jobid/nspace pair.
 *
 * Stub implementation: both arguments are ignored and nothing is
 * recorded.
 */
static void s2_register_jobid(opal_jobid_t jobid, const char *nspace)
{
    (void)jobid;
    (void)nspace;
}
static char* pmix_error(int pmix_err)
{