Restore the passing of the complete job map to the local proc on the first get_attr call so the info can be used by the MPI layer without continual calls back to the server. We'll find a more memory-efficient method later.
Этот коммит содержится в:
родитель
9e6b157cb6
Коммит
9658256a98
@ -1027,7 +1027,7 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
|
|||||||
uint32_t i, myrank;
|
uint32_t i, myrank;
|
||||||
opal_process_name_t id;
|
opal_process_name_t id;
|
||||||
char *cpuset;
|
char *cpuset;
|
||||||
opal_buffer_t buf;
|
opal_buffer_t buf, buf2;
|
||||||
|
|
||||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||||
"%s pmix:native get_attr called",
|
"%s pmix:native get_attr called",
|
||||||
@ -1166,7 +1166,95 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
|
|||||||
}
|
}
|
||||||
cnt=1;
|
cnt=1;
|
||||||
continue;
|
continue;
|
||||||
|
} else if (0 == strcmp(PMIX_PROC_MAP, kp->key)) {
|
||||||
|
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||||
|
"%s received proc map",
|
||||||
|
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
|
||||||
|
/* transfer the byte object for unpacking */
|
||||||
|
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||||
|
opal_dss.load(&buf, kp->data.bo.bytes, kp->data.bo.size);
|
||||||
|
kp->data.bo.bytes = NULL; // protect the data region
|
||||||
|
kp->data.bo.size = 0;
|
||||||
|
OBJ_RELEASE(kp);
|
||||||
|
/* get the jobid */
|
||||||
|
cnt=1;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) {
|
||||||
|
OPAL_ERROR_LOG(rc);
|
||||||
|
OBJ_DESTRUCT(&buf);
|
||||||
|
cnt = 1;
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
if (0 != strcmp(PMIX_JOBID, kp->key)) {
|
||||||
|
OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
|
||||||
|
OBJ_DESTRUCT(&buf);
|
||||||
|
OBJ_RELEASE(kp);
|
||||||
|
cnt = 1;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
id.jobid = kp->data.uint32;
|
||||||
|
OBJ_RELEASE(kp);
|
||||||
|
/* unpack the data for each rank */
|
||||||
|
cnt=1;
|
||||||
|
while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) {
|
||||||
|
if (0 != strcmp(PMIX_RANK, kp->key)) {
|
||||||
|
OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
|
||||||
|
OBJ_DESTRUCT(&buf);
|
||||||
|
OBJ_RELEASE(kp);
|
||||||
|
cnt = 1;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
id.vpid = kp->data.uint32;
|
||||||
|
/* unpack the blob for this rank */
|
||||||
|
cnt=1;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) {
|
||||||
|
OPAL_ERROR_LOG(rc);
|
||||||
|
OBJ_DESTRUCT(&buf);
|
||||||
|
cnt = 1;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (0 != strcmp(PMIX_PROC_MAP, kp->key)) {
|
||||||
|
OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
|
||||||
|
OBJ_DESTRUCT(&buf);
|
||||||
|
OBJ_RELEASE(kp);
|
||||||
|
cnt = 1;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
/* transfer the byte object for unpacking */
|
||||||
|
OBJ_CONSTRUCT(&buf2, opal_buffer_t);
|
||||||
|
opal_dss.load(&buf2, kp->data.bo.bytes, kp->data.bo.size);
|
||||||
|
kp->data.bo.bytes = NULL; // protect the data region
|
||||||
|
kp->data.bo.size = 0;
|
||||||
|
OBJ_RELEASE(kp);
|
||||||
|
/* unpack and store the map */
|
||||||
|
cnt=1;
|
||||||
|
while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf2, &kp, &cnt, OPAL_VALUE))) {
|
||||||
|
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||||
|
"%s storing key %s for peer %s",
|
||||||
|
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||||
|
kp->key, OPAL_NAME_PRINT(id));
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &id, kp))) {
|
||||||
|
OPAL_ERROR_LOG(rc);
|
||||||
|
OBJ_RELEASE(kp);
|
||||||
|
OBJ_DESTRUCT(&buf2);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&buf2);
|
||||||
|
if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
||||||
|
OPAL_ERROR_LOG(rc);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
cnt=1;
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&buf);
|
||||||
|
if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
||||||
|
OPAL_ERROR_LOG(rc);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
cnt=1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/* otherwise, it is a single piece of info, so store it */
|
||||||
if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &OPAL_PROC_MY_NAME, kp))) {
|
if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &OPAL_PROC_MY_NAME, kp))) {
|
||||||
OPAL_ERROR_LOG(rc);
|
OPAL_ERROR_LOG(rc);
|
||||||
OBJ_RELEASE(kp);
|
OBJ_RELEASE(kp);
|
||||||
@ -1216,6 +1304,8 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
|
|||||||
|
|
||||||
/* baseline all the procs as nonlocal */
|
/* baseline all the procs as nonlocal */
|
||||||
myrank = native_pname.vpid;
|
myrank = native_pname.vpid;
|
||||||
|
id.jobid = native_pname.jobid;
|
||||||
|
|
||||||
/* we only need to set locality for each local rank as "not found"
|
/* we only need to set locality for each local rank as "not found"
|
||||||
* equates to "non local" */
|
* equates to "non local" */
|
||||||
ranks = opal_argv_split(lclpeers->data.string, ',');
|
ranks = opal_argv_split(lclpeers->data.string, ',');
|
||||||
@ -1224,15 +1314,15 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
|
|||||||
if (myrank == vid) {
|
if (myrank == vid) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
native_pname.vpid = vid;
|
id.vpid = vid;
|
||||||
#if OPAL_HAVE_HWLOC
|
#if OPAL_HAVE_HWLOC
|
||||||
OBJ_CONSTRUCT(&vals, opal_list_t);
|
OBJ_CONSTRUCT(&vals, opal_list_t);
|
||||||
if (OPAL_SUCCESS != (rc = opal_dstore.fetch(opal_dstore_internal, &native_pname,
|
if (OPAL_SUCCESS != (rc = opal_dstore.fetch(opal_dstore_internal, &id,
|
||||||
OPAL_DSTORE_CPUSET, &vals))) {
|
OPAL_DSTORE_CPUSET, &vals))) {
|
||||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||||
"%s cpuset for local proc %s not found",
|
"%s cpuset for local proc %s not found",
|
||||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||||
OPAL_NAME_PRINT(native_pname));
|
OPAL_NAME_PRINT(id));
|
||||||
OPAL_LIST_DESTRUCT(&vals);
|
OPAL_LIST_DESTRUCT(&vals);
|
||||||
/* even though the cpuset wasn't found, we at least know it is
|
/* even though the cpuset wasn't found, we at least know it is
|
||||||
* on the same node with us */
|
* on the same node with us */
|
||||||
@ -1259,14 +1349,14 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
|
|||||||
OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
|
||||||
"%s pmix:native proc %s locality %s",
|
"%s pmix:native proc %s locality %s",
|
||||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||||
OPAL_NAME_PRINT(native_pname),
|
OPAL_NAME_PRINT(id),
|
||||||
opal_hwloc_base_print_locality(locality)));
|
opal_hwloc_base_print_locality(locality)));
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&kvn, opal_value_t);
|
OBJ_CONSTRUCT(&kvn, opal_value_t);
|
||||||
kvn.key = strdup(OPAL_DSTORE_LOCALITY);
|
kvn.key = strdup(OPAL_DSTORE_LOCALITY);
|
||||||
kvn.type = OPAL_UINT16;
|
kvn.type = OPAL_UINT16;
|
||||||
kvn.data.uint16 = locality;
|
kvn.data.uint16 = locality;
|
||||||
(void)opal_dstore.store(opal_dstore_internal, &native_pname, &kvn);
|
(void)opal_dstore.store(opal_dstore_internal, &id, &kvn);
|
||||||
OBJ_DESTRUCT(&kvn);
|
OBJ_DESTRUCT(&kvn);
|
||||||
}
|
}
|
||||||
opal_argv_free(ranks);
|
opal_argv_free(ranks);
|
||||||
|
@ -333,7 +333,7 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_
|
|||||||
int i;
|
int i;
|
||||||
char **list;
|
char **list;
|
||||||
orte_process_name_t name;
|
orte_process_name_t name;
|
||||||
opal_buffer_t buf;
|
opal_buffer_t buf, buf2;
|
||||||
|
|
||||||
/* convenience def */
|
/* convenience def */
|
||||||
node = proc->node;
|
node = proc->node;
|
||||||
@ -388,17 +388,6 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
OBJ_DESTRUCT(&kv);
|
OBJ_DESTRUCT(&kv);
|
||||||
/* appnum */
|
|
||||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
|
||||||
kv.key = strdup(PMIX_APPNUM);
|
|
||||||
kv.type = OPAL_UINT32;
|
|
||||||
kv.data.uint32 = proc->app_idx;
|
|
||||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_DESTRUCT(&kv);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
OBJ_DESTRUCT(&kv);
|
|
||||||
/* rank */
|
/* rank */
|
||||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||||
kv.key = strdup(PMIX_RANK);
|
kv.key = strdup(PMIX_RANK);
|
||||||
@ -409,29 +398,6 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_
|
|||||||
OBJ_DESTRUCT(&kv);
|
OBJ_DESTRUCT(&kv);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
OBJ_DESTRUCT(&kv);
|
|
||||||
/* global rank */
|
|
||||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
|
||||||
kv.key = strdup(PMIX_GLOBAL_RANK);
|
|
||||||
kv.type = OPAL_UINT32;
|
|
||||||
kv.data.uint32 = proc->name.vpid + jdata->offset;
|
|
||||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_DESTRUCT(&kv);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
OBJ_DESTRUCT(&kv);
|
|
||||||
/* app rank */
|
|
||||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
|
||||||
kv.key = strdup(PMIX_APP_RANK);
|
|
||||||
kv.type = OPAL_UINT32;
|
|
||||||
kv.data.uint32 = proc->app_rank;
|
|
||||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_DESTRUCT(&kv);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
OBJ_DESTRUCT(&kv);
|
|
||||||
/* offset */
|
/* offset */
|
||||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||||
kv.key = strdup(PMIX_NPROC_OFFSET);
|
kv.key = strdup(PMIX_NPROC_OFFSET);
|
||||||
@ -443,12 +409,75 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
OBJ_DESTRUCT(&kv);
|
OBJ_DESTRUCT(&kv);
|
||||||
|
/* pass a blob - for each proc in this job, include the info describing
|
||||||
|
* it so the recipient has a complete picture */
|
||||||
|
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||||
|
/* jobid, for simplicity when unpacking */
|
||||||
|
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||||
|
kv.key = strdup(PMIX_JOBID);
|
||||||
|
kv.type = OPAL_UINT32;
|
||||||
|
kv.data.uint32 = proc->name.jobid;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &kp, 1, OPAL_VALUE))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_DESTRUCT(&kv);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
for (i=0; i < jdata->procs->size; i++) {
|
||||||
|
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/* rank */
|
||||||
|
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||||
|
kv.key = strdup(PMIX_RANK);
|
||||||
|
kv.type = OPAL_UINT32;
|
||||||
|
kv.data.uint32 = pptr->name.vpid;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &kp, 1, OPAL_VALUE))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_DESTRUCT(&kv);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&kv);
|
||||||
|
/* create the buffer for this rank */
|
||||||
|
OBJ_CONSTRUCT(&buf2, opal_buffer_t);
|
||||||
|
/* appnum */
|
||||||
|
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||||
|
kv.key = strdup(PMIX_APPNUM);
|
||||||
|
kv.type = OPAL_UINT32;
|
||||||
|
kv.data.uint32 = pptr->app_idx;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_DESTRUCT(&kv);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&kv);
|
||||||
|
/* global rank */
|
||||||
|
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||||
|
kv.key = strdup(PMIX_GLOBAL_RANK);
|
||||||
|
kv.type = OPAL_UINT32;
|
||||||
|
kv.data.uint32 = pptr->name.vpid + jdata->offset;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_DESTRUCT(&kv);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&kv);
|
||||||
|
/* app rank */
|
||||||
|
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||||
|
kv.key = strdup(PMIX_APP_RANK);
|
||||||
|
kv.type = OPAL_UINT32;
|
||||||
|
kv.data.uint32 = pptr->app_rank;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_DESTRUCT(&kv);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&kv);
|
||||||
/* local rank */
|
/* local rank */
|
||||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||||
kv.key = strdup(PMIX_LOCAL_RANK);
|
kv.key = strdup(PMIX_LOCAL_RANK);
|
||||||
kv.type = OPAL_UINT16;
|
kv.type = OPAL_UINT16;
|
||||||
kv.data.uint16 = proc->local_rank;
|
kv.data.uint16 = pptr->local_rank;
|
||||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) {
|
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
OBJ_DESTRUCT(&kv);
|
OBJ_DESTRUCT(&kv);
|
||||||
return rc;
|
return rc;
|
||||||
@ -458,7 +487,32 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_
|
|||||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||||
kv.key = strdup(PMIX_NODE_RANK);
|
kv.key = strdup(PMIX_NODE_RANK);
|
||||||
kv.type = OPAL_UINT16;
|
kv.type = OPAL_UINT16;
|
||||||
kv.data.uint16 = proc->node_rank;
|
kv.data.uint16 = pptr->node_rank;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_DESTRUCT(&kv);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&kv);
|
||||||
|
/* add the rank's blob */
|
||||||
|
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||||
|
kv.key = strdup(PMIX_PROC_MAP);
|
||||||
|
kv.type = OPAL_BYTE_OBJECT;
|
||||||
|
opal_dss.unload(&buf2, (void**)&kv.data.bo.bytes, &kv.data.bo.size);
|
||||||
|
OBJ_DESTRUCT(&buf2);
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &kp, 1, OPAL_VALUE))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_DESTRUCT(&kv);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&kv);
|
||||||
|
}
|
||||||
|
/* now pass the blob as the proc-map key */
|
||||||
|
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||||
|
kv.key = strdup(PMIX_PROC_MAP);
|
||||||
|
kv.type = OPAL_BYTE_OBJECT;
|
||||||
|
opal_dss.unload(&buf, (void**)&kv.data.bo.bytes, &kv.data.bo.size);
|
||||||
|
OBJ_DESTRUCT(&buf);
|
||||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) {
|
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
OBJ_DESTRUCT(&kv);
|
OBJ_DESTRUCT(&kv);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user