Restore passing the complete job map to the local proc on the first get_attr call so that the MPI layer can use the info without repeated calls back to the server. We'll find a more memory-efficient method later.
Этот коммит содержится в:
родитель
9e6b157cb6
Коммит
9658256a98
@ -1027,7 +1027,7 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
|
||||
uint32_t i, myrank;
|
||||
opal_process_name_t id;
|
||||
char *cpuset;
|
||||
opal_buffer_t buf;
|
||||
opal_buffer_t buf, buf2;
|
||||
|
||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||
"%s pmix:native get_attr called",
|
||||
@ -1166,7 +1166,95 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
|
||||
}
|
||||
cnt=1;
|
||||
continue;
|
||||
} else if (0 == strcmp(PMIX_PROC_MAP, kp->key)) {
|
||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||
"%s received proc map",
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
|
||||
/* transfer the byte object for unpacking */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
opal_dss.load(&buf, kp->data.bo.bytes, kp->data.bo.size);
|
||||
kp->data.bo.bytes = NULL; // protect the data region
|
||||
kp->data.bo.size = 0;
|
||||
OBJ_RELEASE(kp);
|
||||
/* get the jobid */
|
||||
cnt=1;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) {
|
||||
OPAL_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
cnt = 1;
|
||||
return false;
|
||||
}
|
||||
if (0 != strcmp(PMIX_JOBID, kp->key)) {
|
||||
OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
OBJ_RELEASE(kp);
|
||||
cnt = 1;
|
||||
return false;
|
||||
}
|
||||
id.jobid = kp->data.uint32;
|
||||
OBJ_RELEASE(kp);
|
||||
/* unpack the data for each rank */
|
||||
cnt=1;
|
||||
while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) {
|
||||
if (0 != strcmp(PMIX_RANK, kp->key)) {
|
||||
OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
OBJ_RELEASE(kp);
|
||||
cnt = 1;
|
||||
return false;
|
||||
}
|
||||
id.vpid = kp->data.uint32;
|
||||
/* unpack the blob for this rank */
|
||||
cnt=1;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) {
|
||||
OPAL_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
cnt = 1;
|
||||
return false;
|
||||
}
|
||||
if (0 != strcmp(PMIX_PROC_MAP, kp->key)) {
|
||||
OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
OBJ_RELEASE(kp);
|
||||
cnt = 1;
|
||||
return false;
|
||||
}
|
||||
/* transfer the byte object for unpacking */
|
||||
OBJ_CONSTRUCT(&buf2, opal_buffer_t);
|
||||
opal_dss.load(&buf2, kp->data.bo.bytes, kp->data.bo.size);
|
||||
kp->data.bo.bytes = NULL; // protect the data region
|
||||
kp->data.bo.size = 0;
|
||||
OBJ_RELEASE(kp);
|
||||
/* unpack and store the map */
|
||||
cnt=1;
|
||||
while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf2, &kp, &cnt, OPAL_VALUE))) {
|
||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||
"%s storing key %s for peer %s",
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||
kp->key, OPAL_NAME_PRINT(id));
|
||||
if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &id, kp))) {
|
||||
OPAL_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(kp);
|
||||
OBJ_DESTRUCT(&buf2);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&buf2);
|
||||
if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
||||
OPAL_ERROR_LOG(rc);
|
||||
return false;
|
||||
}
|
||||
cnt=1;
|
||||
}
|
||||
OBJ_DESTRUCT(&buf);
|
||||
if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
||||
OPAL_ERROR_LOG(rc);
|
||||
return false;
|
||||
}
|
||||
cnt=1;
|
||||
continue;
|
||||
}
|
||||
/* otherwise, it is a single piece of info, so store it */
|
||||
if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &OPAL_PROC_MY_NAME, kp))) {
|
||||
OPAL_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(kp);
|
||||
@ -1216,6 +1304,8 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
|
||||
|
||||
/* baseline all the procs as nonlocal */
|
||||
myrank = native_pname.vpid;
|
||||
id.jobid = native_pname.jobid;
|
||||
|
||||
/* we only need to set locality for each local rank as "not found"
|
||||
* equates to "non local" */
|
||||
ranks = opal_argv_split(lclpeers->data.string, ',');
|
||||
@ -1224,15 +1314,15 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
|
||||
if (myrank == vid) {
|
||||
continue;
|
||||
}
|
||||
native_pname.vpid = vid;
|
||||
id.vpid = vid;
|
||||
#if OPAL_HAVE_HWLOC
|
||||
OBJ_CONSTRUCT(&vals, opal_list_t);
|
||||
if (OPAL_SUCCESS != (rc = opal_dstore.fetch(opal_dstore_internal, &native_pname,
|
||||
if (OPAL_SUCCESS != (rc = opal_dstore.fetch(opal_dstore_internal, &id,
|
||||
OPAL_DSTORE_CPUSET, &vals))) {
|
||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||
"%s cpuset for local proc %s not found",
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||
OPAL_NAME_PRINT(native_pname));
|
||||
OPAL_NAME_PRINT(id));
|
||||
OPAL_LIST_DESTRUCT(&vals);
|
||||
/* even though the cpuset wasn't found, we at least know it is
|
||||
* on the same node with us */
|
||||
@ -1259,14 +1349,14 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
|
||||
OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
|
||||
"%s pmix:native proc %s locality %s",
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||
OPAL_NAME_PRINT(native_pname),
|
||||
OPAL_NAME_PRINT(id),
|
||||
opal_hwloc_base_print_locality(locality)));
|
||||
|
||||
OBJ_CONSTRUCT(&kvn, opal_value_t);
|
||||
kvn.key = strdup(OPAL_DSTORE_LOCALITY);
|
||||
kvn.type = OPAL_UINT16;
|
||||
kvn.data.uint16 = locality;
|
||||
(void)opal_dstore.store(opal_dstore_internal, &native_pname, &kvn);
|
||||
(void)opal_dstore.store(opal_dstore_internal, &id, &kvn);
|
||||
OBJ_DESTRUCT(&kvn);
|
||||
}
|
||||
opal_argv_free(ranks);
|
||||
|
@ -333,7 +333,7 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_
|
||||
int i;
|
||||
char **list;
|
||||
orte_process_name_t name;
|
||||
opal_buffer_t buf;
|
||||
opal_buffer_t buf, buf2;
|
||||
|
||||
/* convenience def */
|
||||
node = proc->node;
|
||||
@ -388,17 +388,6 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
/* appnum */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_APPNUM);
|
||||
kv.type = OPAL_UINT32;
|
||||
kv.data.uint32 = proc->app_idx;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
/* rank */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_RANK);
|
||||
@ -409,29 +398,6 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
/* global rank */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_GLOBAL_RANK);
|
||||
kv.type = OPAL_UINT32;
|
||||
kv.data.uint32 = proc->name.vpid + jdata->offset;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
/* app rank */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_APP_RANK);
|
||||
kv.type = OPAL_UINT32;
|
||||
kv.data.uint32 = proc->app_rank;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
/* offset */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_NPROC_OFFSET);
|
||||
@ -443,12 +409,75 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
/* pass a blob - for each proc in this job, include the info describing
|
||||
* it so the recipient has a complete picture */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
/* jobid, for simplicity when unpacking */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_JOBID);
|
||||
kv.type = OPAL_UINT32;
|
||||
kv.data.uint32 = proc->name.jobid;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &kp, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return rc;
|
||||
}
|
||||
for (i=0; i < jdata->procs->size; i++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
/* rank */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_RANK);
|
||||
kv.type = OPAL_UINT32;
|
||||
kv.data.uint32 = pptr->name.vpid;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &kp, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
/* create the buffer for this rank */
|
||||
OBJ_CONSTRUCT(&buf2, opal_buffer_t);
|
||||
/* appnum */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_APPNUM);
|
||||
kv.type = OPAL_UINT32;
|
||||
kv.data.uint32 = pptr->app_idx;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
/* global rank */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_GLOBAL_RANK);
|
||||
kv.type = OPAL_UINT32;
|
||||
kv.data.uint32 = pptr->name.vpid + jdata->offset;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
/* app rank */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_APP_RANK);
|
||||
kv.type = OPAL_UINT32;
|
||||
kv.data.uint32 = pptr->app_rank;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
/* local rank */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_LOCAL_RANK);
|
||||
kv.type = OPAL_UINT16;
|
||||
kv.data.uint16 = proc->local_rank;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) {
|
||||
kv.data.uint16 = pptr->local_rank;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return rc;
|
||||
@ -458,7 +487,32 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_NODE_RANK);
|
||||
kv.type = OPAL_UINT16;
|
||||
kv.data.uint16 = proc->node_rank;
|
||||
kv.data.uint16 = pptr->node_rank;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
/* add the rank's blob */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_PROC_MAP);
|
||||
kv.type = OPAL_BYTE_OBJECT;
|
||||
opal_dss.unload(&buf2, (void**)&kv.data.bo.bytes, &kv.data.bo.size);
|
||||
OBJ_DESTRUCT(&buf2);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &kp, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
}
|
||||
/* now pass the blob as the proc-map key */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_PROC_MAP);
|
||||
kv.type = OPAL_BYTE_OBJECT;
|
||||
opal_dss.unload(&buf, (void**)&kv.data.bo.bytes, &kv.data.bo.size);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user