Finally fix the problem - the key was knowing there were more than 2 topologies involved, and that the HNP is not allocated. Give up on being cute and just search the darned list of topologies - there won't be that many, and if there are (so the scan takes awhile), then too bad.
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
3b29b78a19
Коммит
f47124e4d3
@ -483,6 +483,7 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
|
|||||||
orte_node_t *nptr;
|
orte_node_t *nptr;
|
||||||
int rc;
|
int rc;
|
||||||
uint8_t ui8;
|
uint8_t ui8;
|
||||||
|
orte_topology_t *ortetopo;
|
||||||
|
|
||||||
/* setup the list of results */
|
/* setup the list of results */
|
||||||
OBJ_CONSTRUCT(&slots, opal_list_t);
|
OBJ_CONSTRUCT(&slots, opal_list_t);
|
||||||
@ -515,13 +516,40 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* there is always one topology - our own - so start with it */
|
/* handle the topologies - as the most common case by far
|
||||||
|
* is to have homogeneous topologies, we only send them
|
||||||
|
* if something is different. We know that the HNP is
|
||||||
|
* the first topology, and that any differing topology
|
||||||
|
* on the compute nodes must follow. So send the topologies
|
||||||
|
* if and only if:
|
||||||
|
*
|
||||||
|
* (a) the HNP is being used to house application procs and
|
||||||
|
* there is more than one topology on our list; or
|
||||||
|
*
|
||||||
|
* (b) the HNP is not being used, but there are more than
|
||||||
|
* two topologies on our list, thus indicating that
|
||||||
|
* there are multiple topologies on the compute nodes
|
||||||
|
*/
|
||||||
nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
|
nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
|
||||||
|
if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
|
||||||
|
/* assign a NULL topology so we still account for our presence,
|
||||||
|
* but don't cause us to send topology info when not needed */
|
||||||
|
tp = OBJ_NEW(orte_regex_range_t);
|
||||||
|
tp->t = NULL;
|
||||||
|
tp->cnt = 1;
|
||||||
|
} else {
|
||||||
|
/* there is always one topology - our own - so start with it */
|
||||||
tp = OBJ_NEW(orte_regex_range_t);
|
tp = OBJ_NEW(orte_regex_range_t);
|
||||||
tp->t = nptr->topology;
|
tp->t = nptr->topology;
|
||||||
tp->cnt = 1;
|
tp->cnt = 1;
|
||||||
|
}
|
||||||
opal_list_append(&topos, &tp->super);
|
opal_list_append(&topos, &tp->super);
|
||||||
|
|
||||||
|
opal_output_verbose(5, orte_nidmap_output,
|
||||||
|
"%s STARTING WITH TOPOLOGY FOR NODE %s: %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
nptr->name, (NULL == tp->t) ? "NULL" : tp->t->sig);
|
||||||
|
|
||||||
/* likewise, we have slots */
|
/* likewise, we have slots */
|
||||||
slt = OBJ_NEW(orte_regex_range_t);
|
slt = OBJ_NEW(orte_regex_range_t);
|
||||||
slt->slots = nptr->slots;
|
slt->slots = nptr->slots;
|
||||||
@ -554,22 +582,33 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
|
|||||||
opal_list_append(&slots, &slt->super);
|
opal_list_append(&slots, &slt->super);
|
||||||
}
|
}
|
||||||
/* check the topologies */
|
/* check the topologies */
|
||||||
if (NULL == nptr->topology) {
|
if (NULL != tp->t && NULL == nptr->topology) {
|
||||||
/* we don't know this topology, likely because
|
/* we don't know this topology, likely because
|
||||||
* we don't have a daemon on the node */
|
* we don't have a daemon on the node */
|
||||||
tp = OBJ_NEW(orte_regex_range_t);
|
tp = OBJ_NEW(orte_regex_range_t);
|
||||||
tp->t = NULL;
|
tp->t = NULL;
|
||||||
tp->cnt = 1;
|
tp->cnt = 1;
|
||||||
|
opal_output_verbose(5, orte_nidmap_output,
|
||||||
|
"%s ADD TOPOLOGY FOR NODE %s: NULL",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nptr->name);
|
||||||
opal_list_append(&topos, &tp->super);
|
opal_list_append(&topos, &tp->super);
|
||||||
} else {
|
} else {
|
||||||
/* is this the next in line */
|
/* is this the next in line */
|
||||||
if (tp->t == nptr->topology) {
|
if (tp->t == nptr->topology) {
|
||||||
tp->cnt++;
|
tp->cnt++;
|
||||||
|
opal_output_verbose(5, orte_nidmap_output,
|
||||||
|
"%s CONTINUE TOPOLOGY RANGE (%d) WITH NODE %s: %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
tp->cnt, nptr->name, tp->t->sig);
|
||||||
} else {
|
} else {
|
||||||
/* need to start another range */
|
/* need to start another range */
|
||||||
tp = OBJ_NEW(orte_regex_range_t);
|
tp = OBJ_NEW(orte_regex_range_t);
|
||||||
tp->t = nptr->topology;
|
tp->t = nptr->topology;
|
||||||
tp->cnt = 1;
|
tp->cnt = 1;
|
||||||
|
opal_output_verbose(5, orte_nidmap_output,
|
||||||
|
"%s STARTING NEW TOPOLOGY RANGE WITH NODE %s: %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
nptr->name, tp->t->sig);
|
||||||
opal_list_append(&topos, &tp->super);
|
opal_list_append(&topos, &tp->super);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -645,31 +684,32 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
|
|||||||
free(tmp);
|
free(tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* handle the topologies - as the most common case by far
|
/* don't try to be cute - there aren't going to be that many
|
||||||
* is to have homogeneous topologies, we only send them
|
* topologies, so just scan the list and see if they are the
|
||||||
* if something is different. We know that the HNP is
|
* same, excluding any NULL values */
|
||||||
* the first topology, and that any differing topology
|
ortetopo = NULL;
|
||||||
* on the compute nodes must follow. So send the topologies
|
test = false;
|
||||||
* if and only if:
|
OPAL_LIST_FOREACH(rng, &topos, orte_regex_range_t) {
|
||||||
*
|
if (NULL == rng->t) {
|
||||||
* (a) the HNP is being used to house application procs and
|
continue;
|
||||||
* there is more than one topology on our list; or
|
}
|
||||||
*
|
if (NULL == ortetopo) {
|
||||||
* (b) the HNP is not being used, but there are more than
|
ortetopo = rng->t;
|
||||||
* two topologies on our list, thus indicating that
|
} else if (0 != strcmp(ortetopo->sig, rng->t->sig)) {
|
||||||
* there are multiple topologies on the compute nodes
|
/* we have a difference, so send them */
|
||||||
*/
|
test = true;
|
||||||
if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
|
}
|
||||||
/* remove the first topo on the list */
|
|
||||||
item = opal_list_remove_first(&topos);
|
|
||||||
OBJ_RELEASE(item);
|
|
||||||
}
|
}
|
||||||
tmp = NULL;
|
tmp = NULL;
|
||||||
if (1 < opal_list_get_size(&topos)) {
|
if (test) {
|
||||||
opal_buffer_t bucket, *bptr;
|
opal_buffer_t bucket, *bptr;
|
||||||
OBJ_CONSTRUCT(&bucket, opal_buffer_t);
|
OBJ_CONSTRUCT(&bucket, opal_buffer_t);
|
||||||
while (NULL != (item = opal_list_remove_first(&topos))) {
|
while (NULL != (item = opal_list_remove_first(&topos))) {
|
||||||
rng = (orte_regex_range_t*)item;
|
rng = (orte_regex_range_t*)item;
|
||||||
|
opal_output_verbose(5, orte_nidmap_output,
|
||||||
|
"%s PASSING TOPOLOGY %s RANGE %d",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
(NULL == rng->t) ? "NULL" : rng->t->sig, rng->cnt);
|
||||||
if (NULL == tmp) {
|
if (NULL == tmp) {
|
||||||
asprintf(&tmp, "%d", rng->cnt);
|
asprintf(&tmp, "%d", rng->cnt);
|
||||||
} else {
|
} else {
|
||||||
@ -738,6 +778,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
|
|||||||
}
|
}
|
||||||
OBJ_DESTRUCT(&bucket);
|
OBJ_DESTRUCT(&bucket);
|
||||||
} else {
|
} else {
|
||||||
|
opal_output_verbose(1, orte_nidmap_output,
|
||||||
|
"%s NOT PASSING TOPOLOGIES",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
/* need to pack the NULL just to terminate the region */
|
/* need to pack the NULL just to terminate the region */
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user