1
1

Have the rank=1 daemon always send its topology back, as this is the most common use case

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-01-25 09:33:11 -08:00
родитель 230bbc597d
Коммит 2f4e87eae9
2 изменённых файла: 46 добавлений и 20 удалений

Просмотреть файл

@@ -991,6 +991,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
opal_buffer_t *relay;
char *sig;
orte_topology_t *t;
hwloc_topology_t topo;
int i;
bool found;
orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_REPORT_TOPOLOGY_CMD;
@@ -1126,6 +1127,18 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s RECEIVED TOPOLOGY SIG %s FROM NODE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), sig, nodename));
/* rank=1 always sends its topology back */
topo = NULL;
if (1 == sender->vpid) {
idx=1;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
}
/* do we already have this topology from some other node? */
found = false;
for (i=0; i < orte_node_topologies->size; i++) {
@@ -1139,6 +1152,9 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
found = true;
node->topology = t;
if (NULL != topo) {
hwloc_topology_destroy(topo);
}
free(sig);
break;
}
@@ -1152,27 +1168,31 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
t->sig = sig;
opal_pointer_array_add(orte_node_topologies, t);
node->topology = t;
/* construct the request */
relay = OBJ_NEW(opal_buffer_t);
if (OPAL_SUCCESS != (rc = opal_dss.pack(relay, &cmd, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(relay);
orted_failed_launch = true;
goto CLEANUP;
if (NULL != topo) {
t->topo = topo;
} else {
/* construct the request */
relay = OBJ_NEW(opal_buffer_t);
if (OPAL_SUCCESS != (rc = opal_dss.pack(relay, &cmd, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(relay);
orted_failed_launch = true;
goto CLEANUP;
}
/* send it */
orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, relay,
ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL);
/* we will count this node as completed
* when we get the full topology back */
if (NULL != nodename) {
free(nodename);
nodename = NULL;
}
idx = 1;
continue;
}
/* send it */
orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, relay,
ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL);
/* we will count this node as completed
* when we get the full topology back */
if (NULL != nodename) {
free(nodename);
nodename = NULL;
}
idx = 1;
continue;
}
CLEANUP:

Просмотреть файл

@@ -759,6 +759,12 @@ int orte_daemon(int argc, char *argv[])
ORTE_ERROR_LOG(ret);
}
/* if we are rank=1, then send our topology back - otherwise, mpirun
* will request it if necessary */
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
ORTE_ERROR_LOG(ret);
}
/* send to the HNP's callback - will be routed if routes are available */
if (0 > (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
ORTE_PROC_MY_HNP, buffer,