1
1

Per user request, we allow -host to specify a host that is not included in a hostfile (however, we reject it if we were given an allocation by a resource manager). Since we cannot know whether a host given in IP address form refers to the same node that was previously given by name, we have no choice but to assume they are different nodes. In that situation, get the topology from the right place (the node attached to the daemon with vpid 1, rather than whatever node sits at index 1 of the node pool) so mpirun can succeed.

Этот коммит содержится в:
Ralph Castain 2015-03-25 06:16:01 -07:00
родитель 9dbc69df0f
Коммит 6ba76ed8d8

Просмотреть файл

@ -86,15 +86,29 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC
{ {
hwloc_topology_t t; hwloc_topology_t t;
orte_job_t *jdata;
orte_node_t *node; orte_node_t *node;
orte_proc_t *dmn1;
int i; int i;
/* if we got back topology info from the first node, then we use /* if we got back topology info from the first node, then we use
* it as the "standard" for all other nodes unless they sent * it as the "standard" for all other nodes unless they sent
* back their own topology */ * back their own topology */
if (1 < orte_process_info.num_procs) { if (1 < orte_process_info.num_procs) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 1)) || /* find daemon.vpid = 1 */
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (NULL == (dmn1 = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 1))) {
/* something is wrong */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
ORTE_FORCED_TERMINATE(ORTE_ERR_NOT_FOUND);
OBJ_RELEASE(caddy);
return;
}
if (NULL == (node = dmn1->node) ||
NULL == (t = node->topology)) { NULL == (t = node->topology)) {
opal_output(0, "NODE %s TOPO %s",
(NULL == node) ? "NULL" : node->name,
(NULL == node) ? "N/A" : ((NULL == t) ? "NULL-T" : "NON-NULL-T"));
/* something is wrong */ /* something is wrong */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
ORTE_FORCED_TERMINATE(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERR_NOT_FOUND);
@ -104,7 +118,7 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:setting topo to that from node %s", "%s plm:base:setting topo to that from node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
for (i=2; i < orte_node_pool->size; i++) { for (i=1; i < orte_node_pool->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue; continue;
} }
@ -756,7 +770,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:orted_report_launch from daemon %s on node %s", "%s plm:base:orted_report_launch from daemon %s on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&dname), nodename)); ORTE_NAME_PRINT(&daemon->name), nodename));
/* look this node up, if necessary */ /* look this node up, if necessary */
if (!orte_plm_globals.daemon_nodes_assigned_at_launch) { if (!orte_plm_globals.daemon_nodes_assigned_at_launch) {
@ -888,8 +902,8 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
if (1 == dname.vpid || orte_hetero_nodes) { if (1 == dname.vpid || orte_hetero_nodes) {
/* the user has told us that something is different, so just store it */ /* the user has told us that something is different, so just store it */
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s ADDING TOPOLOGY PER USER REQUEST", "%s ADDING TOPOLOGY PER USER REQUEST TO NODE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
t = OBJ_NEW(orte_topology_t); t = OBJ_NEW(orte_topology_t);
/* filter the topology as we'll need it that way later */ /* filter the topology as we'll need it that way later */
opal_hwloc_base_filter_cpus(topo); opal_hwloc_base_filter_cpus(topo);