Per user request, we allow -host to specify a host that is not included in a hostfile (however, we reject such a host if we were given an allocation by a resource manager). Since we cannot know whether a host given as an IP address refers to the same node that was previously given by name, we have no choice but to assume they are different nodes. In that situation, get the topology from the node attached to daemon vpid 1 (looked up in the job's procs array) instead of from entry 1 of the node pool, so that mpirun can succeed.
Этот коммит содержится в:
родитель
9dbc69df0f
Коммит
6ba76ed8d8
@ -86,15 +86,29 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
|
|||||||
#if OPAL_HAVE_HWLOC
|
#if OPAL_HAVE_HWLOC
|
||||||
{
|
{
|
||||||
hwloc_topology_t t;
|
hwloc_topology_t t;
|
||||||
|
orte_job_t *jdata;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
|
orte_proc_t *dmn1;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
/* if we got back topology info from the first node, then we use
|
/* if we got back topology info from the first node, then we use
|
||||||
* it as the "standard" for all other nodes unless they sent
|
* it as the "standard" for all other nodes unless they sent
|
||||||
* back their own topology */
|
* back their own topology */
|
||||||
if (1 < orte_process_info.num_procs) {
|
if (1 < orte_process_info.num_procs) {
|
||||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 1)) ||
|
/* find daemon.vpid = 1 */
|
||||||
|
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||||
|
if (NULL == (dmn1 = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 1))) {
|
||||||
|
/* something is wrong */
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
|
ORTE_FORCED_TERMINATE(ORTE_ERR_NOT_FOUND);
|
||||||
|
OBJ_RELEASE(caddy);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (NULL == (node = dmn1->node) ||
|
||||||
NULL == (t = node->topology)) {
|
NULL == (t = node->topology)) {
|
||||||
|
opal_output(0, "NODE %s TOPO %s",
|
||||||
|
(NULL == node) ? "NULL" : node->name,
|
||||||
|
(NULL == node) ? "N/A" : ((NULL == t) ? "NULL-T" : "NON-NULL-T"));
|
||||||
/* something is wrong */
|
/* something is wrong */
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
ORTE_FORCED_TERMINATE(ORTE_ERR_NOT_FOUND);
|
ORTE_FORCED_TERMINATE(ORTE_ERR_NOT_FOUND);
|
||||||
@ -104,7 +118,7 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
|
|||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||||
"%s plm:base:setting topo to that from node %s",
|
"%s plm:base:setting topo to that from node %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
|
||||||
for (i=2; i < orte_node_pool->size; i++) {
|
for (i=1; i < orte_node_pool->size; i++) {
|
||||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -756,7 +770,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
|||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||||
"%s plm:base:orted_report_launch from daemon %s on node %s",
|
"%s plm:base:orted_report_launch from daemon %s on node %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&dname), nodename));
|
ORTE_NAME_PRINT(&daemon->name), nodename));
|
||||||
|
|
||||||
/* look this node up, if necessary */
|
/* look this node up, if necessary */
|
||||||
if (!orte_plm_globals.daemon_nodes_assigned_at_launch) {
|
if (!orte_plm_globals.daemon_nodes_assigned_at_launch) {
|
||||||
@ -888,8 +902,8 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
|||||||
if (1 == dname.vpid || orte_hetero_nodes) {
|
if (1 == dname.vpid || orte_hetero_nodes) {
|
||||||
/* the user has told us that something is different, so just store it */
|
/* the user has told us that something is different, so just store it */
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||||
"%s ADDING TOPOLOGY PER USER REQUEST",
|
"%s ADDING TOPOLOGY PER USER REQUEST TO NODE %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
|
||||||
t = OBJ_NEW(orte_topology_t);
|
t = OBJ_NEW(orte_topology_t);
|
||||||
/* filter the topology as we'll need it that way later */
|
/* filter the topology as we'll need it that way later */
|
||||||
opal_hwloc_base_filter_cpus(topo);
|
opal_hwloc_base_filter_cpus(topo);
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user