Repair the tree spawn. The problem seems to come from the fact
that the HNP now sends its messages using the routed component. In the case of tree spawn, when an intermediary node spawns a child, it doesn't know how to forward a message to it, so when the node-map message arrives from the HNP (as there is nothing yet in the contact/routing table) the message is sent back the way it came. As a result, the node-map message keeps bouncing between the HNP and the first-level orteds. The solution is to add a new option for the children, orte_parent_uri, which is only set when the orted is _not_ directly spawned by the HNP. When this option is present in the argument list, the orted will add the parent to its routing table and force the parent to update its routes (by sending the URI). With this approach, the routing tree is built at the same time as the processes are spawned, and all messages from the HNP can be routed to the leaves. However, this is far from an optimal solution. Right now, this so-called tree spawn only spawns the children in a tree, without doing anything about the "connect back to the HNP" step. The HNP is flooded with reports from all the orteds. The total number of messages is higher than in the non-tree startup scheme, so we do not expect this approach to be scalable in its current incarnation. A complete overhaul of the tree startup is required in order to improve the scalability. Stay tuned! This commit was SVN r21504.
Этот коммит содержится в:
родитель
6a00481285
Коммит
addaf7aaf8
@ -2,13 +2,15 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Institut National de Recherche en Informatique
|
||||
* et Automatique. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -172,6 +174,12 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* initialize the nidmaps */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
/* if we are using static ports, then we need to setup
|
||||
* the daemon info so the RML can function properly
|
||||
* without requiring a wireup stage. This must be done
|
||||
@ -179,12 +187,6 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
* own port, which we need in order to construct the nidmap
|
||||
*/
|
||||
if (orte_static_ports) {
|
||||
/* construct the nidmap arrays */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
if (NULL != orted_launch_cmd) {
|
||||
/* the launch cmd was given via regexp on the cmd line - parse
|
||||
* it to get the contact info
|
||||
@ -209,23 +211,16 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
/* be sure to update the routing tree so the initial "phone home"
|
||||
* to mpirun goes through the tree!
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.update_routing_tree())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "failed to update routing tree";
|
||||
goto error;
|
||||
}
|
||||
} else {
|
||||
/* initialize the nidmaps */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
/* be sure to update the routing tree so the initial "phone home"
|
||||
* to mpirun goes through the tree!
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.update_routing_tree())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "failed to update routing tree";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Now provide a chance for the PLM
|
||||
* to perform any module-specific init functions. This
|
||||
* needs to occur AFTER the communications are setup
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Institut National de Recherche en Informatique
|
||||
* et Automatique. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -1108,13 +1110,18 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
rml_uri = orte_rml.get_contact_info();
|
||||
} else {
|
||||
asprintf(&param, "\"%s\"", orte_rml.get_contact_info() );
|
||||
opal_argv_append(argc, argv, "--parent-uri");
|
||||
opal_argv_append(argc, argv, param);
|
||||
free(param);
|
||||
|
||||
rml_uri = orte_process_info.my_hnp_uri;
|
||||
}
|
||||
asprintf(&param, "\"%s\"", rml_uri);
|
||||
opal_argv_append(argc, argv, "--hnp-uri");
|
||||
opal_argv_append(argc, argv, param);
|
||||
free(param);
|
||||
|
||||
|
||||
/* if given, pass the node list */
|
||||
if (NULL != nodes) {
|
||||
opal_argv_append(argc, argv, "-mca");
|
||||
|
@ -12,6 +12,8 @@
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2009 Institut National de Recherche en Informatique
|
||||
* et Automatique. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -63,6 +65,7 @@
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
@ -169,6 +172,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"URI for the HNP"},
|
||||
|
||||
{ "orte", "parent", "uri", '\0', NULL, "parent-uri", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"URI for the parent if tree launch is enabled."},
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "set-sid", 0,
|
||||
&orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Direct the orted to separate from the current session"},
|
||||
@ -677,6 +684,37 @@ int orte_daemon(int argc, char *argv[])
|
||||
OBJ_RELEASE(buffer);
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
mca_base_param_reg_string_name("orte", "parent_uri",
|
||||
"URI for the parent if tree launch is enabled.",
|
||||
true, false, NULL, &rml_uri);
|
||||
if (NULL != rml_uri) {
|
||||
orte_process_name_t parent;
|
||||
|
||||
/* set the contact info into the hash table */
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.set_contact_info(rml_uri))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
free(rml_uri);
|
||||
OBJ_RELEASE(buffer);
|
||||
goto DONE;
|
||||
}
|
||||
ret = orte_rml_base_parse_uris(rml_uri, &parent, NULL );
|
||||
if( ORTE_SUCCESS != ret ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
free(rml_uri);
|
||||
OBJ_RELEASE(buffer);
|
||||
goto DONE;
|
||||
}
|
||||
free(rml_uri);
|
||||
|
||||
if( 0 > (ret = orte_rml.send_buffer(&parent, buffer,
|
||||
ORTE_RML_TAG_ORTED_CALLBACK, 0)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(buffer);
|
||||
goto DONE;
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_RELEASE(buffer); /* done with this */
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user