Fix tree spawn in the rsh/qrsh environment
This commit was SVN r26631.
This commit is contained in:
parent
249066e06d
commit
e9591f2563
@@ -897,11 +897,18 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
|||||||
free(param);
|
free(param);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if we want to use a common port, tell the daemon to do so */
|
|
||||||
if (orte_use_common_port) {
|
if (orte_use_common_port) {
|
||||||
|
/* tell the daemon to use the common port */
|
||||||
opal_argv_append(argc, argv, "--use-common-port");
|
opal_argv_append(argc, argv, "--use-common-port");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* warn the daemons if we are using a tree spawn pattern so they
|
||||||
|
* know they shouldn't do a rollup on their callback
|
||||||
|
*/
|
||||||
|
if (NULL != orte_tree_launch_cmd) {
|
||||||
|
opal_argv_append(argc, argv, "--tree-spawn");
|
||||||
|
}
|
||||||
|
|
||||||
/* pass along any cmd line MCA params provided to mpirun,
|
/* pass along any cmd line MCA params provided to mpirun,
|
||||||
* being sure to "purge" any that would cause problems
|
* being sure to "purge" any that would cause problems
|
||||||
* on backend nodes
|
* on backend nodes
|
||||||
|
@@ -785,6 +785,11 @@ static int remote_spawn(opal_buffer_t *launch)
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ensure the system knows we are not using common ports since we are
|
||||||
|
* doing a tree spawn
|
||||||
|
*/
|
||||||
|
orte_use_common_port = false;
|
||||||
|
|
||||||
/* setup the launch */
|
/* setup the launch */
|
||||||
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, orte_process_info.nodename, &node_name_index1,
|
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, orte_process_info.nodename, &node_name_index1,
|
||||||
&proc_vpid_index, prefix))) {
|
&proc_vpid_index, prefix))) {
|
||||||
@@ -793,6 +798,9 @@ static int remote_spawn(opal_buffer_t *launch)
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* tell the daemon we are in a tree spawn */
|
||||||
|
opal_argv_append(&argc, &argv, "--tree-spawn");
|
||||||
|
|
||||||
/* get the daemon job object */
|
/* get the daemon job object */
|
||||||
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
@@ -1068,21 +1076,19 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* setup the launch */
|
|
||||||
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, node->name, &node_name_index1,
|
|
||||||
&proc_vpid_index, prefix_dir))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* if we are tree launching, find our children and create the launch cmd */
|
/* if we are tree launching, find our children and create the launch cmd */
|
||||||
if (mca_plm_rsh_component.tree_spawn) {
|
if (mca_plm_rsh_component.tree_spawn) {
|
||||||
orte_daemon_cmd_flag_t command = ORTE_DAEMON_TREE_SPAWN;
|
orte_daemon_cmd_flag_t command = ORTE_DAEMON_TREE_SPAWN;
|
||||||
opal_byte_object_t bo, *boptr;
|
opal_byte_object_t bo, *boptr;
|
||||||
orte_job_t *jdatorted;
|
orte_job_t *jdatorted;
|
||||||
|
|
||||||
orte_tree_launch_cmd= OBJ_NEW(opal_buffer_t);
|
/* ensure all systems know we are not using a common port since we
|
||||||
|
* are doing a tree spawn
|
||||||
|
*/
|
||||||
|
orte_use_common_port = false;
|
||||||
|
|
||||||
|
/* get the tree spawn buffer */
|
||||||
|
orte_tree_launch_cmd = OBJ_NEW(opal_buffer_t);
|
||||||
/* insert the tree_spawn cmd */
|
/* insert the tree_spawn cmd */
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(orte_tree_launch_cmd, &command, 1, ORTE_DAEMON_CMD))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(orte_tree_launch_cmd, &command, 1, ORTE_DAEMON_CMD))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@@ -1123,6 +1129,13 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
|||||||
orte_routed.get_routing_list(ORTE_GRPCOMM_XCAST, &coll);
|
orte_routed.get_routing_list(ORTE_GRPCOMM_XCAST, &coll);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* setup the launch */
|
||||||
|
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, node->name, &node_name_index1,
|
||||||
|
&proc_vpid_index, prefix_dir))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Iterate through each of the nodes
|
* Iterate through each of the nodes
|
||||||
*/
|
*/
|
||||||
|
@@ -117,6 +117,7 @@ static struct {
|
|||||||
int fail_delay;
|
int fail_delay;
|
||||||
bool abort;
|
bool abort;
|
||||||
bool mapreduce;
|
bool mapreduce;
|
||||||
|
bool tree_spawn;
|
||||||
} orted_globals;
|
} orted_globals;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -172,6 +173,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
|
|||||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Use the same port as the HNP."},
|
"Use the same port as the HNP."},
|
||||||
|
|
||||||
|
{ NULL, NULL, NULL, '\0', NULL, "tree-spawn", 0,
|
||||||
|
&orted_globals.tree_spawn, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Tree spawn is underway"},
|
||||||
|
|
||||||
{ NULL, NULL, NULL, '\0', NULL, "set-sid", 0,
|
{ NULL, NULL, NULL, '\0', NULL, "set-sid", 0,
|
||||||
&orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
|
&orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Direct the orted to separate from the current session"},
|
"Direct the orted to separate from the current session"},
|
||||||
@@ -718,7 +723,7 @@ int orte_daemon(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (orte_static_ports || orte_use_common_port) {
|
if ((orte_static_ports || orte_use_common_port) && !orted_globals.tree_spawn) {
|
||||||
/* use the rollup collective to send our data to the HNP
|
/* use the rollup collective to send our data to the HNP
|
||||||
* so we minimize the HNP bottleneck
|
* so we minimize the HNP bottleneck
|
||||||
*/
|
*/
|
||||||
|
Loading…
Reference in new issue
Block a user