1
1

Fix tree spawn in the rsh/qrsh environment

This commit was SVN r26631.
Этот коммит содержится в:
Ralph Castain 2012-06-21 21:29:28 +00:00
родитель 249066e06d
Коммит e9591f2563
3 изменённых файла: 40 добавлений и 15 удалений

Просмотреть файл

@ -897,11 +897,18 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
free(param); free(param);
} }
/* if we want to use a common port, tell the daemon to do so */
if (orte_use_common_port) { if (orte_use_common_port) {
/* tell the daemon to use the common port */
opal_argv_append(argc, argv, "--use-common-port"); opal_argv_append(argc, argv, "--use-common-port");
} }
/* warn the daemons if we are using a tree spawn pattern so they
* know they shouldn't do a rollup on their callback
*/
if (NULL != orte_tree_launch_cmd) {
opal_argv_append(argc, argv, "--tree-spawn");
}
/* pass along any cmd line MCA params provided to mpirun, /* pass along any cmd line MCA params provided to mpirun,
* being sure to "purge" any that would cause problems * being sure to "purge" any that would cause problems
* on backend nodes * on backend nodes

Просмотреть файл

@ -785,6 +785,11 @@ static int remote_spawn(opal_buffer_t *launch)
goto cleanup; goto cleanup;
} }
/* ensure the system knows we are not using common ports since we are
* doing a tree spawn
*/
orte_use_common_port = false;
/* setup the launch */ /* setup the launch */
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, orte_process_info.nodename, &node_name_index1, if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, orte_process_info.nodename, &node_name_index1,
&proc_vpid_index, prefix))) { &proc_vpid_index, prefix))) {
@ -793,6 +798,9 @@ static int remote_spawn(opal_buffer_t *launch)
goto cleanup; goto cleanup;
} }
/* tell the daemon we are in a tree spawn */
opal_argv_append(&argc, &argv, "--tree-spawn");
/* get the daemon job object */ /* get the daemon job object */
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
@ -1068,21 +1076,19 @@ static void launch_daemons(int fd, short args, void *cbdata)
} }
} }
/* setup the launch */
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, node->name, &node_name_index1,
&proc_vpid_index, prefix_dir))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* if we are tree launching, find our children and create the launch cmd */ /* if we are tree launching, find our children and create the launch cmd */
if (mca_plm_rsh_component.tree_spawn) { if (mca_plm_rsh_component.tree_spawn) {
orte_daemon_cmd_flag_t command = ORTE_DAEMON_TREE_SPAWN; orte_daemon_cmd_flag_t command = ORTE_DAEMON_TREE_SPAWN;
opal_byte_object_t bo, *boptr; opal_byte_object_t bo, *boptr;
orte_job_t *jdatorted; orte_job_t *jdatorted;
orte_tree_launch_cmd= OBJ_NEW(opal_buffer_t); /* ensure all systems know we are not using a common port since we
* are doing a tree spawn
*/
orte_use_common_port = false;
/* get the tree spawn buffer */
orte_tree_launch_cmd = OBJ_NEW(opal_buffer_t);
/* insert the tree_spawn cmd */ /* insert the tree_spawn cmd */
if (ORTE_SUCCESS != (rc = opal_dss.pack(orte_tree_launch_cmd, &command, 1, ORTE_DAEMON_CMD))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(orte_tree_launch_cmd, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -1123,6 +1129,13 @@ static void launch_daemons(int fd, short args, void *cbdata)
orte_routed.get_routing_list(ORTE_GRPCOMM_XCAST, &coll); orte_routed.get_routing_list(ORTE_GRPCOMM_XCAST, &coll);
} }
/* setup the launch */
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, node->name, &node_name_index1,
&proc_vpid_index, prefix_dir))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* /*
* Iterate through each of the nodes * Iterate through each of the nodes
*/ */

Просмотреть файл

@ -117,6 +117,7 @@ static struct {
int fail_delay; int fail_delay;
bool abort; bool abort;
bool mapreduce; bool mapreduce;
bool tree_spawn;
} orted_globals; } orted_globals;
/* /*
@ -172,6 +173,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
NULL, OPAL_CMD_LINE_TYPE_BOOL, NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Use the same port as the HNP."}, "Use the same port as the HNP."},
{ NULL, NULL, NULL, '\0', NULL, "tree-spawn", 0,
&orted_globals.tree_spawn, OPAL_CMD_LINE_TYPE_BOOL,
"Tree spawn is underway"},
{ NULL, NULL, NULL, '\0', NULL, "set-sid", 0, { NULL, NULL, NULL, '\0', NULL, "set-sid", 0,
&orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL, &orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
"Direct the orted to separate from the current session"}, "Direct the orted to separate from the current session"},
@ -718,7 +723,7 @@ int orte_daemon(int argc, char *argv[])
} }
#endif #endif
if (orte_static_ports || orte_use_common_port) { if ((orte_static_ports || orte_use_common_port) && !orted_globals.tree_spawn) {
/* use the rollup collective to send our data to the HNP /* use the rollup collective to send our data to the HNP
* so we minimize the HNP bottleneck * so we minimize the HNP bottleneck
*/ */