Fix tree spawn in the rsh/qrsh environment
This commit was SVN r26631.
This commit is contained in:
parent
249066e06d
Commit
e9591f2563
@@ -897,11 +897,18 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
free(param);
|
||||
}
|
||||
|
||||
/* if we want to use a common port, tell the daemon to do so */
|
||||
if (orte_use_common_port) {
|
||||
/* tell the daemon to use the common port */
|
||||
opal_argv_append(argc, argv, "--use-common-port");
|
||||
}
|
||||
|
||||
/* warn the daemons if we are using a tree spawn pattern so they
|
||||
* know they shouldn't do a rollup on their callback
|
||||
*/
|
||||
if (NULL != orte_tree_launch_cmd) {
|
||||
opal_argv_append(argc, argv, "--tree-spawn");
|
||||
}
|
||||
|
||||
/* pass along any cmd line MCA params provided to mpirun,
|
||||
* being sure to "purge" any that would cause problems
|
||||
* on backend nodes
|
||||
@@ -958,8 +965,8 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
mca_base_param_lookup_string(loc_id, &amca_param_prefix);
|
||||
if( NULL != amca_param_prefix ) {
|
||||
/* Could also use the short version '-am'
|
||||
* but being verbose has some value
|
||||
*/
|
||||
* but being verbose has some value
|
||||
*/
|
||||
opal_argv_append(argc, argv, "-mca");
|
||||
opal_argv_append(argc, argv, "mca_base_param_file_prefix");
|
||||
opal_argv_append(argc, argv, amca_param_prefix);
|
||||
|
@@ -785,6 +785,11 @@ static int remote_spawn(opal_buffer_t *launch)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* ensure the system knows we are not using common ports since we are
|
||||
* doing a tree spawn
|
||||
*/
|
||||
orte_use_common_port = false;
|
||||
|
||||
/* setup the launch */
|
||||
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, orte_process_info.nodename, &node_name_index1,
|
||||
&proc_vpid_index, prefix))) {
|
||||
@@ -792,7 +797,10 @@ static int remote_spawn(opal_buffer_t *launch)
|
||||
OBJ_DESTRUCT(&coll);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
||||
/* tell the daemon we are in a tree spawn */
|
||||
opal_argv_append(&argc, &argv, "--tree-spawn");
|
||||
|
||||
/* get the daemon job object */
|
||||
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
@@ -1067,22 +1075,20 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* setup the launch */
|
||||
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, node->name, &node_name_index1,
|
||||
&proc_vpid_index, prefix_dir))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* if we are tree launching, find our children and create the launch cmd */
|
||||
if (mca_plm_rsh_component.tree_spawn) {
|
||||
orte_daemon_cmd_flag_t command = ORTE_DAEMON_TREE_SPAWN;
|
||||
opal_byte_object_t bo, *boptr;
|
||||
orte_job_t *jdatorted;
|
||||
|
||||
orte_tree_launch_cmd= OBJ_NEW(opal_buffer_t);
|
||||
/* ensure all systems know we are not using a common port since we
|
||||
* are doing a tree spawn
|
||||
*/
|
||||
orte_use_common_port = false;
|
||||
|
||||
/* get the tree spawn buffer */
|
||||
orte_tree_launch_cmd = OBJ_NEW(opal_buffer_t);
|
||||
/* insert the tree_spawn cmd */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(orte_tree_launch_cmd, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@@ -1123,6 +1129,13 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
orte_routed.get_routing_list(ORTE_GRPCOMM_XCAST, &coll);
|
||||
}
|
||||
|
||||
/* setup the launch */
|
||||
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, node->name, &node_name_index1,
|
||||
&proc_vpid_index, prefix_dir))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate through each of the nodes
|
||||
*/
|
||||
|
@@ -117,6 +117,7 @@ static struct {
|
||||
int fail_delay;
|
||||
bool abort;
|
||||
bool mapreduce;
|
||||
bool tree_spawn;
|
||||
} orted_globals;
|
||||
|
||||
/*
|
||||
@@ -172,6 +173,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Use the same port as the HNP."},
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "tree-spawn", 0,
|
||||
&orted_globals.tree_spawn, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Tree spawn is underway"},
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "set-sid", 0,
|
||||
&orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Direct the orted to separate from the current session"},
|
||||
@@ -718,7 +723,7 @@ int orte_daemon(int argc, char *argv[])
|
||||
}
|
||||
#endif
|
||||
|
||||
if (orte_static_ports || orte_use_common_port) {
|
||||
if ((orte_static_ports || orte_use_common_port) && !orted_globals.tree_spawn) {
|
||||
/* use the rollup collective to send our data to the HNP
|
||||
* so we minimize the HNP bottleneck
|
||||
*/
|
||||
|
Loading…
Reference in new issue
Block a user