1
1

The missing orted bug is now fixed. orterun will not deadlock when

the program it tries to spawn is missing.

Description of the problem: When the rsh pls tries to spawn a local
process that is missing (such as a removed orted), orterun
deadlocks.

Description of the fix: The forked child is responsible for finding the
program to be executed. If it fails to find it, then instead of
calling exit (as a forked child is normally expected to do) it
continues executing along a path it was never expected to take
(returning back into orterun and then main). Bad things then
happen, as expected. Forcing the child to call exit when it fails
to find the orted (and forcing the child to call exit everywhere
instead of return) corrects the logic of the rsh pls and makes it
behave as expected.

This commit was SVN r14377.
Этот коммит содержится в:
George Bosilca 2007-04-14 17:36:27 +00:00
родитель ac23fa994d
Коммит 9e840fbe14

Просмотреть файл

@ -172,7 +172,7 @@ static int orte_pls_rsh_probe(orte_mapped_node_t * node, orte_pls_rsh_shell * sh
else if (pid == 0) { /* child */
if (dup2(fd[1], 1) < 0) {
opal_output(0, "pls:rsh: dup2 failed with errno=%d\n", errno);
return ORTE_ERR_IN_ERRNO;
exit(01);
}
/* Build argv array */
argv = opal_argv_copy(mca_pls_rsh_component.agent_argv);
@ -805,7 +805,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
rc = orte_pls_rsh_fill_exec_path (&exec_path);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
exit(-1); /* the forked process MUST exit */
}
} else {
if (NULL != prefix_dir) {
@ -816,7 +816,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
rc = orte_pls_rsh_fill_exec_path (&exec_path);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
exit(-1); /* the forked process MUST exit */
}
}
}
@ -888,8 +888,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
var = getenv("HOME");
if (NULL != var) {
if (mca_pls_rsh_component.debug) {
opal_output(0, "pls:rsh: changing to directory %s",
var);
opal_output(0, "pls:rsh: changing to directory %s", var);
}
/* Ignore errors -- what are we going to do?
(and we ignore errors on the remote nodes