The missing orted bug is now fixed. orterun will no longer deadlock when
the program it tries to spawn is missing. Description of the problem: When the rsh pls tries to spawn a local process which is missing (such as a removed orted), orterun deadlocks. Description of the fix: The forked child deals with finding the program to be executed. If it failed to find it, then instead of calling exit (as a normal forked program is expected to do) it continued executing along a path it was never expected to use (back into orterun and then main). Bad things happened, as expected. Forcing the child to call exit when it fails to find the orted (and forcing the child to use exit everywhere instead of return) corrects the logic of the rsh pls and makes it behave as expected. This commit was SVN r14377.
Этот коммит содержится в:
родитель
ac23fa994d
Коммит
9e840fbe14
@ -172,7 +172,7 @@ static int orte_pls_rsh_probe(orte_mapped_node_t * node, orte_pls_rsh_shell * sh
|
||||
else if (pid == 0) { /* child */
|
||||
if (dup2(fd[1], 1) < 0) {
|
||||
opal_output(0, "pls:rsh: dup2 failed with errno=%d\n", errno);
|
||||
return ORTE_ERR_IN_ERRNO;
|
||||
exit(01);
|
||||
}
|
||||
/* Build argv array */
|
||||
argv = opal_argv_copy(mca_pls_rsh_component.agent_argv);
|
||||
@ -805,7 +805,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
rc = orte_pls_rsh_fill_exec_path (&exec_path);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
exit(-1); /* the forked process MUST exit */
|
||||
}
|
||||
} else {
|
||||
if (NULL != prefix_dir) {
|
||||
@ -816,7 +816,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
rc = orte_pls_rsh_fill_exec_path (&exec_path);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
exit(-1); /* the forked process MUST exit */
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -888,8 +888,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
var = getenv("HOME");
|
||||
if (NULL != var) {
|
||||
if (mca_pls_rsh_component.debug) {
|
||||
opal_output(0, "pls:rsh: changing to directory %s",
|
||||
var);
|
||||
opal_output(0, "pls:rsh: changing to directory %s", var);
|
||||
}
|
||||
/* Ignore errors -- what are we going to do?
|
||||
(and we ignore errors on the remote nodes
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user