Fix the bug that caused mpirun to hang when a remote executable wasn't found while using the rsh launcher. Will now test on a remote node.
This commit was SVN r12095.
Этот коммит содержится в:
родитель
7dc9995955
Коммит
f91a95b3fe
@ -18,20 +18,7 @@
|
||||
#
|
||||
# This is the US/English general help file for Open RTE's orted launcher.
|
||||
#
|
||||
[odls-default:chdir-error]
|
||||
Failed to change to the working directory:
|
||||
|
||||
Host: %s
|
||||
Directory: %s
|
||||
|
||||
The error returned was "%s". Execution will now abort.
|
||||
[odls-default:argv0-not-accessible]
|
||||
Failed to find or execute the following executable:
|
||||
|
||||
Host: %s
|
||||
Executable: %s
|
||||
|
||||
Cannot continue.
|
||||
[odls-default:could-not-kill]
|
||||
WARNING: A process refused to die!
|
||||
|
||||
@ -50,3 +37,11 @@ PID: %d
|
||||
Errno: %d
|
||||
|
||||
This process may still be running and/or consuming resources.
|
||||
|
||||
[orte-odls-default:execv-error]
|
||||
Could not execute the executable "%s": %s
|
||||
|
||||
This could mean that your PATH or executable name is wrong, or that you do not
|
||||
have the necessary permissions. Please ensure that the executable is able to be
|
||||
found and executed.
|
||||
|
||||
|
@ -510,19 +510,15 @@ static int odls_default_fork_local_proc(
|
||||
orte_iof_base_setup_child(&opts);
|
||||
|
||||
/* Try to change to the context cwd and check that the app
|
||||
exists and is executable */
|
||||
exists and is executable. The resource manager functions will
|
||||
take care of outputting a pretty error message, if required
|
||||
*/
|
||||
if (ORTE_SUCCESS != (i = orte_rmgr.check_context_cwd(context, true))) {
|
||||
opal_show_help("help-odls-default.txt",
|
||||
"odls-default:chdir-error",
|
||||
true, orte_system_info.nodename, context->cwd);
|
||||
/* Tell the parent that Badness happened */
|
||||
/* Tell the parent that Badness happened */
|
||||
write(p[1], &i, sizeof(int));
|
||||
exit(-1);
|
||||
}
|
||||
if (ORTE_SUCCESS != (i = orte_rmgr.check_context_app(context))) {
|
||||
opal_show_help("help-odls-default.txt",
|
||||
"odls-default:argv0-not-accessible",
|
||||
true, orte_system_info.nodename, context->app);
|
||||
/* Tell the parent that Badness happened */
|
||||
write(p[1], &i, sizeof(int));
|
||||
exit(-1);
|
||||
@ -655,7 +651,7 @@ static int odls_default_fork_local_proc(
|
||||
/* Exec the new executable */
|
||||
|
||||
execve(context->app, context->argv, environ_copy);
|
||||
opal_show_help("help-orted-launcer.txt", "orted-launcher:execv-error",
|
||||
opal_show_help("help-odls-default.txt", "orte-odls-default:execv-error",
|
||||
true, context->app, strerror(errno));
|
||||
exit(-1);
|
||||
} else {
|
||||
@ -688,7 +684,6 @@ static int odls_default_fork_local_proc(
|
||||
the SOH or else everyone else will hang. Don't bother
|
||||
checking whether or not this worked - just fire and forget
|
||||
*/
|
||||
ORTE_ERROR_LOG(i);
|
||||
orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_ABORTED, rc);
|
||||
return ORTE_ERR_FATAL;
|
||||
break;
|
||||
|
@ -11,27 +11,6 @@
|
||||
#
|
||||
# This is the US/English general help file for Open RTE's orterun.
|
||||
#
|
||||
[orte-odls-process:chdir-error]
|
||||
Failed to change to the working directory:
|
||||
|
||||
Host: %s
|
||||
Directory: %s
|
||||
|
||||
The error returned was "%s". Execution will now abort.
|
||||
[orte-odls-process:argv0-not-found]
|
||||
Failed to find the following executable:
|
||||
|
||||
Host: %s
|
||||
Executable: %s
|
||||
|
||||
Cannot continue.
|
||||
[orte-odls-process:argv0-not-accessible]
|
||||
Failed to find or execute the following executable:
|
||||
|
||||
Host: %s
|
||||
Executable: %s
|
||||
|
||||
Cannot continue.
|
||||
[orte-odls-process:execv-error]
|
||||
Could not execute the executable "%s": %s
|
||||
|
||||
|
@ -442,18 +442,14 @@ static int orte_odls_process_fork_local_proc(
|
||||
}
|
||||
|
||||
/* Try to change to the context cwd and check that the app
|
||||
exists and is executable */
|
||||
exists and is executable. The RMGR functions will print
|
||||
out a pretty error message if either of these operations fails
|
||||
*/
|
||||
if (ORTE_SUCCESS != (i = orte_rmgr.check_context_cwd(context, true))) {
|
||||
opal_show_help("help-odls-default.txt",
|
||||
"odls-default:chdir-error",
|
||||
true, orte_system_info.nodename, context->cwd);
|
||||
/* Tell the parent that Badness happened */
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
if (ORTE_SUCCESS != (i = orte_rmgr.check_context_app(context))) {
|
||||
opal_show_help("help-odls-default.txt",
|
||||
"odls-default:argv0-not-accessible",
|
||||
true, orte_system_info.nodename, context->app);
|
||||
/* Tell the parent that Badness happened */
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
|
@ -22,24 +22,3 @@ No available launching agents were found.
|
||||
This is an unusual error; it means that Open RTE was unable to find
|
||||
any mechanism to launch processes, and therefore is unable to start the
|
||||
process(es) required by your application.
|
||||
[chdir-error]
|
||||
Failed to change to the working directory:
|
||||
|
||||
Host: %s
|
||||
Directory: %s
|
||||
|
||||
The error returned was "%s". Execution will now abort.
|
||||
[argv0-not-found]
|
||||
Failed to find the following executable:
|
||||
|
||||
Host: %s
|
||||
Executable: %s
|
||||
|
||||
Cannot continue.
|
||||
[argv0-not-accessible]
|
||||
Failed to find or execute the following executable:
|
||||
|
||||
Host: %s
|
||||
Executable: %s
|
||||
|
||||
Cannot continue.
|
||||
|
@ -22,6 +22,7 @@ libmca_rmgr_la_SOURCES =
|
||||
|
||||
# header setup
|
||||
nobase_orte_HEADERS =
|
||||
dist_pkgdata_DATA =
|
||||
|
||||
# local files
|
||||
headers = rmgr.h rmgr_types.h
|
||||
|
@ -16,6 +16,8 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA += base/help-rmgr-base.txt
|
||||
|
||||
headers += \
|
||||
base/rmgr_private.h \
|
||||
base/base.h
|
||||
|
39
orte/mca/rmgr/base/help-rmgr-base.txt
Обычный файл
39
orte/mca/rmgr/base/help-rmgr-base.txt
Обычный файл
@ -0,0 +1,39 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
[chdir-error]
|
||||
Failed to change to the working directory:
|
||||
|
||||
Host: %s
|
||||
Directory: %s
|
||||
|
||||
The error returned was "%s". Execution will now abort.
|
||||
[argv0-not-found]
|
||||
Failed to find the following executable:
|
||||
|
||||
Host: %s
|
||||
Executable: %s
|
||||
|
||||
Cannot continue.
|
||||
[argv0-not-accessible]
|
||||
Failed to find or execute the following executable:
|
||||
|
||||
Host: %s
|
||||
Executable: %s
|
||||
|
||||
Cannot continue.
|
@ -77,7 +77,7 @@ int orte_rmgr_base_check_context_cwd(orte_app_context_t *context,
|
||||
was, barf because they specifically asked for something we
|
||||
can't provide. */
|
||||
if (context->user_specified_cwd) {
|
||||
opal_show_help("help-pls-base.txt", "chdir-error",
|
||||
opal_show_help("help-rmgr-base.txt", "chdir-error",
|
||||
true, hostname, context->cwd, strerror(errno));
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
@ -97,7 +97,7 @@ int orte_rmgr_base_check_context_cwd(orte_app_context_t *context,
|
||||
good = false;
|
||||
}
|
||||
if (!good) {
|
||||
opal_show_help("help-pls-base.txt", "chdir-error",
|
||||
opal_show_help("help-rmgr-base.txt", "chdir-error",
|
||||
true, tmp, strerror(errno));
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
@ -152,7 +152,7 @@ int orte_rmgr_base_check_context_app(orte_app_context_t *context)
|
||||
free(tmp);
|
||||
tmp = opal_path_findv(context->argv[0], X_OK, environ, context->cwd);
|
||||
if (NULL == tmp) {
|
||||
opal_show_help("help-pls-base.txt",
|
||||
opal_show_help("help-rmgr-base.txt",
|
||||
"argv0-not-found",
|
||||
true, hostname, context->argv[0]);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
@ -161,7 +161,7 @@ int orte_rmgr_base_check_context_app(orte_app_context_t *context)
|
||||
context->app = tmp;
|
||||
} else {
|
||||
if (0 != access(context->app, X_OK)) {
|
||||
opal_show_help("help-pls-base.txt",
|
||||
opal_show_help("help-rmgr-base.txt",
|
||||
"argv0-not-accessible",
|
||||
true, hostname, context->argv[0]);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user