1
1

Fix the bug that caused mpirun to hang when a remote executable wasn't found using the rsh launcher. Will now test on a remote node.

This commit was SVN r12095.
Этот коммит содержится в:
Ralph Castain 2006-10-11 18:43:13 +00:00
родитель 7dc9995955
Коммит f91a95b3fe
9 изменённых файлов: 62 добавлений и 76 удалений

Просмотреть файл

@ -18,20 +18,7 @@
#
# This is the US/English general help file for Open RTE's orted launcher.
#
[odls-default:chdir-error]
Failed to change to the working directory:
Host: %s
Directory: %s
The error returned was "%s". Execution will now abort.
[odls-default:argv0-not-accessible]
Failed to find or execute the following executable:
Host: %s
Executable: %s
Cannot continue.
[odls-default:could-not-kill]
WARNING: A process refused to die!
@ -50,3 +37,11 @@ PID: %d
Errno: %d
This process may still be running and/or consuming resources.
[orte-odls-default:execv-error]
Could not execute the executable "%s": %s
This could mean that your PATH or executable name is wrong, or that you do not
have the necessary permissions. Please ensure that the executable is able to be
found and executed.

Просмотреть файл

@ -510,19 +510,15 @@ static int odls_default_fork_local_proc(
orte_iof_base_setup_child(&opts);
/* Try to change to the context cwd and check that the app
exists and is executable */
exists and is executable. The resource manager functions will
take care of outputting a pretty error message, if required
*/
if (ORTE_SUCCESS != (i = orte_rmgr.check_context_cwd(context, true))) {
opal_show_help("help-odls-default.txt",
"odls-default:chdir-error",
true, orte_system_info.nodename, context->cwd);
/* Tell the parent that Badness happened */
/* Tell the parent that Badness happened */
write(p[1], &i, sizeof(int));
exit(-1);
}
if (ORTE_SUCCESS != (i = orte_rmgr.check_context_app(context))) {
opal_show_help("help-odls-default.txt",
"odls-default:argv0-not-accessible",
true, orte_system_info.nodename, context->app);
/* Tell the parent that Badness happened */
write(p[1], &i, sizeof(int));
exit(-1);
@ -655,7 +651,7 @@ static int odls_default_fork_local_proc(
/* Exec the new executable */
execve(context->app, context->argv, environ_copy);
opal_show_help("help-orted-launcer.txt", "orted-launcher:execv-error",
opal_show_help("help-odls-default.txt", "orte-odls-default:execv-error",
true, context->app, strerror(errno));
exit(-1);
} else {
@ -688,7 +684,6 @@ static int odls_default_fork_local_proc(
the SOH or else everyone else will hang. Don't bother
checking whether or not this worked - just fire and forget
*/
ORTE_ERROR_LOG(i);
orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_ABORTED, rc);
return ORTE_ERR_FATAL;
break;

Просмотреть файл

@ -11,27 +11,6 @@
#
# This is the US/English general help file for Open RTE's orterun.
#
[orte-odls-process:chdir-error]
Failed to change to the working directory:
Host: %s
Directory: %s
The error returned was "%s". Execution will now abort.
[orte-odls-process:argv0-not-found]
Failed to find the following executable:
Host: %s
Executable: %s
Cannot continue.
[orte-odls-process:argv0-not-accessible]
Failed to find or execute the following executable:
Host: %s
Executable: %s
Cannot continue.
[orte-odls-process:execv-error]
Could not execute the executable "%s": %s

Просмотреть файл

@ -442,18 +442,14 @@ static int orte_odls_process_fork_local_proc(
}
/* Try to change to the context cwd and check that the app
exists and is executable */
exists and is executable. The RMGR functions will print
out a pretty error message if either of these operations fails
*/
if (ORTE_SUCCESS != (i = orte_rmgr.check_context_cwd(context, true))) {
opal_show_help("help-odls-default.txt",
"odls-default:chdir-error",
true, orte_system_info.nodename, context->cwd);
/* Tell the parent that Badness happened */
return ORTE_ERR_FATAL;
}
if (ORTE_SUCCESS != (i = orte_rmgr.check_context_app(context))) {
opal_show_help("help-odls-default.txt",
"odls-default:argv0-not-accessible",
true, orte_system_info.nodename, context->app);
/* Tell the parent that Badness happened */
return ORTE_ERR_FATAL;
}

Просмотреть файл

@ -22,24 +22,3 @@ No available launching agents were found.
This is an unusual error; it means that Open RTE was unable to find
any mechanism to launch processes, and therefore is unable to start the
process(es) required by your application.
[chdir-error]
Failed to change to the working directory:
Host: %s
Directory: %s
The error returned was "%s". Execution will now abort.
[argv0-not-found]
Failed to find the following executable:
Host: %s
Executable: %s
Cannot continue.
[argv0-not-accessible]
Failed to find or execute the following executable:
Host: %s
Executable: %s
Cannot continue.

Просмотреть файл

@ -22,6 +22,7 @@ libmca_rmgr_la_SOURCES =
# header setup
nobase_orte_HEADERS =
dist_pkgdata_DATA =
# local files
headers = rmgr.h rmgr_types.h

Просмотреть файл

@ -16,6 +16,8 @@
# $HEADER$
#
dist_pkgdata_DATA += base/help-rmgr-base.txt
headers += \
base/rmgr_private.h \
base/base.h

39
orte/mca/rmgr/base/help-rmgr-base.txt Обычный файл
Просмотреть файл

@ -0,0 +1,39 @@
# -*- text -*-
#
# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2006 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
[chdir-error]
Failed to change to the working directory:
Host: %s
Directory: %s
The error returned was "%s". Execution will now abort.
[argv0-not-found]
Failed to find the following executable:
Host: %s
Executable: %s
Cannot continue.
[argv0-not-accessible]
Failed to find or execute the following executable:
Host: %s
Executable: %s
Cannot continue.

Просмотреть файл

@ -77,7 +77,7 @@ int orte_rmgr_base_check_context_cwd(orte_app_context_t *context,
was, barf because they specifically asked for something we
can't provide. */
if (context->user_specified_cwd) {
opal_show_help("help-pls-base.txt", "chdir-error",
opal_show_help("help-rmgr-base.txt", "chdir-error",
true, hostname, context->cwd, strerror(errno));
return ORTE_ERR_NOT_FOUND;
}
@ -97,7 +97,7 @@ int orte_rmgr_base_check_context_cwd(orte_app_context_t *context,
good = false;
}
if (!good) {
opal_show_help("help-pls-base.txt", "chdir-error",
opal_show_help("help-rmgr-base.txt", "chdir-error",
true, tmp, strerror(errno));
return ORTE_ERR_NOT_FOUND;
}
@ -152,7 +152,7 @@ int orte_rmgr_base_check_context_app(orte_app_context_t *context)
free(tmp);
tmp = opal_path_findv(context->argv[0], X_OK, environ, context->cwd);
if (NULL == tmp) {
opal_show_help("help-pls-base.txt",
opal_show_help("help-rmgr-base.txt",
"argv0-not-found",
true, hostname, context->argv[0]);
return ORTE_ERR_NOT_FOUND;
@ -161,7 +161,7 @@ int orte_rmgr_base_check_context_app(orte_app_context_t *context)
context->app = tmp;
} else {
if (0 != access(context->app, X_OK)) {
opal_show_help("help-pls-base.txt",
opal_show_help("help-rmgr-base.txt",
"argv0-not-accessible",
true, hostname, context->argv[0]);
return ORTE_ERR_NOT_FOUND;