From f91a95b3fe14ab94436f1f6c7b57f962c93d39e8 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 11 Oct 2006 18:43:13 +0000 Subject: [PATCH] Fix the bug that caused mpirun to hang when a remote executable wasn't found using the rsh launcher. Will now test on a remote node This commit was SVN r12095. --- orte/mca/odls/default/help-odls-default.txt | 21 ++++------- orte/mca/odls/default/odls_default_module.c | 15 +++----- orte/mca/odls/process/help-odls-process.txt | 21 ----------- orte/mca/odls/process/odls_process_module.c | 10 ++--- orte/mca/pls/base/help-pls-base.txt | 21 ----------- orte/mca/rmgr/Makefile.am | 1 + orte/mca/rmgr/base/Makefile.am | 2 + orte/mca/rmgr/base/help-rmgr-base.txt | 39 ++++++++++++++++++++ orte/mca/rmgr/base/rmgr_base_check_context.c | 8 ++-- 9 files changed, 62 insertions(+), 76 deletions(-) create mode 100644 orte/mca/rmgr/base/help-rmgr-base.txt diff --git a/orte/mca/odls/default/help-odls-default.txt b/orte/mca/odls/default/help-odls-default.txt index ba9c0e5b9d..f7df882969 100644 --- a/orte/mca/odls/default/help-odls-default.txt +++ b/orte/mca/odls/default/help-odls-default.txt @@ -18,20 +18,7 @@ # # This is the US/English general help file for Open RTE's orted launcher. # -[odls-default:chdir-error] -Failed to change to the working directory: -Host: %s -Directory: %s - -The error returned was "%s". Execution will now abort. -[odls-default:argv0-not-accessible] -Failed to find or execute the following executable: - -Host: %s -Executable: %s - -Cannot continue. [odls-default:could-not-kill] WARNING: A process refused to die! @@ -50,3 +37,11 @@ PID: %d Errno: %d This process may still be running and/or consuming resources. + +[orte-odls-default:execv-error] +Could not execute the executable "%s": %s + +This could mean that your PATH or executable name is wrong, or that you do not +have the necessary permissions. Please ensure that the executable is able to be +found and executed. 
+ diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c index 45c997a1cb..c7495f9b2b 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -510,19 +510,15 @@ static int odls_default_fork_local_proc( orte_iof_base_setup_child(&opts); /* Try to change to the context cwd and check that the app - exists and is executable */ + exists and is executable. The resource manager functions will + take care of outputting a pretty error message, if required + */ if (ORTE_SUCCESS != (i = orte_rmgr.check_context_cwd(context, true))) { - opal_show_help("help-odls-default.txt", - "odls-default:chdir-error", - true, orte_system_info.nodename, context->cwd); - /* Tell the parent that Badness happened */ + /* Tell the parent that Badness happened */ write(p[1], &i, sizeof(int)); exit(-1); } if (ORTE_SUCCESS != (i = orte_rmgr.check_context_app(context))) { - opal_show_help("help-odls-default.txt", - "odls-default:argv0-not-accessible", - true, orte_system_info.nodename, context->app); /* Tell the parent that Badness happened */ write(p[1], &i, sizeof(int)); exit(-1); @@ -655,7 +651,7 @@ static int odls_default_fork_local_proc( /* Exec the new executable */ execve(context->app, context->argv, environ_copy); - opal_show_help("help-orted-launcer.txt", "orted-launcher:execv-error", + opal_show_help("help-odls-default.txt", "orte-odls-default:execv-error", true, context->app, strerror(errno)); exit(-1); } else { @@ -688,7 +684,6 @@ static int odls_default_fork_local_proc( the SOH or else everyone else will hang. 
Don't bother checking whether or not this worked - just fire and forget */ - ORTE_ERROR_LOG(i); orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_ABORTED, rc); return ORTE_ERR_FATAL; break; diff --git a/orte/mca/odls/process/help-odls-process.txt b/orte/mca/odls/process/help-odls-process.txt index 199cf9ea0c..1c851b8f89 100755 --- a/orte/mca/odls/process/help-odls-process.txt +++ b/orte/mca/odls/process/help-odls-process.txt @@ -11,27 +11,6 @@ # # This is the US/English general help file for Open RTE's orterun. # -[orte-odls-process:chdir-error] -Failed to change to the working directory: - -Host: %s -Directory: %s - -The error returned was "%s". Execution will now abort. -[orte-odls-process:argv0-not-found] -Failed to find the following executable: - -Host: %s -Executable: %s - -Cannot continue. -[orte-odls-process:argv0-not-accessible] -Failed to find or execute the following executable: - -Host: %s -Executable: %s - -Cannot continue. [orte-odls-process:execv-error] Could not execute the executable "%s": %s diff --git a/orte/mca/odls/process/odls_process_module.c b/orte/mca/odls/process/odls_process_module.c index 758ed5c829..0073d60e30 100755 --- a/orte/mca/odls/process/odls_process_module.c +++ b/orte/mca/odls/process/odls_process_module.c @@ -442,18 +442,14 @@ static int orte_odls_process_fork_local_proc( } /* Try to change to the context cwd and check that the app - exists and is executable */ + exists and is executable. 
The RMGR functions will print + out a pretty error message if either of these operations fails + */ if (ORTE_SUCCESS != (i = orte_rmgr.check_context_cwd(context, true))) { - opal_show_help("help-odls-default.txt", - "odls-default:chdir-error", - true, orte_system_info.nodename, context->cwd); /* Tell the parent that Badness happened */ return ORTE_ERR_FATAL; } if (ORTE_SUCCESS != (i = orte_rmgr.check_context_app(context))) { - opal_show_help("help-odls-default.txt", - "odls-default:argv0-not-accessible", - true, orte_system_info.nodename, context->app); /* Tell the parent that Badness happened */ return ORTE_ERR_FATAL; } diff --git a/orte/mca/pls/base/help-pls-base.txt b/orte/mca/pls/base/help-pls-base.txt index c87046d588..7351753214 100644 --- a/orte/mca/pls/base/help-pls-base.txt +++ b/orte/mca/pls/base/help-pls-base.txt @@ -22,24 +22,3 @@ No available launching agents were found. This is an unusual error; it means that Open RTE was unable to find any mechanism to launch proceses, and therefore is unable to start the process(es) required by your application. -[chdir-error] -Failed to change to the working directory: - -Host: %s -Directory: %s - -The error returned was "%s". Execution will now abort. -[argv0-not-found] -Failed to find the following executable: - -Host: %s -Executable: %s - -Cannot continue. -[argv0-not-accessible] -Failed to find or execute the following executable: - -Host: %s -Executable: %s - -Cannot continue. 
diff --git a/orte/mca/rmgr/Makefile.am b/orte/mca/rmgr/Makefile.am index 2546fec099..358ee97b07 100644 --- a/orte/mca/rmgr/Makefile.am +++ b/orte/mca/rmgr/Makefile.am @@ -22,6 +22,7 @@ libmca_rmgr_la_SOURCES = # header setup nobase_orte_HEADERS = +dist_pkgdata_DATA = # local files headers = rmgr.h rmgr_types.h diff --git a/orte/mca/rmgr/base/Makefile.am b/orte/mca/rmgr/base/Makefile.am index e09f5b2e22..bbb28d8731 100644 --- a/orte/mca/rmgr/base/Makefile.am +++ b/orte/mca/rmgr/base/Makefile.am @@ -16,6 +16,8 @@ # $HEADER$ # +dist_pkgdata_DATA += base/help-rmgr-base.txt + headers += \ base/rmgr_private.h \ base/base.h diff --git a/orte/mca/rmgr/base/help-rmgr-base.txt b/orte/mca/rmgr/base/help-rmgr-base.txt new file mode 100644 index 0000000000..83df4f798a --- /dev/null +++ b/orte/mca/rmgr/base/help-rmgr-base.txt @@ -0,0 +1,39 @@ +# -*- text -*- +# +# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2006 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +[chdir-error] +Failed to change to the working directory: + +Host: %s +Directory: %s + +The error returned was "%s". Execution will now abort. +[argv0-not-found] +Failed to find the following executable: + +Host: %s +Executable: %s + +Cannot continue. +[argv0-not-accessible] +Failed to find or execute the following executable: + +Host: %s +Executable: %s + +Cannot continue. 
diff --git a/orte/mca/rmgr/base/rmgr_base_check_context.c b/orte/mca/rmgr/base/rmgr_base_check_context.c index 39aa5db1e2..0fb7ad3eb7 100644 --- a/orte/mca/rmgr/base/rmgr_base_check_context.c +++ b/orte/mca/rmgr/base/rmgr_base_check_context.c @@ -77,7 +77,7 @@ int orte_rmgr_base_check_context_cwd(orte_app_context_t *context, was, barf because they specifically asked for something we can't provide. */ if (context->user_specified_cwd) { - opal_show_help("help-pls-base.txt", "chdir-error", + opal_show_help("help-rmgr-base.txt", "chdir-error", true, hostname, context->cwd, strerror(errno)); return ORTE_ERR_NOT_FOUND; } @@ -97,7 +97,7 @@ int orte_rmgr_base_check_context_cwd(orte_app_context_t *context, good = false; } if (!good) { - opal_show_help("help-pls-base.txt", "chdir-error", + opal_show_help("help-rmgr-base.txt", "chdir-error", true, tmp, strerror(errno)); return ORTE_ERR_NOT_FOUND; } @@ -152,7 +152,7 @@ int orte_rmgr_base_check_context_app(orte_app_context_t *context) free(tmp); tmp = opal_path_findv(context->argv[0], X_OK, environ, context->cwd); if (NULL == tmp) { - opal_show_help("help-pls-base.txt", + opal_show_help("help-rmgr-base.txt", "argv0-not-found", true, hostname, context->argv[0]); return ORTE_ERR_NOT_FOUND; @@ -161,7 +161,7 @@ int orte_rmgr_base_check_context_app(orte_app_context_t *context) context->app = tmp; } else { if (0 != access(context->app, X_OK)) { - opal_show_help("help-pls-base.txt", + opal_show_help("help-rmgr-base.txt", "argv0-not-accessible", true, hostname, context->argv[0]); return ORTE_ERR_NOT_FOUND;