1
1

Now that we have an "isolated" PLM component, we cannot let rsh silently decline to run when it cannot find a launch agent - if we do, we will -always- run on the local node. So if the user specifies a launch agent and we cannot find it, generate a pretty error message, return a fatal error to the component selection code, and exit.

This required modifying the mca_base_select function to actually check the return code of a component query - it was blissfully ignoring it.

Also do a little cleanup to avoid bombarding the user with multiple error messages.

Thanks to Patrick Begou for reporting the problem
Этот коммит содержится в:
Ralph Castain 2015-09-24 07:16:48 -07:00
родитель 8bac351a9a
Коммит 0140ff048d
8 изменённых файлов: 130 добавлений и 201 удалений

Просмотреть файл

@ -4,6 +4,7 @@
* Corporation. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -38,6 +39,7 @@ int mca_base_select(const char *type_name, int output_id,
mca_base_component_t *component = NULL;
mca_base_module_t *module = NULL;
int priority = 0, best_priority = INT32_MIN;
int rc;
*best_module = NULL;
*best_component = NULL;
@ -70,7 +72,18 @@ int mca_base_select(const char *type_name, int output_id,
"mca:base:select:(%5s) Querying component [%s]",
type_name, component->mca_component_name);
component->mca_query_component(&module, &priority);
rc = component->mca_query_component(&module, &priority);
if (OPAL_ERR_FATAL == rc) {
/* a fatal error was detected by this component - e.g., the
* user specified a required element and the component could
* not find it. In this case, we must not continue as we might
* find some other component that could run, causing us to do
* something the user didn't want */
return rc;
} else if (OPAL_SUCCESS != rc) {
/* silently skip this component */
continue;
}
/*
* If no module was returned, then skip component

Просмотреть файл

@ -224,31 +224,26 @@ static int rte_init(void)
* process stats if requested
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "opal_pstat_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
ORTE_ERROR_LOG(ret);
error = "opal_pstat_base_select";
goto error;
}
/* open and setup the state machine */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_state_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_state_base_select";
goto error;
}
/* open the errmgr */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr_base_open";
goto error;
}
@ -259,26 +254,26 @@ static int rte_init(void)
* first and select that component.
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_base_select";
if (ORTE_ERR_FATAL == ret) {
/* we already output a show_help - so keep down the verbiage */
ret = ORTE_ERR_SILENT;
}
goto error;
}
/* if we were spawned by a singleton, our jobid was given to us */
if (NULL != orte_ess_base_jobid) {
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&ORTE_PROC_MY_NAME->jobid, orte_ess_base_jobid))) {
ORTE_ERROR_LOG(ret);
error = "convert_string_to_jobid";
goto error;
}
ORTE_PROC_MY_NAME->vpid = 0;
} else {
if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_set_hnp_name";
goto error;
}
@ -304,7 +299,6 @@ static int rte_init(void)
orte_process_info.tmpdir_base,
orte_process_info.nodename, NULL,
ORTE_PROC_MY_NAME))) {
ORTE_ERROR_LOG(ret);
error = "orte_session_dir define";
goto error;
}
@ -318,7 +312,6 @@ static int rte_init(void)
orte_process_info.tmpdir_base,
orte_process_info.nodename, NULL,
ORTE_PROC_MY_NAME))) {
ORTE_ERROR_LOG(ret);
error = "orte_session_dir";
goto error;
}
@ -329,12 +322,10 @@ static int rte_init(void)
* OOB Layer
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_oob_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_oob_base_select";
goto error;
}
@ -343,30 +334,25 @@ static int rte_init(void)
* Runtime Messaging Layer
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_rml_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_rml_base_select";
goto error;
}
/* Messaging QoS Layer */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_qos_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_qos_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_qos_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_qos_base_select";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr_base_select";
goto error;
}

Просмотреть файл

@ -11,6 +11,7 @@
* All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -32,43 +33,26 @@
/**
* Function for selecting one component from all those that are
* Function for selecting one component from all those that are
* available.
*/
int orte_plm_base_select(void)
{
int exit_status = ORTE_SUCCESS;
int rc;
orte_plm_base_component_t *best_component = NULL;
orte_plm_base_module_t *best_module = NULL;
/*
* Select the best component
*/
if( OPAL_SUCCESS != mca_base_select("plm", orte_plm_base_framework.framework_output,
&orte_plm_base_framework.framework_components,
(mca_base_module_t **) &best_module,
(mca_base_component_t **) &best_component) ) {
/* This will only happen if no component was selected
*
* If we didn't find one, and we are a daemon, then default to retaining the proxy.
* Otherwise, if we didn't find one to select, that is unacceptable.
*/
if (ORTE_PROC_IS_DAEMON) {
/* don't record a selected component or flag selected
* so we finalize correctly - just leave the plm alone
* as it defaults to pointing at the proxy
*/
goto cleanup;
} else {
exit_status = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
if (OPAL_SUCCESS == (rc = mca_base_select("plm", orte_plm_base_framework.framework_output,
&orte_plm_base_framework.framework_components,
(mca_base_module_t **) &best_module,
(mca_base_component_t **) &best_component))) {
/* Save the winner */
orte_plm = *best_module;
}
/* Save the winner */
orte_plm = *best_module;
cleanup:
return exit_status;
return rc;
}

Просмотреть файл

@ -10,6 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -18,6 +19,14 @@
#
# This is the US/English general help file for Open RTE's orterun.
#
[agent-not-found]
The value of the MCA parameter "plm_rsh_agent" was set to a path
that could not be found:
plm_rsh_agent: %s
Please either unset the parameter, or check that the path is correct
#
[no-local-orted]
The rsh PLS component was not able to find the executable "orted" in
your PATH or in the directory where Open MPI/OpenRTE was initially installed,

Просмотреть файл

@ -59,6 +59,8 @@ struct orte_plm_rsh_component_t {
bool no_tree_spawn;
int num_concurrent;
char *agent;
char *agent_path;
char **agent_argv;
bool assume_same_shell;
bool pass_environ_mca_params;
char *ssh_args;

Просмотреть файл

@ -43,8 +43,10 @@
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/basename.h"
#include "opal/util/path.h"
#include "orte/mca/state/state.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
@ -228,6 +230,7 @@ static int rsh_component_open(void)
/* initialize globals */
mca_plm_rsh_component.using_qrsh = false;
mca_plm_rsh_component.using_llspawn = false;
mca_plm_rsh_component.agent_argv = NULL;
/* lookup parameters */
if (mca_plm_rsh_component.num_concurrent <= 0) {
@ -256,48 +259,59 @@ static int rsh_component_query(mca_base_module_t **module, int *priority)
/* Check if we are under Grid Engine parallel environment by looking at several
* environment variables. If so, setup the path and argv[0]. */
if (!mca_plm_rsh_component.disable_qrsh &&
NULL != getenv("SGE_ROOT") && NULL != getenv("ARC") &&
NULL != getenv("PE_HOSTFILE") && NULL != getenv("JOB_ID")) {
/* setup the search path for qrsh */
asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC"));
/* see if the agent is available */
if (ORTE_SUCCESS != rsh_launch_agent_lookup("qrsh", tmp)) {
/* can't be SGE */
opal_output_verbose(1, orte_plm_base_framework.framework_output,
"%s plm:rsh: unable to be used: SGE indicated but cannot find path "
"or execution permissions not set for launching agent qrsh",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
free(tmp);
*module = NULL;
return ORTE_ERROR;
if (NULL == mca_plm_rsh_component.agent) {
if (!mca_plm_rsh_component.disable_qrsh &&
NULL != getenv("SGE_ROOT") && NULL != getenv("ARC") &&
NULL != getenv("PE_HOSTFILE") && NULL != getenv("JOB_ID")) {
/* setup the search path for qrsh */
asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC"));
/* see if the agent is available */
if (ORTE_SUCCESS != rsh_launch_agent_lookup("qrsh", tmp)) {
/* can't be SGE */
opal_output_verbose(1, orte_plm_base_framework.framework_output,
"%s plm:rsh: unable to be used: SGE indicated but cannot find path "
"or execution permissions not set for launching agent qrsh",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
free(tmp);
*module = NULL;
return ORTE_ERROR;
}
mca_plm_rsh_component.agent = tmp;
mca_plm_rsh_component.using_qrsh = true;
/* no tree spawn allowed under qrsh */
mca_plm_rsh_component.no_tree_spawn = true;
goto success;
} else if (!mca_plm_rsh_component.disable_llspawn &&
NULL != getenv("LOADL_STEP_ID")) {
/* We are running as a LOADLEVELER job.
* Search for llspawn in the users PATH */
if (ORTE_SUCCESS != rsh_launch_agent_lookup("llspawn", NULL)) {
opal_output_verbose(1, orte_plm_base_framework.framework_output,
"%s plm:rsh: unable to be used: LoadLeveler "
"indicated but cannot find path or execution "
"permissions not set for launching agent llspawn",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
*module = NULL;
return ORTE_ERROR;
}
mca_plm_rsh_component.agent = strdup("llspawn");
mca_plm_rsh_component.using_llspawn = true;
goto success;
}
free(tmp);
mca_plm_rsh_component.using_qrsh = true;
/* no tree spawn allowed under qrsh */
mca_plm_rsh_component.no_tree_spawn = true;
goto success;
} else if (!mca_plm_rsh_component.disable_llspawn &&
NULL != getenv("LOADL_STEP_ID")) {
/* We are running as a LOADLEVELER job.
Search for llspawn in the users PATH */
if (ORTE_SUCCESS != rsh_launch_agent_lookup("llspawn", NULL)) {
opal_output_verbose(1, orte_plm_base_framework.framework_output,
"%s plm:rsh: unable to be used: LoadLeveler "
"indicated but cannot find path or execution "
"permissions not set for launching agent llspawn",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
*module = NULL;
return ORTE_ERROR;
}
mca_plm_rsh_component.using_llspawn = true;
goto success;
}
/* if this isn't an Grid Engine or LoadLeveler environment,
see if MCA-specified agent (default: ssh:rsh) is available */
/* if this isn't a Grid Engine or LoadLeveler environment, or
* if the user specified a launch agent, look for it */
if (ORTE_SUCCESS != rsh_launch_agent_lookup(NULL, NULL)) {
/* if the user specified an agent and we couldn't find it,
* then we want to error out and not continue */
if (NULL != mca_plm_rsh_component.agent) {
orte_show_help("help-plm-rsh.txt", "agent-not-found", true,
mca_plm_rsh_component.agent);
ORTE_FORCED_TERMINATE(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_FATAL;
}
/* this isn't an error - we just cannot be selected */
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
"%s plm:rsh: unable to be used: cannot find path "
@ -380,19 +394,48 @@ char **orte_plm_rsh_search(const char* agent_list, const char *path)
static int rsh_launch_agent_lookup(const char *agent_list, char *path)
{
char **tmp;
char *bname;
int i;
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:rsh_lookup on agent %s path %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == agent_list) ? mca_plm_rsh_component.agent : agent_list,
(NULL == path) ? "NULL" : path));
if (NULL == (tmp = orte_plm_rsh_search(agent_list, path))) {
if (NULL == (mca_plm_rsh_component.agent_argv = orte_plm_rsh_search(agent_list, path))) {
return ORTE_ERR_NOT_FOUND;
}
/* if we got here, then one of the given agents could be found */
opal_argv_free(tmp);
/* if we got here, then one of the given agents could be found - the
* complete path is in the argv[0] position */
mca_plm_rsh_component.agent_path = strdup(mca_plm_rsh_component.agent_argv[0]);
bname = opal_basename(mca_plm_rsh_component.agent_argv[0]);
if (NULL == bname) {
return ORTE_SUCCESS;
}
/* replace the initial position with the basename */
free(mca_plm_rsh_component.agent_argv[0]);
mca_plm_rsh_component.agent_argv[0] = bname;
/* see if we need to add an xterm argument */
if (0 == strcmp(bname, "ssh")) {
/* if xterm option was given, add '-X', ensuring we don't do it twice */
if (NULL != orte_xterm) {
opal_argv_append_unique_nosize(&mca_plm_rsh_component.agent_argv, "-X", false);
} else if (0 >= opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
/* if debug was not specified, and the user didn't explicitly
* specify X11 forwarding/non-forwarding, add "-x" if it
* isn't already there (check either case)
*/
for (i = 1; NULL != mca_plm_rsh_component.agent_argv[i]; ++i) {
if (0 == strcasecmp("-x", mca_plm_rsh_component.agent_argv[i])) {
break;
}
}
if (NULL == mca_plm_rsh_component.agent_argv[i]) {
opal_argv_append_nosize(&mca_plm_rsh_component.agent_argv, "-x");
}
}
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -164,7 +164,6 @@ static const char *orte_plm_rsh_shell_name[7] = {
*/
static void set_handler_default(int sig);
static orte_plm_rsh_shell_t find_shell(char *shell);
static int launch_agent_setup(const char *agent, char *path);
static void ssh_child(int argc, char **argv) __opal_attribute_noreturn__;
static int rsh_probe(char *nodename,
orte_plm_rsh_shell_t *shell);
@ -175,8 +174,6 @@ static void launch_daemons(int fd, short args, void *cbdata);
static void process_launch_list(int fd, short args, void *cbdata);
/* local global storage */
static char *rsh_agent_path=NULL;
static char **rsh_agent_argv=NULL;
static int num_in_progress=0;
static opal_list_t launch_list;
static opal_event_t launch_event;
@ -186,51 +183,8 @@ static opal_event_t launch_event;
*/
static int rsh_init(void)
{
char *tmp;
int rc;
/* we were selected, so setup the launch agent */
if (mca_plm_rsh_component.using_qrsh) {
/* perform base setup for qrsh */
(void)asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC"));
if (ORTE_SUCCESS != (rc = launch_agent_setup("qrsh", tmp))) {
ORTE_ERROR_LOG(rc);
free(tmp);
return rc;
}
free(tmp);
/* automatically add -inherit and grid engine PE related flags */
opal_argv_append_nosize(&rsh_agent_argv, "-inherit");
/* Don't use the "-noshell" flag as qrsh would have a problem
* swallowing a long command */
opal_argv_append_nosize(&rsh_agent_argv, "-nostdin");
opal_argv_append_nosize(&rsh_agent_argv, "-V");
if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
opal_argv_append_nosize(&rsh_agent_argv, "-verbose");
tmp = opal_argv_join(rsh_agent_argv, ' ');
opal_output_verbose(1, orte_plm_base_framework.framework_output,
"%s plm:rsh: using \"%s\" for launching\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
free(tmp);
}
} else if(mca_plm_rsh_component.using_llspawn) {
/* perform base setup for llspawn */
if (ORTE_SUCCESS != (rc = launch_agent_setup("llspawn", NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
opal_output_verbose(1, orte_plm_base_framework.framework_output,
"%s plm:rsh: using \"%s\" for launching\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
rsh_agent_path);
} else {
/* not using qrsh or llspawn - use MCA-specified agent */
if (ORTE_SUCCESS != (rc = launch_agent_setup(mca_plm_rsh_component.agent, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* point to our launch command */
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS,
launch_daemons, ORTE_SYS_PRI))) {
@ -371,8 +325,8 @@ static int setup_launch(int *argcptr, char ***argvptr,
/*
* Build argv array
*/
argv = opal_argv_copy(rsh_agent_argv);
argc = opal_argv_count(rsh_agent_argv);
argv = opal_argv_copy(mca_plm_rsh_component.agent_argv);
argc = opal_argv_count(mca_plm_rsh_component.agent_argv);
/* if any ssh args were provided, now is the time to add them */
if (NULL != mca_plm_rsh_component.ssh_args) {
char **ssh_argv;
@ -722,7 +676,7 @@ static void ssh_child(int argc, char **argv)
* about remote launches here
*/
exec_argv = argv;
exec_path = strdup(rsh_agent_path);
exec_path = mca_plm_rsh_component.agent_path;
/* Don't let ssh slurp all of our stdin! */
fdin = open("/dev/null", O_RDWR);
@ -1377,67 +1331,6 @@ static orte_plm_rsh_shell_t find_shell(char *shell)
return ORTE_PLM_RSH_SHELL_UNKNOWN;
}
/**
 * Locate the launch agent and record its argv and path in the file-level
 * rsh_agent_argv / rsh_agent_path variables.
 *
 * @param agent  Agent name(s) to search for; when NULL, the value of
 *               mca_plm_rsh_component.agent is what gets reported in the
 *               verbose output (the search itself is delegated to
 *               orte_plm_rsh_search() with the NULL passed through).
 * @param path   Search path handed to orte_plm_rsh_search() and
 *               opal_path_findv(); may be NULL.
 *
 * @return ORTE_SUCCESS when an agent was found; ORTE_ERR_NOT_FOUND when no
 *         agent name is available, the search yields an empty argv, or the
 *         first argv entry cannot be located by opal_path_findv().
 */
static int launch_agent_setup(const char *agent, char *path)
{
char *bname;
int i;
/* if no agent was provided (neither by the caller nor via the
 * component's agent setting), then report not found */
if (NULL == mca_plm_rsh_component.agent && NULL == agent) {
return ORTE_ERR_NOT_FOUND;
}
/* search for the argv */
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:rsh_setup on agent %s path %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == agent) ? mca_plm_rsh_component.agent : agent,
(NULL == path) ? "NULL" : path));
rsh_agent_argv = orte_plm_rsh_search(agent, path);
if (0 == opal_argv_count(rsh_agent_argv)) {
/* nothing was found */
return ORTE_ERR_NOT_FOUND;
}
/* see if we can find the agent (argv[0]) as an executable (X_OK) in the path */
rsh_agent_path = opal_path_findv(rsh_agent_argv[0], X_OK, environ, path);
if (NULL == rsh_agent_path) {
/* not an error - just report not found.
 * NOTE(review): rsh_agent_argv is not reset to NULL after this call,
 * so the file-level variable keeps its old value - confirm no later
 * user dereferences it */
opal_argv_free(rsh_agent_argv);
return ORTE_ERR_NOT_FOUND;
}
/* the remaining work only decides whether ssh-specific X11 flags are
 * needed; without a basename there is nothing to compare, so we are done */
bname = opal_basename(rsh_agent_argv[0]);
if (NULL == bname) {
return ORTE_SUCCESS;
}
if (0 == strcmp(bname, "ssh")) {
/* if xterm option was given, add '-X', ensuring we don't do it twice */
if (NULL != orte_xterm) {
opal_argv_append_unique_nosize(&rsh_agent_argv, "-X", false);
} else if (0 >= opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
/* if debug was not specified, and the user didn't explicitly
* specify X11 forwarding/non-forwarding, add "-x" if it
* isn't already there (check either case)
*/
/* start at 1 to skip argv[0] (the agent itself); the loop stops
 * early on a case-insensitive match, leaving i on that entry */
for (i = 1; NULL != rsh_agent_argv[i]; ++i) {
if (0 == strcasecmp("-x", rsh_agent_argv[i])) {
break;
}
}
/* i reached the terminating NULL entry, so no "-x"/"-X" was present */
if (NULL == rsh_agent_argv[i]) {
opal_argv_append_nosize(&rsh_agent_argv, "-x");
}
}
}
/* bname was only needed for the comparison above */
free(bname);
/* the caller can append any additional argv's they desire */
return ORTE_SUCCESS;
}
/**
* Check the Shell variable and system type on the specified node
*/
@ -1479,8 +1372,8 @@ static int rsh_probe(char *nodename,
exit(01);
}
/* Build argv array */
argv = opal_argv_copy(rsh_agent_argv);
argc = opal_argv_count(rsh_agent_argv);
argv = opal_argv_copy(mca_plm_rsh_component.agent_argv);
argc = opal_argv_count(mca_plm_rsh_component.agent_argv);
opal_argv_append(&argc, &argv, nodename);
opal_argv_append(&argc, &argv, "echo $SHELL");

Просмотреть файл

@ -556,11 +556,10 @@ orte_session_dir_finalize(orte_process_name_t *proc)
NULL == orte_process_info.top_session_dir) {
/* this should never happen - it means we are calling
* cleanup *before* properly setting up the session
* dir system. This leaves open the possibility of
* dir system. Protect against the possibility of
* accidentally removing directories we shouldn't
* touch
* touch by returning
*/
ORTE_ERROR_LOG(ORTE_ERR_NOT_INITIALIZED);
return ORTE_ERR_NOT_INITIALIZED;
}