Now that we have an "isolated" PLM component, we cannot just let rsh silently decline to run when it cannot find a launch agent - if we do, then we will -always- run on the local node. So if the user specifies a launch agent and we can't find it, generate a clear error message, report a fatal error back to the component selection logic, and exit.
This required modifying the mca_base_select function to actually check the return code of a component query - it was blissfully ignoring it. Also do a little cleanup to avoid bombarding the user with multiple error messages. Thanks to Patrick Begou for reporting the problem.
Этот коммит содержится в:
родитель
8bac351a9a
Коммит
0140ff048d
@ -4,6 +4,7 @@
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -38,6 +39,7 @@ int mca_base_select(const char *type_name, int output_id,
|
||||
mca_base_component_t *component = NULL;
|
||||
mca_base_module_t *module = NULL;
|
||||
int priority = 0, best_priority = INT32_MIN;
|
||||
int rc;
|
||||
|
||||
*best_module = NULL;
|
||||
*best_component = NULL;
|
||||
@ -70,7 +72,18 @@ int mca_base_select(const char *type_name, int output_id,
|
||||
"mca:base:select:(%5s) Querying component [%s]",
|
||||
type_name, component->mca_component_name);
|
||||
|
||||
component->mca_query_component(&module, &priority);
|
||||
rc = component->mca_query_component(&module, &priority);
|
||||
if (OPAL_ERR_FATAL == rc) {
|
||||
/* a fatal error was detected by this component - e.g., the
|
||||
* user specified a required element and the component could
|
||||
* not find it. In this case, we must not continue as we might
|
||||
* find some other component that could run, causing us to do
|
||||
* something the user didn't want */
|
||||
return rc;
|
||||
} else if (OPAL_SUCCESS != rc) {
|
||||
/* silently skip this component */
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* If no module was returned, then skip component
|
||||
|
@ -224,31 +224,26 @@ static int rte_init(void)
|
||||
* process stats if requested
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "opal_pstat_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "opal_pstat_base_select";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* open and setup the state machine */
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_state_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_state_base_select";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* open the errmgr */
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_errmgr_base_open";
|
||||
goto error;
|
||||
}
|
||||
@ -259,26 +254,26 @@ static int rte_init(void)
|
||||
* first and select that component.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_plm_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_plm_base_select";
|
||||
if (ORTE_ERR_FATAL == ret) {
|
||||
/* we already output a show_help - so keep down the verbiage */
|
||||
ret = ORTE_ERR_SILENT;
|
||||
}
|
||||
goto error;
|
||||
}
|
||||
/* if we were spawned by a singleton, our jobid was given to us */
|
||||
if (NULL != orte_ess_base_jobid) {
|
||||
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&ORTE_PROC_MY_NAME->jobid, orte_ess_base_jobid))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "convert_string_to_jobid";
|
||||
goto error;
|
||||
}
|
||||
ORTE_PROC_MY_NAME->vpid = 0;
|
||||
} else {
|
||||
if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_plm_set_hnp_name";
|
||||
goto error;
|
||||
}
|
||||
@ -304,7 +299,6 @@ static int rte_init(void)
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.nodename, NULL,
|
||||
ORTE_PROC_MY_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_session_dir define";
|
||||
goto error;
|
||||
}
|
||||
@ -318,7 +312,6 @@ static int rte_init(void)
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.nodename, NULL,
|
||||
ORTE_PROC_MY_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_session_dir";
|
||||
goto error;
|
||||
}
|
||||
@ -329,12 +322,10 @@ static int rte_init(void)
|
||||
* OOB Layer
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_oob_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_oob_base_select";
|
||||
goto error;
|
||||
}
|
||||
@ -343,30 +334,25 @@ static int rte_init(void)
|
||||
* Runtime Messaging Layer
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_rml_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_rml_base_select";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Messaging QoS Layer */
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_qos_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_qos_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_qos_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_qos_base_select";
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_errmgr_base_select";
|
||||
goto error;
|
||||
}
|
||||
|
@ -11,6 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -32,43 +33,26 @@
|
||||
|
||||
|
||||
/**
|
||||
* Function for selecting one component from all those that are
|
||||
* Function for selecting one component from all those that are
|
||||
* available.
|
||||
*/
|
||||
|
||||
int orte_plm_base_select(void)
|
||||
{
|
||||
int exit_status = ORTE_SUCCESS;
|
||||
int rc;
|
||||
orte_plm_base_component_t *best_component = NULL;
|
||||
orte_plm_base_module_t *best_module = NULL;
|
||||
|
||||
/*
|
||||
* Select the best component
|
||||
*/
|
||||
if( OPAL_SUCCESS != mca_base_select("plm", orte_plm_base_framework.framework_output,
|
||||
&orte_plm_base_framework.framework_components,
|
||||
(mca_base_module_t **) &best_module,
|
||||
(mca_base_component_t **) &best_component) ) {
|
||||
/* This will only happen if no component was selected
|
||||
*
|
||||
* If we didn't find one, and we are a daemon, then default to retaining the proxy.
|
||||
* Otherwise, if we didn't find one to select, that is unacceptable.
|
||||
*/
|
||||
if (ORTE_PROC_IS_DAEMON) {
|
||||
/* don't record a selected component or flag selected
|
||||
* so we finalize correctly - just leave the plm alone
|
||||
* as it defaults to pointing at the proxy
|
||||
*/
|
||||
goto cleanup;
|
||||
} else {
|
||||
exit_status = ORTE_ERR_NOT_FOUND;
|
||||
goto cleanup;
|
||||
}
|
||||
if (OPAL_SUCCESS == (rc = mca_base_select("plm", orte_plm_base_framework.framework_output,
|
||||
&orte_plm_base_framework.framework_components,
|
||||
(mca_base_module_t **) &best_module,
|
||||
(mca_base_component_t **) &best_component))) {
|
||||
/* Save the winner */
|
||||
orte_plm = *best_module;
|
||||
}
|
||||
|
||||
/* Save the winner */
|
||||
orte_plm = *best_module;
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
return rc;
|
||||
}
|
||||
|
@ -10,6 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -18,6 +19,14 @@
|
||||
#
|
||||
# This is the US/English general help file for Open RTE's orterun.
|
||||
#
|
||||
[agent-not-found]
|
||||
The value of the MCA parameter "plm_rsh_agent" was set to a path
|
||||
that could not be found:
|
||||
|
||||
plm_rsh_agent: %s
|
||||
|
||||
Please either unset the parameter, or check that the path is correct
|
||||
#
|
||||
[no-local-orted]
|
||||
The rsh PLS component was not able to find the executable "orted" in
|
||||
your PATH or in the directory where Open MPI/OpenRTE was initially installed,
|
||||
|
@ -59,6 +59,8 @@ struct orte_plm_rsh_component_t {
|
||||
bool no_tree_spawn;
|
||||
int num_concurrent;
|
||||
char *agent;
|
||||
char *agent_path;
|
||||
char **agent_argv;
|
||||
bool assume_same_shell;
|
||||
bool pass_environ_mca_params;
|
||||
char *ssh_args;
|
||||
|
@ -43,8 +43,10 @@
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/path.h"
|
||||
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/show_help.h"
|
||||
@ -228,6 +230,7 @@ static int rsh_component_open(void)
|
||||
/* initialize globals */
|
||||
mca_plm_rsh_component.using_qrsh = false;
|
||||
mca_plm_rsh_component.using_llspawn = false;
|
||||
mca_plm_rsh_component.agent_argv = NULL;
|
||||
|
||||
/* lookup parameters */
|
||||
if (mca_plm_rsh_component.num_concurrent <= 0) {
|
||||
@ -256,48 +259,59 @@ static int rsh_component_query(mca_base_module_t **module, int *priority)
|
||||
|
||||
/* Check if we are under Grid Engine parallel environment by looking at several
|
||||
* environment variables. If so, setup the path and argv[0]. */
|
||||
if (!mca_plm_rsh_component.disable_qrsh &&
|
||||
NULL != getenv("SGE_ROOT") && NULL != getenv("ARC") &&
|
||||
NULL != getenv("PE_HOSTFILE") && NULL != getenv("JOB_ID")) {
|
||||
/* setup the search path for qrsh */
|
||||
asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC"));
|
||||
/* see if the agent is available */
|
||||
if (ORTE_SUCCESS != rsh_launch_agent_lookup("qrsh", tmp)) {
|
||||
/* can't be SGE */
|
||||
opal_output_verbose(1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:rsh: unable to be used: SGE indicated but cannot find path "
|
||||
"or execution permissions not set for launching agent qrsh",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
free(tmp);
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
if (NULL == mca_plm_rsh_component.agent) {
|
||||
if (!mca_plm_rsh_component.disable_qrsh &&
|
||||
NULL != getenv("SGE_ROOT") && NULL != getenv("ARC") &&
|
||||
NULL != getenv("PE_HOSTFILE") && NULL != getenv("JOB_ID")) {
|
||||
/* setup the search path for qrsh */
|
||||
asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC"));
|
||||
/* see if the agent is available */
|
||||
if (ORTE_SUCCESS != rsh_launch_agent_lookup("qrsh", tmp)) {
|
||||
/* can't be SGE */
|
||||
opal_output_verbose(1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:rsh: unable to be used: SGE indicated but cannot find path "
|
||||
"or execution permissions not set for launching agent qrsh",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
free(tmp);
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
mca_plm_rsh_component.agent = tmp;
|
||||
mca_plm_rsh_component.using_qrsh = true;
|
||||
/* no tree spawn allowed under qrsh */
|
||||
mca_plm_rsh_component.no_tree_spawn = true;
|
||||
goto success;
|
||||
} else if (!mca_plm_rsh_component.disable_llspawn &&
|
||||
NULL != getenv("LOADL_STEP_ID")) {
|
||||
/* We are running as a LOADLEVELER job.
|
||||
* Search for llspawn in the users PATH */
|
||||
if (ORTE_SUCCESS != rsh_launch_agent_lookup("llspawn", NULL)) {
|
||||
opal_output_verbose(1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:rsh: unable to be used: LoadLeveler "
|
||||
"indicated but cannot find path or execution "
|
||||
"permissions not set for launching agent llspawn",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
mca_plm_rsh_component.agent = strdup("llspawn");
|
||||
mca_plm_rsh_component.using_llspawn = true;
|
||||
goto success;
|
||||
}
|
||||
free(tmp);
|
||||
mca_plm_rsh_component.using_qrsh = true;
|
||||
/* no tree spawn allowed under qrsh */
|
||||
mca_plm_rsh_component.no_tree_spawn = true;
|
||||
goto success;
|
||||
} else if (!mca_plm_rsh_component.disable_llspawn &&
|
||||
NULL != getenv("LOADL_STEP_ID")) {
|
||||
/* We are running as a LOADLEVELER job.
|
||||
Search for llspawn in the users PATH */
|
||||
if (ORTE_SUCCESS != rsh_launch_agent_lookup("llspawn", NULL)) {
|
||||
opal_output_verbose(1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:rsh: unable to be used: LoadLeveler "
|
||||
"indicated but cannot find path or execution "
|
||||
"permissions not set for launching agent llspawn",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
mca_plm_rsh_component.using_llspawn = true;
|
||||
goto success;
|
||||
}
|
||||
|
||||
/* if this isn't an Grid Engine or LoadLeveler environment,
|
||||
see if MCA-specified agent (default: ssh:rsh) is available */
|
||||
/* if this isn't a Grid Engine or LoadLeveler environment, or
|
||||
* if the user specified a launch agent, look for it */
|
||||
|
||||
if (ORTE_SUCCESS != rsh_launch_agent_lookup(NULL, NULL)) {
|
||||
/* if the user specified an agent and we couldn't find it,
|
||||
* then we want to error out and not continue */
|
||||
if (NULL != mca_plm_rsh_component.agent) {
|
||||
orte_show_help("help-plm-rsh.txt", "agent-not-found", true,
|
||||
mca_plm_rsh_component.agent);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
/* this isn't an error - we just cannot be selected */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:rsh: unable to be used: cannot find path "
|
||||
@ -380,19 +394,48 @@ char **orte_plm_rsh_search(const char* agent_list, const char *path)
|
||||
|
||||
static int rsh_launch_agent_lookup(const char *agent_list, char *path)
|
||||
{
|
||||
char **tmp;
|
||||
char *bname;
|
||||
int i;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s plm:rsh_lookup on agent %s path %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == agent_list) ? mca_plm_rsh_component.agent : agent_list,
|
||||
(NULL == path) ? "NULL" : path));
|
||||
if (NULL == (tmp = orte_plm_rsh_search(agent_list, path))) {
|
||||
if (NULL == (mca_plm_rsh_component.agent_argv = orte_plm_rsh_search(agent_list, path))) {
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* if we got here, then one of the given agents could be found */
|
||||
opal_argv_free(tmp);
|
||||
/* if we got here, then one of the given agents could be found - the
|
||||
* complete path is in the argv[0] position */
|
||||
mca_plm_rsh_component.agent_path = strdup(mca_plm_rsh_component.agent_argv[0]);
|
||||
bname = opal_basename(mca_plm_rsh_component.agent_argv[0]);
|
||||
if (NULL == bname) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* replace the initial position with the basename */
|
||||
free(mca_plm_rsh_component.agent_argv[0]);
|
||||
mca_plm_rsh_component.agent_argv[0] = bname;
|
||||
/* see if we need to add an xterm argument */
|
||||
if (0 == strcmp(bname, "ssh")) {
|
||||
/* if xterm option was given, add '-X', ensuring we don't do it twice */
|
||||
if (NULL != orte_xterm) {
|
||||
opal_argv_append_unique_nosize(&mca_plm_rsh_component.agent_argv, "-X", false);
|
||||
} else if (0 >= opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
|
||||
/* if debug was not specified, and the user didn't explicitly
|
||||
* specify X11 forwarding/non-forwarding, add "-x" if it
|
||||
* isn't already there (check either case)
|
||||
*/
|
||||
for (i = 1; NULL != mca_plm_rsh_component.agent_argv[i]; ++i) {
|
||||
if (0 == strcasecmp("-x", mca_plm_rsh_component.agent_argv[i])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == mca_plm_rsh_component.agent_argv[i]) {
|
||||
opal_argv_append_nosize(&mca_plm_rsh_component.agent_argv, "-x");
|
||||
}
|
||||
}
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -164,7 +164,6 @@ static const char *orte_plm_rsh_shell_name[7] = {
|
||||
*/
|
||||
static void set_handler_default(int sig);
|
||||
static orte_plm_rsh_shell_t find_shell(char *shell);
|
||||
static int launch_agent_setup(const char *agent, char *path);
|
||||
static void ssh_child(int argc, char **argv) __opal_attribute_noreturn__;
|
||||
static int rsh_probe(char *nodename,
|
||||
orte_plm_rsh_shell_t *shell);
|
||||
@ -175,8 +174,6 @@ static void launch_daemons(int fd, short args, void *cbdata);
|
||||
static void process_launch_list(int fd, short args, void *cbdata);
|
||||
|
||||
/* local global storage */
|
||||
static char *rsh_agent_path=NULL;
|
||||
static char **rsh_agent_argv=NULL;
|
||||
static int num_in_progress=0;
|
||||
static opal_list_t launch_list;
|
||||
static opal_event_t launch_event;
|
||||
@ -186,51 +183,8 @@ static opal_event_t launch_event;
|
||||
*/
|
||||
static int rsh_init(void)
|
||||
{
|
||||
char *tmp;
|
||||
int rc;
|
||||
|
||||
/* we were selected, so setup the launch agent */
|
||||
if (mca_plm_rsh_component.using_qrsh) {
|
||||
/* perform base setup for qrsh */
|
||||
(void)asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC"));
|
||||
if (ORTE_SUCCESS != (rc = launch_agent_setup("qrsh", tmp))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
free(tmp);
|
||||
/* automatically add -inherit and grid engine PE related flags */
|
||||
opal_argv_append_nosize(&rsh_agent_argv, "-inherit");
|
||||
/* Don't use the "-noshell" flag as qrsh would have a problem
|
||||
* swallowing a long command */
|
||||
opal_argv_append_nosize(&rsh_agent_argv, "-nostdin");
|
||||
opal_argv_append_nosize(&rsh_agent_argv, "-V");
|
||||
if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
|
||||
opal_argv_append_nosize(&rsh_agent_argv, "-verbose");
|
||||
tmp = opal_argv_join(rsh_agent_argv, ' ');
|
||||
opal_output_verbose(1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:rsh: using \"%s\" for launching\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
|
||||
free(tmp);
|
||||
}
|
||||
} else if(mca_plm_rsh_component.using_llspawn) {
|
||||
/* perform base setup for llspawn */
|
||||
if (ORTE_SUCCESS != (rc = launch_agent_setup("llspawn", NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
opal_output_verbose(1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:rsh: using \"%s\" for launching\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
rsh_agent_path);
|
||||
} else {
|
||||
/* not using qrsh or llspawn - use MCA-specified agent */
|
||||
if (ORTE_SUCCESS != (rc = launch_agent_setup(mca_plm_rsh_component.agent, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
/* point to our launch command */
|
||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS,
|
||||
launch_daemons, ORTE_SYS_PRI))) {
|
||||
@ -371,8 +325,8 @@ static int setup_launch(int *argcptr, char ***argvptr,
|
||||
/*
|
||||
* Build argv array
|
||||
*/
|
||||
argv = opal_argv_copy(rsh_agent_argv);
|
||||
argc = opal_argv_count(rsh_agent_argv);
|
||||
argv = opal_argv_copy(mca_plm_rsh_component.agent_argv);
|
||||
argc = opal_argv_count(mca_plm_rsh_component.agent_argv);
|
||||
/* if any ssh args were provided, now is the time to add them */
|
||||
if (NULL != mca_plm_rsh_component.ssh_args) {
|
||||
char **ssh_argv;
|
||||
@ -722,7 +676,7 @@ static void ssh_child(int argc, char **argv)
|
||||
* about remote launches here
|
||||
*/
|
||||
exec_argv = argv;
|
||||
exec_path = strdup(rsh_agent_path);
|
||||
exec_path = mca_plm_rsh_component.agent_path;
|
||||
|
||||
/* Don't let ssh slurp all of our stdin! */
|
||||
fdin = open("/dev/null", O_RDWR);
|
||||
@ -1377,67 +1331,6 @@ static orte_plm_rsh_shell_t find_shell(char *shell)
|
||||
return ORTE_PLM_RSH_SHELL_UNKNOWN;
|
||||
}
|
||||
|
||||
static int launch_agent_setup(const char *agent, char *path)
|
||||
{
|
||||
char *bname;
|
||||
int i;
|
||||
|
||||
/* if no agent was provided, then report not found */
|
||||
if (NULL == mca_plm_rsh_component.agent && NULL == agent) {
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* search for the argv */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s plm:rsh_setup on agent %s path %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == agent) ? mca_plm_rsh_component.agent : agent,
|
||||
(NULL == path) ? "NULL" : path));
|
||||
rsh_agent_argv = orte_plm_rsh_search(agent, path);
|
||||
|
||||
if (0 == opal_argv_count(rsh_agent_argv)) {
|
||||
/* nothing was found */
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* see if we can find the agent in the path */
|
||||
rsh_agent_path = opal_path_findv(rsh_agent_argv[0], X_OK, environ, path);
|
||||
|
||||
if (NULL == rsh_agent_path) {
|
||||
/* not an error - just report not found */
|
||||
opal_argv_free(rsh_agent_argv);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
bname = opal_basename(rsh_agent_argv[0]);
|
||||
if (NULL == bname) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
if (0 == strcmp(bname, "ssh")) {
|
||||
/* if xterm option was given, add '-X', ensuring we don't do it twice */
|
||||
if (NULL != orte_xterm) {
|
||||
opal_argv_append_unique_nosize(&rsh_agent_argv, "-X", false);
|
||||
} else if (0 >= opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
|
||||
/* if debug was not specified, and the user didn't explicitly
|
||||
* specify X11 forwarding/non-forwarding, add "-x" if it
|
||||
* isn't already there (check either case)
|
||||
*/
|
||||
for (i = 1; NULL != rsh_agent_argv[i]; ++i) {
|
||||
if (0 == strcasecmp("-x", rsh_agent_argv[i])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == rsh_agent_argv[i]) {
|
||||
opal_argv_append_nosize(&rsh_agent_argv, "-x");
|
||||
}
|
||||
}
|
||||
}
|
||||
free(bname);
|
||||
|
||||
/* the caller can append any additional argv's they desire */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check the Shell variable and system type on the specified node
|
||||
*/
|
||||
@ -1479,8 +1372,8 @@ static int rsh_probe(char *nodename,
|
||||
exit(01);
|
||||
}
|
||||
/* Build argv array */
|
||||
argv = opal_argv_copy(rsh_agent_argv);
|
||||
argc = opal_argv_count(rsh_agent_argv);
|
||||
argv = opal_argv_copy(mca_plm_rsh_component.agent_argv);
|
||||
argc = opal_argv_count(mca_plm_rsh_component.agent_argv);
|
||||
opal_argv_append(&argc, &argv, nodename);
|
||||
opal_argv_append(&argc, &argv, "echo $SHELL");
|
||||
|
||||
|
@ -556,11 +556,10 @@ orte_session_dir_finalize(orte_process_name_t *proc)
|
||||
NULL == orte_process_info.top_session_dir) {
|
||||
/* this should never happen - it means we are calling
|
||||
* cleanup *before* properly setting up the session
|
||||
* dir system. This leaves open the possibility of
|
||||
* dir system. Protect against the possibility of
|
||||
* accidentally removing directories we shouldn't
|
||||
* touch
|
||||
* touch by returning
|
||||
*/
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_INITIALIZED);
|
||||
return ORTE_ERR_NOT_INITIALIZED;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user