From c1396b278c007df4ed39d98e4bd14ae7d543e05b Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 22 Mar 2011 02:23:09 +0000 Subject: [PATCH] Resolve the rsh confusion by splitting the initial search for a launch agent from the actual setup of the launch agent values in the plm base globals. Have each aspiring rsh-clone call the lookup function to see if its desired launch agent is available - if not, then reject that plm component. If so, then set up the actual launch agent values only when the module init function is called. This resolves the current conflict between the rsh and rshd components. Hopefully, it may avoid future problems in this area, provided that any new uses of rsh-like launchers abide by the lookup-and-then-setup rule. This commit was SVN r24550. --- orte/mca/plm/base/plm_base_rsh_support.c | 53 +++++++++++++++++++----- orte/mca/plm/base/plm_private.h | 3 +- orte/mca/plm/rsh/plm_rsh_component.c | 41 +++++------------- orte/mca/plm/rsh/plm_rsh_module.c | 33 +++++++++++++++ orte/mca/plm/rshd/plm_rshd_component.c | 10 ++--- orte/mca/plm/rshd/plm_rshd_module.c | 6 +++ 6 files changed, 99 insertions(+), 47 deletions(-) diff --git a/orte/mca/plm/base/plm_base_rsh_support.c b/orte/mca/plm/base/plm_base_rsh_support.c index f6f80fad77..8f316eadf6 100644 --- a/orte/mca/plm/base/plm_base_rsh_support.c +++ b/orte/mca/plm/base/plm_base_rsh_support.c @@ -63,20 +63,43 @@ #include "orte/mca/plm/base/plm_private.h" #ifndef __WINDOWS__ -static char **search(const char* agent_list); +static char **search(const char* agent_list, const char *path); -int orte_plm_base_rsh_launch_agent_setup(void) +int orte_plm_base_rsh_launch_agent_lookup(const char *agent_list, char *path) +{ + char **tmp; + + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:rsh_lookup on agent %s path %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == agent_list) ? orte_rsh_agent : agent_list, + (NULL == path) ? 
"NULL" : path)); + if (NULL == (tmp = search(agent_list, path))) { + return ORTE_ERR_NOT_FOUND; + } + + /* if we got here, then one of the given agents could be found */ + opal_argv_free(tmp); + return ORTE_SUCCESS; +} + +int orte_plm_base_rsh_launch_agent_setup(const char *agent, char *path) { char *bname; int i; /* if no agent was provided, then report not found */ - if (NULL == orte_rsh_agent) { + if (NULL == orte_rsh_agent && NULL == agent) { return ORTE_ERR_NOT_FOUND; } - /* Take the orte_rsh_agent MCA param and search for the argv */ - orte_plm_globals.rsh_agent_argv = search(orte_rsh_agent); + /* search for the argv */ + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:rsh_setup on agent %s path %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == agent) ? orte_rsh_agent : agent, + (NULL == path) ? "NULL" : path)); + orte_plm_globals.rsh_agent_argv = search(agent, path); if (0 == opal_argv_count(orte_plm_globals.rsh_agent_argv)) { /* nothing was found */ @@ -85,8 +108,8 @@ int orte_plm_base_rsh_launch_agent_setup(void) /* see if we can find the agent in the path */ orte_plm_globals.rsh_agent_path = - opal_path_findv(orte_plm_globals.rsh_agent_argv[0], X_OK, - environ, NULL); + opal_path_findv(orte_plm_globals.rsh_agent_argv[0], X_OK, + environ, path); if (NULL == orte_plm_globals.rsh_agent_path) { /* not an error - just report not found */ @@ -116,6 +139,7 @@ int orte_plm_base_rsh_launch_agent_setup(void) } } + /* the caller can append any additional argv's they desire */ return ORTE_SUCCESS; } @@ -288,14 +312,23 @@ int orte_plm_base_local_slave_launch(orte_job_t *jdata) * we are able to find in the PATH. Split that one into argv and * return it. If nothing found, then return NULL. 
*/ -static char **search(const char* agent_list) +static char **search(const char* agent_list, const char *path) { int i, j; - char *line, **lines = opal_argv_split(agent_list, ':'); + char *line, **lines; char **tokens, *tmp; char cwd[OPAL_PATH_MAX]; - getcwd(cwd, OPAL_PATH_MAX); + if (NULL == path) { + getcwd(cwd, OPAL_PATH_MAX); + } else { + strncpy(cwd, path, OPAL_PATH_MAX); + } + if (NULL == agent_list) { + lines = opal_argv_split(orte_rsh_agent, ':'); + } else { + lines = opal_argv_split(agent_list, ':'); + } for (i = 0; NULL != lines[i]; ++i) { line = lines[i]; diff --git a/orte/mca/plm/base/plm_private.h b/orte/mca/plm/base/plm_private.h index 3a5e20c796..32766b590d 100644 --- a/orte/mca/plm/base/plm_private.h +++ b/orte/mca/plm/base/plm_private.h @@ -121,7 +121,8 @@ ORTE_DECLSPEC int orte_plm_base_setup_orted_cmd(int *argc, char ***argv); * Local slave launch */ ORTE_DECLSPEC int orte_plm_base_local_slave_launch(orte_job_t *jdata); -ORTE_DECLSPEC int orte_plm_base_rsh_launch_agent_setup(void); +ORTE_DECLSPEC int orte_plm_base_rsh_launch_agent_setup(const char *agent_list, char *path); +ORTE_DECLSPEC int orte_plm_base_rsh_launch_agent_lookup(const char *agent_list, char *path); ORTE_DECLSPEC void orte_plm_base_local_slave_finalize(void); ORTE_DECLSPEC int orte_plm_base_setup_rsh_launch(char *nodename, orte_app_context_t *app, char *rcmd, char ***argv, char **exec_path); diff --git a/orte/mca/plm/rsh/plm_rsh_component.c b/orte/mca/plm/rsh/plm_rsh_component.c index f6ae8061ec..e99706b944 100644 --- a/orte/mca/plm/rsh/plm_rsh_component.c +++ b/orte/mca/plm/rsh/plm_rsh_component.c @@ -155,46 +155,27 @@ int orte_plm_rsh_component_query(mca_base_module_t **module, int *priority) if (!mca_plm_rsh_component.disable_qrsh && NULL != getenv("SGE_ROOT") && NULL != getenv("ARC") && NULL != getenv("PE_HOSTFILE") && NULL != getenv("JOB_ID")) { - /* setting rsh_agent_path and rsh_agent_argv[0] for qrsh */ - asprintf(&orte_plm_globals.rsh_agent_path, "%s/bin/%s/qrsh", - 
getenv("SGE_ROOT"), getenv("ARC")); - orte_plm_globals.rsh_agent_argv = NULL; - opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, - orte_plm_globals.rsh_agent_path); - /* double check that we have access and permissions for the qrsh agent */ - if (NULL == opal_path_findv(orte_plm_globals.rsh_agent_argv[0], X_OK, - environ, NULL)) { - opal_output_verbose(1, orte_plm_globals.output, - "%s plm:rsh: unable to be used: cannot find path " - "or execution permissions not set for launching agent \"%s\"\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - orte_plm_globals.rsh_agent_argv[0]); + /* setup the search path for qrsh */ + asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC")); + /* see if the agent is available */ + if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup("qrsh", tmp)) { + /* can't be SGE */ + opal_output_verbose(1, orte_plm_globals.output, + "%s plm:rsh: unable to be used: SGE indicated but cannot find path " + "or execution permissions not set for launching agent qrsh", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); *module = NULL; return ORTE_ERROR; } - /* automatically add -inherit and grid engine PE related flags */ - opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-inherit"); - /* Don't use the "-noshell" flag as qrsh would have a problem - * swallowing a long command */ - opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-nostdin"); - opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-V"); - if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { - opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-verbose"); - tmp = opal_argv_join(orte_plm_globals.rsh_agent_argv, ' '); - opal_output_verbose(1, orte_plm_globals.output, - "%s plm:rsh: using \"%s\" for launching\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); - free(tmp); - } mca_plm_rsh_component.using_qrsh = true; *priority = mca_plm_rsh_component.priority; *module = (mca_base_module_t *) &orte_plm_rsh_module; return ORTE_SUCCESS; } - /* if this 
isn't an Grid Engine environment, see if rsh/ssh is available */ + /* if this isn't an Grid Engine environment, see if MCA-specified agent (default: ssh:rsh) is available */ - if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_setup()) { + if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup(NULL, NULL)) { /* this isn't an error - we just cannot be selected */ OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: unable to be used: cannot find path " diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 487e6a270d..57f6345dd1 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -150,8 +150,41 @@ static orte_jobid_t local_slaves; */ int orte_plm_rsh_init(void) { + char *tmp; int rc; + /* we were selected, so setup the launch agent */ + if (mca_plm_rsh_component.using_qrsh) { + /* perform base setup for qrsh */ + asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC")); + if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup("qrsh", tmp))) { + ORTE_ERROR_LOG(rc); + free(tmp); + return rc; + } + free(tmp); + /* automatically add -inherit and grid engine PE related flags */ + opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-inherit"); + /* Don't use the "-noshell" flag as qrsh would have a problem + * swallowing a long command */ + opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-nostdin"); + opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-V"); + if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { + opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-verbose"); + tmp = opal_argv_join(orte_plm_globals.rsh_agent_argv, ' '); + opal_output_verbose(1, orte_plm_globals.output, + "%s plm:rsh: using \"%s\" for launching\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); + free(tmp); + } + } else { + /* not using qrsh - use MCA-specified agent */ + if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup(NULL, NULL))) { + 
ORTE_ERROR_LOG(rc); + return rc; + } + } + if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/plm/rshd/plm_rshd_component.c b/orte/mca/plm/rshd/plm_rshd_component.c index cf302f64cc..407dd7c975 100644 --- a/orte/mca/plm/rshd/plm_rshd_component.c +++ b/orte/mca/plm/rshd/plm_rshd_component.c @@ -123,18 +123,16 @@ int orte_plm_rshd_component_open(void) int orte_plm_rshd_component_query(mca_base_module_t **module, int *priority) { - if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_setup()) { + if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup(NULL, NULL)) { /* this isn't an error - we just cannot be selected */ OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:rshd: unable to be used: cannot find path " - "for launching agent \"%s\"\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - orte_rsh_agent)); + "%s plm:rshd: unable to be used: cannot find \"%s\" in PATH", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_rsh_agent)); *module = NULL; return ORTE_ERROR; } - /* we are good - make ourselves available, but only if selected */ + /* we are good - make ourselves available, but at low priority */ *priority = 0; *module = (mca_base_module_t *) &orte_plm_rshd_module; return ORTE_SUCCESS; diff --git a/orte/mca/plm/rshd/plm_rshd_module.c b/orte/mca/plm/rshd/plm_rshd_module.c index db6ad9434f..831f5ae92d 100644 --- a/orte/mca/plm/rshd/plm_rshd_module.c +++ b/orte/mca/plm/rshd/plm_rshd_module.c @@ -117,6 +117,12 @@ int orte_plm_rshd_init(void) { int rc; + /* since I was selected, setup the rsh launch agent support */ + if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup(NULL, NULL))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) { ORTE_ERROR_LOG(rc); }