diff --git a/orte/mca/plm/base/plm_base_rsh_support.c b/orte/mca/plm/base/plm_base_rsh_support.c index f6f80fad77..8f316eadf6 100644 --- a/orte/mca/plm/base/plm_base_rsh_support.c +++ b/orte/mca/plm/base/plm_base_rsh_support.c @@ -63,20 +63,43 @@ #include "orte/mca/plm/base/plm_private.h" #ifndef __WINDOWS__ -static char **search(const char* agent_list); +static char **search(const char* agent_list, const char *path); -int orte_plm_base_rsh_launch_agent_setup(void) +int orte_plm_base_rsh_launch_agent_lookup(const char *agent_list, char *path) +{ + char **tmp; + + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:rsh_lookup on agent %s path %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == agent_list) ? orte_rsh_agent : agent_list, + (NULL == path) ? "NULL" : path)); + if (NULL == (tmp = search(agent_list, path))) { + return ORTE_ERR_NOT_FOUND; + } + + /* if we got here, then one of the given agents could be found */ + opal_argv_free(tmp); + return ORTE_SUCCESS; +} + +int orte_plm_base_rsh_launch_agent_setup(const char *agent, char *path) { char *bname; int i; /* if no agent was provided, then report not found */ - if (NULL == orte_rsh_agent) { + if (NULL == orte_rsh_agent && NULL == agent) { return ORTE_ERR_NOT_FOUND; } - /* Take the orte_rsh_agent MCA param and search for the argv */ - orte_plm_globals.rsh_agent_argv = search(orte_rsh_agent); + /* search for the argv */ + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:rsh_setup on agent %s path %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == agent) ? orte_rsh_agent : agent, + (NULL == path) ? "NULL" : path)); + orte_plm_globals.rsh_agent_argv = search(agent, path); if (0 == opal_argv_count(orte_plm_globals.rsh_agent_argv)) { /* nothing was found */ @@ -85,8 +108,8 @@ int orte_plm_base_rsh_launch_agent_setup(void) /* see if we can find the agent in the path */ orte_plm_globals.rsh_agent_path = - opal_path_findv(orte_plm_globals.rsh_agent_argv[0], X_OK, - environ, NULL); + opal_path_findv(orte_plm_globals.rsh_agent_argv[0], X_OK, + environ, path); if (NULL == orte_plm_globals.rsh_agent_path) { /* not an error - just report not found */ @@ -116,6 +139,7 @@ int orte_plm_base_rsh_launch_agent_setup(void) } } + /* the caller can append any additional argv's they desire */ return ORTE_SUCCESS; } @@ -288,14 +312,23 @@ int orte_plm_base_local_slave_launch(orte_job_t *jdata) * we are able to find in the PATH. Split that one into argv and * return it. If nothing found, then return NULL. */ -static char **search(const char* agent_list) +static char **search(const char* agent_list, const char *path) { int i, j; - char *line, **lines = opal_argv_split(agent_list, ':'); + char *line, **lines; char **tokens, *tmp; char cwd[OPAL_PATH_MAX]; - getcwd(cwd, OPAL_PATH_MAX); + if (NULL == path) { + getcwd(cwd, OPAL_PATH_MAX); + } else { + strncpy(cwd, path, OPAL_PATH_MAX); + } + if (NULL == agent_list) { + lines = opal_argv_split(orte_rsh_agent, ':'); + } else { + lines = opal_argv_split(agent_list, ':'); + } for (i = 0; NULL != lines[i]; ++i) { line = lines[i]; diff --git a/orte/mca/plm/base/plm_private.h b/orte/mca/plm/base/plm_private.h index 3a5e20c796..32766b590d 100644 --- a/orte/mca/plm/base/plm_private.h +++ b/orte/mca/plm/base/plm_private.h @@ -121,7 +121,8 @@ ORTE_DECLSPEC int orte_plm_base_setup_orted_cmd(int *argc, char ***argv); * Local slave launch */ ORTE_DECLSPEC int orte_plm_base_local_slave_launch(orte_job_t *jdata); -ORTE_DECLSPEC int orte_plm_base_rsh_launch_agent_setup(void); +ORTE_DECLSPEC int orte_plm_base_rsh_launch_agent_setup(const char *agent_list, char *path); +ORTE_DECLSPEC int orte_plm_base_rsh_launch_agent_lookup(const char *agent_list, char *path); ORTE_DECLSPEC void orte_plm_base_local_slave_finalize(void); ORTE_DECLSPEC int orte_plm_base_setup_rsh_launch(char *nodename, orte_app_context_t *app, char *rcmd, char ***argv, char **exec_path); diff --git a/orte/mca/plm/rsh/plm_rsh_component.c b/orte/mca/plm/rsh/plm_rsh_component.c index f6ae8061ec..e99706b944 100644 --- a/orte/mca/plm/rsh/plm_rsh_component.c +++ b/orte/mca/plm/rsh/plm_rsh_component.c @@ -155,46 +155,27 @@ int orte_plm_rsh_component_query(mca_base_module_t **module, int *priority) if (!mca_plm_rsh_component.disable_qrsh && NULL != getenv("SGE_ROOT") && NULL != getenv("ARC") && NULL != getenv("PE_HOSTFILE") && NULL != getenv("JOB_ID")) { - /* setting rsh_agent_path and rsh_agent_argv[0] for qrsh */ - asprintf(&orte_plm_globals.rsh_agent_path, "%s/bin/%s/qrsh", - getenv("SGE_ROOT"), getenv("ARC")); - orte_plm_globals.rsh_agent_argv = NULL; - opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, - orte_plm_globals.rsh_agent_path); - /* double check that we have access and permissions for the qrsh agent */ - if (NULL == opal_path_findv(orte_plm_globals.rsh_agent_argv[0], X_OK, - environ, NULL)) { - opal_output_verbose(1, orte_plm_globals.output, - "%s plm:rsh: unable to be used: cannot find path " - "or execution permissions not set for launching agent \"%s\"\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - orte_plm_globals.rsh_agent_argv[0]); + /* setup the search path for qrsh */ + asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC")); + /* see if the agent is available */ + if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup("qrsh", tmp)) { + /* can't be SGE */ + opal_output_verbose(1, orte_plm_globals.output, + "%s plm:rsh: unable to be used: SGE indicated but cannot find path " + "or execution permissions not set for launching agent qrsh", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); *module = NULL; return ORTE_ERROR; } - /* automatically add -inherit and grid engine PE related flags */ - opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-inherit"); - /* Don't use the "-noshell" flag as qrsh would have a problem - * swallowing a long command */ - opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-nostdin"); - opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-V"); - if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { - opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-verbose"); - tmp = opal_argv_join(orte_plm_globals.rsh_agent_argv, ' '); - opal_output_verbose(1, orte_plm_globals.output, - "%s plm:rsh: using \"%s\" for launching\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); - free(tmp); - } mca_plm_rsh_component.using_qrsh = true; *priority = mca_plm_rsh_component.priority; *module = (mca_base_module_t *) &orte_plm_rsh_module; return ORTE_SUCCESS; } - /* if this isn't an Grid Engine environment, see if rsh/ssh is available */ + /* if this isn't an Grid Engine environment, see if MCA-specified agent (default: ssh:rsh) is available */ - if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_setup()) { + if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup(NULL, NULL)) { /* this isn't an error - we just cannot be selected */ OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: unable to be used: cannot find path " diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 487e6a270d..57f6345dd1 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -150,8 +150,41 @@ static orte_jobid_t local_slaves; */ int orte_plm_rsh_init(void) { + char *tmp; int rc; + /* we were selected, so setup the launch agent */ + if (mca_plm_rsh_component.using_qrsh) { + /* perform base setup for qrsh */ + asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC")); + if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup("qrsh", tmp))) { + ORTE_ERROR_LOG(rc); + free(tmp); + return rc; + } + free(tmp); + /* automatically add -inherit and grid engine PE related flags */ + opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-inherit"); + /* Don't use the "-noshell" flag as qrsh would have a problem + * swallowing a long command */ + opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-nostdin"); + opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-V"); + if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { + opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-verbose"); + tmp = opal_argv_join(orte_plm_globals.rsh_agent_argv, ' '); + opal_output_verbose(1, orte_plm_globals.output, + "%s plm:rsh: using \"%s\" for launching\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); + free(tmp); + } + } else { + /* not using qrsh - use MCA-specified agent */ + if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup(NULL, NULL))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/plm/rshd/plm_rshd_component.c b/orte/mca/plm/rshd/plm_rshd_component.c index cf302f64cc..407dd7c975 100644 --- a/orte/mca/plm/rshd/plm_rshd_component.c +++ b/orte/mca/plm/rshd/plm_rshd_component.c @@ -123,18 +123,16 @@ int orte_plm_rshd_component_open(void) int orte_plm_rshd_component_query(mca_base_module_t **module, int *priority) { - if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_setup()) { + if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup(NULL, NULL)) { /* this isn't an error - we just cannot be selected */ OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:rshd: unable to be used: cannot find path " - "for launching agent \"%s\"\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - orte_rsh_agent)); + "%s plm:rshd: unable to be used: cannot find \"%s\" in PATH", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_rsh_agent)); *module = NULL; return ORTE_ERROR; } - /* we are good - make ourselves available, but only if selected */ + /* we are good - make ourselves available, but at low priority */ *priority = 0; *module = (mca_base_module_t *) &orte_plm_rshd_module; return ORTE_SUCCESS; diff --git a/orte/mca/plm/rshd/plm_rshd_module.c b/orte/mca/plm/rshd/plm_rshd_module.c index db6ad9434f..831f5ae92d 100644 --- a/orte/mca/plm/rshd/plm_rshd_module.c +++ b/orte/mca/plm/rshd/plm_rshd_module.c @@ -117,6 +117,12 @@ int orte_plm_rshd_init(void) { int rc; + /* since I was selected, setup the rsh launch agent support */ + if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup(NULL, NULL))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) { ORTE_ERROR_LOG(rc); }