1
1

Resolve the rsh confusion by splitting the initial search for a launch agent from the actual setup of the launch agent values in the plm base globals. Have each aspiring rsh-clone call the lookup function to see if its desired launch agent is available - if not, then reject that plm component.

If so, then set up the actual launch agent values only when the module init function is called.

This resolves the current conflict between the rsh and rshd components. Hopefully, it may avoid future problems in this area -provided- any new uses of rsh-like launchers abide by the lookup-and-then-setup rule.

This commit was SVN r24550.
Этот коммит содержится в:
Ralph Castain 2011-03-22 02:23:09 +00:00
родитель d17b50e1ff
Коммит c1396b278c
6 изменённых файлов: 99 добавлений и 47 удалений

Просмотреть файл

@ -63,20 +63,43 @@
#include "orte/mca/plm/base/plm_private.h" #include "orte/mca/plm/base/plm_private.h"
#ifndef __WINDOWS__ #ifndef __WINDOWS__
static char **search(const char* agent_list); static char **search(const char* agent_list, const char *path);
int orte_plm_base_rsh_launch_agent_setup(void) int orte_plm_base_rsh_launch_agent_lookup(const char *agent_list, char *path)
{
char **tmp;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:rsh_lookup on agent %s path %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == agent_list) ? orte_rsh_agent : agent_list,
(NULL == path) ? "NULL" : path));
if (NULL == (tmp = search(agent_list, path))) {
return ORTE_ERR_NOT_FOUND;
}
/* if we got here, then one of the given agents could be found */
opal_argv_free(tmp);
return ORTE_SUCCESS;
}
int orte_plm_base_rsh_launch_agent_setup(const char *agent, char *path)
{ {
char *bname; char *bname;
int i; int i;
/* if no agent was provided, then report not found */ /* if no agent was provided, then report not found */
if (NULL == orte_rsh_agent) { if (NULL == orte_rsh_agent && NULL == agent) {
return ORTE_ERR_NOT_FOUND; return ORTE_ERR_NOT_FOUND;
} }
/* Take the orte_rsh_agent MCA param and search for the argv */ /* search for the argv */
orte_plm_globals.rsh_agent_argv = search(orte_rsh_agent); OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:rsh_setup on agent %s path %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == agent) ? orte_rsh_agent : agent,
(NULL == path) ? "NULL" : path));
orte_plm_globals.rsh_agent_argv = search(agent, path);
if (0 == opal_argv_count(orte_plm_globals.rsh_agent_argv)) { if (0 == opal_argv_count(orte_plm_globals.rsh_agent_argv)) {
/* nothing was found */ /* nothing was found */
@ -85,8 +108,8 @@ int orte_plm_base_rsh_launch_agent_setup(void)
/* see if we can find the agent in the path */ /* see if we can find the agent in the path */
orte_plm_globals.rsh_agent_path = orte_plm_globals.rsh_agent_path =
opal_path_findv(orte_plm_globals.rsh_agent_argv[0], X_OK, opal_path_findv(orte_plm_globals.rsh_agent_argv[0], X_OK,
environ, NULL); environ, path);
if (NULL == orte_plm_globals.rsh_agent_path) { if (NULL == orte_plm_globals.rsh_agent_path) {
/* not an error - just report not found */ /* not an error - just report not found */
@ -116,6 +139,7 @@ int orte_plm_base_rsh_launch_agent_setup(void)
} }
} }
/* the caller can append any additional argv's they desire */
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -288,14 +312,23 @@ int orte_plm_base_local_slave_launch(orte_job_t *jdata)
* we are able to find in the PATH. Split that one into argv and * we are able to find in the PATH. Split that one into argv and
* return it. If nothing found, then return NULL. * return it. If nothing found, then return NULL.
*/ */
static char **search(const char* agent_list) static char **search(const char* agent_list, const char *path)
{ {
int i, j; int i, j;
char *line, **lines = opal_argv_split(agent_list, ':'); char *line, **lines;
char **tokens, *tmp; char **tokens, *tmp;
char cwd[OPAL_PATH_MAX]; char cwd[OPAL_PATH_MAX];
getcwd(cwd, OPAL_PATH_MAX); if (NULL == path) {
getcwd(cwd, OPAL_PATH_MAX);
} else {
strncpy(cwd, path, OPAL_PATH_MAX);
}
if (NULL == agent_list) {
lines = opal_argv_split(orte_rsh_agent, ':');
} else {
lines = opal_argv_split(agent_list, ':');
}
for (i = 0; NULL != lines[i]; ++i) { for (i = 0; NULL != lines[i]; ++i) {
line = lines[i]; line = lines[i];

Просмотреть файл

@ -121,7 +121,8 @@ ORTE_DECLSPEC int orte_plm_base_setup_orted_cmd(int *argc, char ***argv);
* Local slave launch * Local slave launch
*/ */
ORTE_DECLSPEC int orte_plm_base_local_slave_launch(orte_job_t *jdata); ORTE_DECLSPEC int orte_plm_base_local_slave_launch(orte_job_t *jdata);
ORTE_DECLSPEC int orte_plm_base_rsh_launch_agent_setup(void); ORTE_DECLSPEC int orte_plm_base_rsh_launch_agent_setup(const char *agent_list, char *path);
ORTE_DECLSPEC int orte_plm_base_rsh_launch_agent_lookup(const char *agent_list, char *path);
ORTE_DECLSPEC void orte_plm_base_local_slave_finalize(void); ORTE_DECLSPEC void orte_plm_base_local_slave_finalize(void);
ORTE_DECLSPEC int orte_plm_base_setup_rsh_launch(char *nodename, orte_app_context_t *app, ORTE_DECLSPEC int orte_plm_base_setup_rsh_launch(char *nodename, orte_app_context_t *app,
char *rcmd, char ***argv, char **exec_path); char *rcmd, char ***argv, char **exec_path);

Просмотреть файл

@ -155,46 +155,27 @@ int orte_plm_rsh_component_query(mca_base_module_t **module, int *priority)
if (!mca_plm_rsh_component.disable_qrsh && if (!mca_plm_rsh_component.disable_qrsh &&
NULL != getenv("SGE_ROOT") && NULL != getenv("ARC") && NULL != getenv("SGE_ROOT") && NULL != getenv("ARC") &&
NULL != getenv("PE_HOSTFILE") && NULL != getenv("JOB_ID")) { NULL != getenv("PE_HOSTFILE") && NULL != getenv("JOB_ID")) {
/* setting rsh_agent_path and rsh_agent_argv[0] for qrsh */ /* setup the search path for qrsh */
asprintf(&orte_plm_globals.rsh_agent_path, "%s/bin/%s/qrsh", asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC"));
getenv("SGE_ROOT"), getenv("ARC")); /* see if the agent is available */
orte_plm_globals.rsh_agent_argv = NULL; if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup("qrsh", tmp)) {
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, /* can't be SGE */
orte_plm_globals.rsh_agent_path); opal_output_verbose(1, orte_plm_globals.output,
/* double check that we have access and permissions for the qrsh agent */ "%s plm:rsh: unable to be used: SGE indicated but cannot find path "
if (NULL == opal_path_findv(orte_plm_globals.rsh_agent_argv[0], X_OK, "or execution permissions not set for launching agent qrsh",
environ, NULL)) { ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
opal_output_verbose(1, orte_plm_globals.output,
"%s plm:rsh: unable to be used: cannot find path "
"or execution permissions not set for launching agent \"%s\"\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_plm_globals.rsh_agent_argv[0]);
*module = NULL; *module = NULL;
return ORTE_ERROR; return ORTE_ERROR;
} }
/* automatically add -inherit and grid engine PE related flags */
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-inherit");
/* Don't use the "-noshell" flag as qrsh would have a problem
* swallowing a long command */
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-nostdin");
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-V");
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-verbose");
tmp = opal_argv_join(orte_plm_globals.rsh_agent_argv, ' ');
opal_output_verbose(1, orte_plm_globals.output,
"%s plm:rsh: using \"%s\" for launching\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
free(tmp);
}
mca_plm_rsh_component.using_qrsh = true; mca_plm_rsh_component.using_qrsh = true;
*priority = mca_plm_rsh_component.priority; *priority = mca_plm_rsh_component.priority;
*module = (mca_base_module_t *) &orte_plm_rsh_module; *module = (mca_base_module_t *) &orte_plm_rsh_module;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/* if this isn't an Grid Engine environment, see if rsh/ssh is available */ /* if this isn't an Grid Engine environment, see if MCA-specified agent (default: ssh:rsh) is available */
if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_setup()) { if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup(NULL, NULL)) {
/* this isn't an error - we just cannot be selected */ /* this isn't an error - we just cannot be selected */
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rsh: unable to be used: cannot find path " "%s plm:rsh: unable to be used: cannot find path "

Просмотреть файл

@ -150,8 +150,41 @@ static orte_jobid_t local_slaves;
*/ */
int orte_plm_rsh_init(void) int orte_plm_rsh_init(void)
{ {
char *tmp;
int rc; int rc;
/* we were selected, so setup the launch agent */
if (mca_plm_rsh_component.using_qrsh) {
/* perform base setup for qrsh */
asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC"));
if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup("qrsh", tmp))) {
ORTE_ERROR_LOG(rc);
free(tmp);
return rc;
}
free(tmp);
/* automatically add -inherit and grid engine PE related flags */
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-inherit");
/* Don't use the "-noshell" flag as qrsh would have a problem
* swallowing a long command */
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-nostdin");
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-V");
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-verbose");
tmp = opal_argv_join(orte_plm_globals.rsh_agent_argv, ' ');
opal_output_verbose(1, orte_plm_globals.output,
"%s plm:rsh: using \"%s\" for launching\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
free(tmp);
}
} else {
/* not using qrsh - use MCA-specified agent */
if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup(NULL, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) { if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
} }

Просмотреть файл

@ -123,18 +123,16 @@ int orte_plm_rshd_component_open(void)
int orte_plm_rshd_component_query(mca_base_module_t **module, int *priority) int orte_plm_rshd_component_query(mca_base_module_t **module, int *priority)
{ {
if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_setup()) { if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup(NULL, NULL)) {
/* this isn't an error - we just cannot be selected */ /* this isn't an error - we just cannot be selected */
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rshd: unable to be used: cannot find path " "%s plm:rshd: unable to be used: cannot find \"%s\" in PATH",
"for launching agent \"%s\"\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_rsh_agent));
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_rsh_agent));
*module = NULL; *module = NULL;
return ORTE_ERROR; return ORTE_ERROR;
} }
/* we are good - make ourselves available, but only if selected */ /* we are good - make ourselves available, but at low priority */
*priority = 0; *priority = 0;
*module = (mca_base_module_t *) &orte_plm_rshd_module; *module = (mca_base_module_t *) &orte_plm_rshd_module;
return ORTE_SUCCESS; return ORTE_SUCCESS;

Просмотреть файл

@ -117,6 +117,12 @@ int orte_plm_rshd_init(void)
{ {
int rc; int rc;
/* since I was selected, setup the rsh launch agent support */
if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup(NULL, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) { if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
} }