1
1

Since this has come up a number of times, have the rsh launcher add MCA params from the environment by default. If it finds that the cmd line is too long, error out with a message directing the user to set a param to ignore the environmental MCA params.

This commit was SVN r25581.
Этот коммит содержится в:
Ralph Castain 2011-12-07 01:24:36 +00:00
родитель 7510339725
Коммит 3e7ab1212a
5 изменённых файлов: 114 добавлений и 49 удалений

Просмотреть файл

@ -581,7 +581,8 @@ AC_CHECK_HEADERS([alloca.h aio.h arpa/inet.h dirent.h \
sys/types.h sys/uio.h net/uio.h sys/utsname.h sys/vfs.h sys/wait.h syslog.h \
time.h termios.h ulimit.h unistd.h util.h utmp.h malloc.h \
ifaddrs.h sys/sysctl.h crt_externs.h regex.h signal.h \
ioLib.h sockLib.h hostLib.h shlwapi.h sys/synch.h limits.h db.h ndbm.h])
ioLib.h sockLib.h hostLib.h shlwapi.h sys/synch.h limits.h db.h ndbm.h \
sys/syslimits.h])
# Needed to work around Darwin requiring sys/socket.h for
# net/if.h

Просмотреть файл

@ -75,3 +75,13 @@ The prefix we were given are:
opal_prefix: %s
prefix_dir: %s
#
[cmd-line-too-long]
The cmd line to launch remote daemons is too long:
Length: %d
Max length: %d
Consider setting -mca plm_rsh_pass_environ_mca_params 0 to
avoid including any environmentally set MCA parameters on the
command line.

Просмотреть файл

@ -61,6 +61,7 @@ struct orte_plm_rsh_component_t {
opal_condition_t cond;
char *agent;
bool assume_same_shell;
bool pass_environ_mca_params;
};
typedef struct orte_plm_rsh_component_t orte_plm_rsh_component_t;

Просмотреть файл

@ -187,6 +187,11 @@ static int rsh_component_open(void)
mca_base_param_lookup_int(tmp, &value);
mca_plm_rsh_component.assume_same_shell = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int(c, "pass_environ_mca_params",
"If set to 0, do not include mca params from the environment on the orted cmd line",
false, false, 1, &tmp);
mca_plm_rsh_component.pass_environ_mca_params = OPAL_INT_TO_BOOL(tmp);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -62,6 +62,12 @@
#ifdef HAVE_PWD_H
#include <pwd.h>
#endif
#if HAVE_LIMITS_H
#include <limits.h>
#endif
#if HAVE_SYS_SYSLIMITS_H
#include <sys/syslimits.h>
#endif
#include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/mca_base_param.h"
@ -287,7 +293,7 @@ static int setup_launch(int *argcptr, char ***argvptr,
{
int argc;
char **argv;
char *param;
char *param, *value;
orte_plm_rsh_shell_t remote_shell, local_shell;
char *lib_base, *bin_base;
int orted_argc;
@ -295,34 +301,35 @@ static int setup_launch(int *argcptr, char ***argvptr,
char *orted_cmd, *orted_prefix, *final_cmd;
int orted_index;
int rc;
int cnt, i, j;
bool found;
/* Figure out the basenames for the libdir and bindir. This
requires some explanation:
requires some explanation:
- Use opal_install_dirs.libdir and opal_install_dirs.bindir.
- Use opal_install_dirs.libdir and opal_install_dirs.bindir.
- After a discussion on the devel-core mailing list, the
developers decided that we should use the local directory
basenames as the basis for the prefix on the remote node.
This does not handle a few notable cases (e.g., if the
libdir/bindir is not simply a subdir under the prefix, if the
libdir/bindir basename is not the same on the remote node as
it is here on the local node, etc.), but we decided that
--prefix was meant to handle "the common case". If you need
something more complex than this, a) edit your shell startup
files to set PATH/LD_LIBRARY_PATH properly on the remote
node, or b) use some new/to-be-defined options that
explicitly allow setting the bindir/libdir on the remote
node. We decided to implement these options (e.g.,
--remote-bindir and --remote-libdir) to orterun when it
actually becomes a problem for someone (vs. a hypothetical
situation).
- After a discussion on the devel-core mailing list, the
developers decided that we should use the local directory
basenames as the basis for the prefix on the remote node.
This does not handle a few notable cases (e.g., if the
libdir/bindir is not simply a subdir under the prefix, if the
libdir/bindir basename is not the same on the remote node as
it is here on the local node, etc.), but we decided that
--prefix was meant to handle "the common case". If you need
something more complex than this, a) edit your shell startup
files to set PATH/LD_LIBRARY_PATH properly on the remote
node, or b) use some new/to-be-defined options that
explicitly allow setting the bindir/libdir on the remote
node. We decided to implement these options (e.g.,
--remote-bindir and --remote-libdir) to orterun when it
actually becomes a problem for someone (vs. a hypothetical
situation).
Hence, for now, we simply take the basename of this install's
libdir and bindir and use it to append this install's prefix
and use that on the remote node.
*/
Hence, for now, we simply take the basename of this install's
libdir and bindir and use it to append this install's prefix
and use that on the remote node.
*/
lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir);
@ -429,13 +436,13 @@ static int setup_launch(int *argcptr, char ***argvptr,
} else if (ORTE_PLM_RSH_SHELL_TCSH == remote_shell ||
ORTE_PLM_RSH_SHELL_CSH == remote_shell) {
/* [t]csh is a bit more challenging -- we
have to check whether LD_LIBRARY_PATH
is already set before we try to set it.
Must be very careful about obeying
[t]csh's order of evaluation and not
using a variable before it is defined.
See this thread for more details:
http://www.open-mpi.org/community/lists/users/2006/01/0517.php. */
have to check whether LD_LIBRARY_PATH
is already set before we try to set it.
Must be very careful about obeying
[t]csh's order of evaluation and not
using a variable before it is defined.
See this thread for more details:
http://www.open-mpi.org/community/lists/users/2006/01/0517.php. */
/* if there is nothing preceding orted, then we can just
* assemble the cmd with the orted_cmd at the end. Otherwise,
* we have to insert the orted_prefix in the right place
@ -489,9 +496,9 @@ static int setup_launch(int *argcptr, char ***argvptr,
/* Daemonize when not using qrsh. Or, if using qrsh, only
* daemonize if told to by user with daemonize_qrsh flag. */
((!mca_plm_rsh_component.using_qrsh) ||
(mca_plm_rsh_component.using_qrsh && mca_plm_rsh_component.daemonize_qrsh)) &&
(mca_plm_rsh_component.using_qrsh && mca_plm_rsh_component.daemonize_qrsh)) &&
((!mca_plm_rsh_component.using_llspawn) ||
(mca_plm_rsh_component.using_llspawn && mca_plm_rsh_component.daemonize_llspawn))) {
(mca_plm_rsh_component.using_llspawn && mca_plm_rsh_component.daemonize_llspawn))) {
opal_argv_append(&argc, &argv, "--daemonize");
}
@ -513,25 +520,66 @@ static int setup_launch(int *argcptr, char ***argvptr,
* by enclosing them in quotes. Check for any multi-word
* mca params passed to mpirun and include them
*/
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
int cnt, i;
cnt = opal_argv_count(orted_cmd_line);
for (i=0; i < cnt; i+=3) {
/* check if the specified option is more than one word - all
* others have already been passed
*/
if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
/* must add quotes around it */
asprintf(&param, "\"%s\"", orted_cmd_line[i+2]);
/* now pass it along */
opal_argv_append(&argc, &argv, orted_cmd_line[i]);
opal_argv_append(&argc, &argv, orted_cmd_line[i+1]);
opal_argv_append(&argc, &argv, param);
cnt = opal_argv_count(orted_cmd_line);
for (i=0; i < cnt; i+=3) {
/* check if the specified option is more than one word - all
* others have already been passed
*/
if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
/* must add quotes around it */
asprintf(&param, "\"%s\"", orted_cmd_line[i+2]);
/* now pass it along */
opal_argv_append(&argc, &argv, orted_cmd_line[i]);
opal_argv_append(&argc, &argv, orted_cmd_line[i+1]);
opal_argv_append(&argc, &argv, param);
free(param);
}
}
/* unless told otherwise... */
if (mca_plm_rsh_component.pass_environ_mca_params) {
/* now check our local environment for MCA params - add them
* only if they aren't already present
*/
for (i = 0; NULL != environ[i]; ++i) {
if (0 == strncmp("OMPI_", environ[i], 5)) {
/* check for duplicate in app->env - this
* would have been placed there by the
* cmd line processor. By convention, we
* always let the cmd line override the
* environment
*/
param = strdup(&environ[i][9]);
value = strchr(param, '=');
*value = '\0';
value++;
/* see if this param exists on the cmd line */
for (j=0; NULL != argv[j]; j++) {
if (0 == strcmp(param, argv[j])) {
found = true;
break;
}
}
if (!found) {
/* add it */
opal_argv_append(&argc, &argv, "-mca");
opal_argv_append(&argc, &argv, param);
opal_argv_append(&argc, &argv, value);
}
free(param);
}
}
}
value = opal_argv_join(argv, ' ');
if (ARG_MAX < strlen(value)) {
orte_show_help("help-plm-rsh.txt", "cmd-line-too-long",
true, strlen(value), ARG_MAX);
free(value);
return ORTE_ERR_SILENT;
}
free(value);
if (ORTE_PLM_RSH_SHELL_SH == remote_shell ||
ORTE_PLM_RSH_SHELL_KSH == remote_shell) {
opal_argv_append(&argc, &argv, ")");
@ -938,7 +986,7 @@ static int rsh_launch(orte_job_t *jdata)
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* if we are tree launching, find our children and create the launch cmd */
if (mca_plm_rsh_component.tree_spawn) {
orte_daemon_cmd_flag_t command = ORTE_DAEMON_TREE_SPAWN;