1
1

Per the July technical meeting:

Standardize the handling of the orte launch agent option across PLMs. This has been a consistent complaint I have received - each PLM would register its own MCA param to get input on the launch agent for remote nodes (in fact, one or two didn't, but most did). This would then get handled in various and contradictory ways.

Some PLMs would accept only a one-word input. Others accepted multi-word args such as "valgrind orted", but then some would error by putting any prefix specified on the cmd line in front of the incorrect argument.

For example, while using the rsh launcher, if you specified "valgrind orted" as your launch agent and had "--prefix foo" on your cmd line, you would attempt to execute "ssh foo/valgrind orted" - which obviously wouldn't work.

This was all -very- confusing to users, who had to know which PLM was being used so they could even set the right mca param in the first place! And since we don't warn about non-recognized or non-used mca params, half of the time they would wind up not doing what they thought they were telling us to do.

To solve this problem, we did the following:

1. removed all mca params from the individual plms for the launch agent

2. added a new mca param "orte_launch_agent" for this purpose. To further simplify for users, this comes with a new cmd line option "--launch-agent" that can take a multi-word string argument. The value of the param defaults to "orted".

3. added a PLM base function that processes the orte_launch_agent value and adds the contents to a provided argv array. This can subsequently be harvested at-will to handle multi-word values

4. modified the PLMs to use this new function. All the PLMs except for the rsh PLM required only very minor changes - they just call the function and move on. The rsh PLM required much larger changes because - due to the rsh/ssh cmd line limitations - we had to correctly prepend any provided prefix to the correct argv entry.

5. added a new opal_argv_join_range function that allows the caller to "join" argv entries between two specified indices

Please let me know of any problems. I tried to make this as clean as possible, but cannot compile all PLMs to ensure all is correct.

This commit was SVN r19097.
Этот коммит содержится в:
Ralph Castain 2008-07-30 18:26:24 +00:00
родитель 90a784dfca
Коммит a62b2a0150
24 изменённых файлов: 287 добавлений и 194 удалений

Просмотреть файл

@ -269,6 +269,62 @@ char *opal_argv_join(char **argv, int delimiter)
}
/*
 * Join the elements of an argv array that fall within the half-open
 * index range [start, end) into a single newly-allocated string,
 * separating consecutive elements with the delimiter character.
 *
 * argv       NULL-terminated array of strings (may be NULL)
 * start      index of the first element to include
 * end        index one past the last element to include; the join also
 *            stops at the array's NULL terminator, so "end" may safely
 *            exceed the number of entries
 * delimiter  character placed between joined elements
 *
 * Returns a heap-allocated string that the caller must free().  An
 * empty string is returned when argv is NULL, when start lies beyond
 * the end of the array, or when the range selects no elements.  NULL
 * is returned only if memory allocation fails.
 */
char *opal_argv_join_range(char **argv, size_t start, size_t end, int delimiter)
{
    char **p;
    char *str;
    char *out;
    size_t str_len = 0;
    size_t len;
    size_t i;

    /* Bozo case */
    if (NULL == argv) {
        return (char*) calloc(1, 1);
    }

    /* Make sure "start" does not lie beyond the NULL terminator.  We
       walk the array rather than trusting the caller so that
       &argv[start] is never read out of bounds. */
    for (i = 0; i < start; ++i) {
        if (NULL == argv[i]) {
            return (char*) calloc(1, 1);
        }
    }

    /* Find the total string length of the range including delimiters.
       The last delimiter is replaced by the NUL character. */
    for (p = &argv[start], i = start; NULL != *p && i < end; ++p, ++i) {
        str_len += strlen(*p) + 1;
    }

    /* Nothing selected (start == argc, or start >= end).  Without this
       check the terminator would be written at index (size_t)-1 of a
       zero-byte allocation. */
    if (0 == str_len) {
        return (char*) calloc(1, 1);
    }

    /* Allocate the string. */
    if (NULL == (str = (char*) malloc(str_len))) {
        return NULL;
    }

    /* Copy each element followed by a delimiter... */
    out = str;
    for (p = &argv[start], i = start; NULL != *p && i < end; ++p, ++i) {
        len = strlen(*p);
        memcpy(out, *p, len);
        out += len;
        *out++ = (char) delimiter;
    }

    /* ...then overwrite the trailing delimiter with the terminator. */
    out[-1] = '\0';

    /* All done */
    return str;
}
/*
* Return the number of bytes consumed by an argv array.
*/

Просмотреть файл

@ -35,9 +35,8 @@
#include <sys/types.h>
#endif
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
BEGIN_C_DECLS
/**
* Append a string (by value) to an new or existing NULL-terminated
* argv array.
@ -165,6 +164,8 @@ OPAL_DECLSPEC int opal_argv_count(char **argv);
*/
OPAL_DECLSPEC char *opal_argv_join(char **argv, int delimiter) __opal_attribute_malloc__;
OPAL_DECLSPEC char *opal_argv_join_range(char **argv, size_t start, size_t end, int delimiter) __opal_attribute_malloc__;
/**
* Return the number of bytes consumed by an argv array.
*
@ -240,8 +241,7 @@ OPAL_DECLSPEC int opal_argv_delete(int *argc, char ***argv,
* target).
*/
OPAL_DECLSPEC int opal_argv_insert(char ***target, int start, char **source);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
END_C_DECLS
#endif /* OPAL_ARGV_H */

Просмотреть файл

@ -31,7 +31,6 @@ struct orte_plm_alps_component_t {
int priority;
int debug;
bool timing;
char *orted;
char *custom_args;
};
typedef struct orte_plm_alps_component_t orte_plm_alps_component_t;

Просмотреть файл

@ -103,11 +103,6 @@ static int plm_alps_open(void)
false, false, 75,
&mca_plm_alps_component.priority);
mca_base_param_reg_string(comp, "orted",
"Command to use to start proxy orted",
false, false, "orted",
&mca_plm_alps_component.orted);
mca_plm_alps_component.timing = orte_timing;
mca_base_param_reg_string(comp, "args",
@ -129,10 +124,6 @@ static int orte_plm_alps_component_query(mca_base_module_t **module, int *priori
static int plm_alps_close(void)
{
if (NULL != mca_plm_alps_component.orted) {
free(mca_plm_alps_component.orted);
}
if (NULL != mca_plm_alps_component.custom_args) {
free(mca_plm_alps_component.custom_args);
}

Просмотреть файл

@ -266,7 +266,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
*/
/* add the daemon command (as specified by user) */
opal_argv_append(&argc, &argv, mca_plm_alps_component.orted);
orte_plm_base_setup_orted_cmd(&argc, &argv);
/* Add basic orted command line options, including debug flags */
orte_plm_base_orted_append_basic_args(&argc, &argv,

Просмотреть файл

@ -700,6 +700,27 @@ static int orte_plm_base_report_launched(orte_jobid_t job)
return ORTE_SUCCESS;
}
/*
 * Split the orte_launch_agent string on spaces and append every
 * resulting word to the caller's argv array.
 *
 * Returns the position, counted within the launch agent words
 * themselves (not within the caller's argv), of the last word that is
 * exactly "orted", or -1 if no such word was present.
 */
int orte_plm_base_setup_orted_cmd(int *argc, char ***argv)
{
    /* break the agent apart, since it may be a multi-word command */
    char **words = opal_argv_split(orte_launch_agent, ' ');
    int orted_pos = -1;   /* default: "orted" not found */
    int n = 0;

    if (NULL != words) {
        while (NULL != words[n]) {
            if (0 == strcmp(words[n], "orted")) {
                orted_pos = n;
            }
            opal_argv_append(argc, argv, words[n]);
            ++n;
        }
    }

    opal_argv_free(words);

    return orted_pos;
}
int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
char *ess,
int *proc_vpid_index,

Просмотреть файл

@ -75,6 +75,8 @@ ORTE_DECLSPEC int orte_plm_base_set_hnp_name(void);
ORTE_DECLSPEC int orte_plm_base_create_jobid(orte_jobid_t *jobid);
ORTE_DECLSPEC int orte_plm_base_setup_orted_cmd(int *argc, char ***argv);
/**
* Heartbeat support
*/

Просмотреть файл

@ -29,7 +29,6 @@ struct orte_plm_ccp_component_t {
int debug;
int verbose;
bool want_path_check;
char *orted;
char **checked_paths;
bool timing;
};

Просмотреть файл

@ -97,10 +97,6 @@ static int plm_ccp_open(void)
mca_base_param_reg_int(comp, "priority", "Default selection priority",
false, false, 75, &mca_plm_ccp_component.priority);
mca_base_param_reg_string(comp, "orted",
"Command to use to start proxy orted",
false, false, "orted",
&mca_plm_ccp_component.orted);
mca_base_param_reg_int(comp, "want_path_check",
"Whether the launching process should check for the plm_ccp_orted executable in the PATH before launching (the CCP API does not give an indication of failure; this is a somewhat-lame workaround; non-zero values enable this check)",
false, false, (int) true, &tmp);

Просмотреть файл

@ -95,11 +95,11 @@ orte_plm_base_module_t orte_plm_ccp_module = {
plm_ccp_finalize
};
/*
* Local variables
*/
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
/*
* Local variables
*/
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
/**
@ -209,8 +209,9 @@ GETMAP:
}
/* add the daemon command (as specified by user) */
argv = opal_argv_split(mca_plm_ccp_component.orted, ' ');
argc = opal_argv_count(argv);
argc = 0;
argv = NULL;
orte_plm_base_setup_orted_cmd(&argc, &argv);
opal_argv_append(&argc, &argv, "--no-daemonize");

Просмотреть файл

@ -30,9 +30,7 @@ BEGIN_C_DECLS
struct orte_plm_lsf_component_t {
orte_plm_base_component_t super;
int priority;
bool timing;
char *orted;
};
typedef struct orte_plm_lsf_component_t orte_plm_lsf_component_t;

Просмотреть файл

@ -93,16 +93,6 @@ orte_plm_lsf_component_t mca_plm_lsf_component = {
static int plm_lsf_open(void)
{
mca_base_component_t *comp = &mca_plm_lsf_component.super.base_version;
mca_base_param_reg_int(comp, "priority", "Default selection priority",
false, false, 75, &mca_plm_lsf_component.priority);
mca_base_param_reg_string(comp, "orted",
"Command to use to start proxy orted",
false, false, "orted",
&mca_plm_lsf_component.orted);
return ORTE_SUCCESS;
}
@ -125,7 +115,7 @@ static int orte_plm_lsf_component_query(mca_base_module_t **module, int *priorit
return ORTE_ERROR;
}
*priority = mca_plm_lsf_component.priority;
*priority = 75;
*module = (mca_base_module_t *) &orte_plm_lsf_module;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -222,7 +222,7 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
*/
/* add the daemon command (as specified by user) */
opal_argv_append(&argc, &argv, mca_plm_lsf_component.orted);
orte_plm_base_setup_orted_cmd(&argc, &argv);
/* Add basic orted command line options */
orte_plm_base_orted_append_basic_args(&argc, &argv,

Просмотреть файл

@ -343,8 +343,7 @@ static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
static int setup_launch(int *argcptr, char ***argvptr,
char *nodename,
int *node_name_index1, int *node_name_index2,
int *local_exec_index,
int *proc_vpid_index, char **lib_base, char **bin_base,
int *proc_vpid_index, char *prefix_dir,
bool *remote_sh, bool *remote_csh)
{
struct passwd *p;
@ -353,6 +352,11 @@ static int setup_launch(int *argcptr, char ***argvptr,
char *param;
orte_plm_rsh_shell_t shell;
bool local_sh = false, local_csh = false;
char *lib_base, *bin_base;
int orted_argc;
char **orted_argv;
char *orted_cmd, *orted_prefix, *final_cmd;
int orted_index;
int rc;
/* What is our local shell? */
@ -428,6 +432,36 @@ static int setup_launch(int *argcptr, char ***argvptr,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
*remote_csh, *remote_sh));
/* Figure out the basenames for the libdir and bindir. This
requires some explanation:
- Use opal_install_dirs.libdir and opal_install_dirs.bindir.
- After a discussion on the devel-core mailing list, the
developers decided that we should use the local directory
basenames as the basis for the prefix on the remote node.
This does not handle a few notable cases (e.g., if the
libdir/bindir is not simply a subdir under the prefix, if the
libdir/bindir basename is not the same on the remote node as
it is here on the local node, etc.), but we decided that
--prefix was meant to handle "the common case". If you need
something more complex than this, a) edit your shell startup
files to set PATH/LD_LIBRARY_PATH properly on the remote
node, or b) use some new/to-be-defined options that
explicitly allow setting the bindir/libdir on the remote
node. We decided to implement these options (e.g.,
--remote-bindir and --remote-libdir) to orterun when it
actually becomes a problem for someone (vs. a hypothetical
situation).
Hence, for now, we simply take the basename of this install's
libdir and bindir and use it to append this install's prefix
and use that on the remote node.
*/
lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir);
/*
* Build argv array
*/
@ -436,9 +470,127 @@ static int setup_launch(int *argcptr, char ***argvptr,
*node_name_index1 = argc;
opal_argv_append(&argc, &argv, "<template>");
/* add the daemon command (as specified by user) */
*local_exec_index = argc;
opal_argv_append(&argc, &argv, mca_plm_rsh_component.orted);
/* now get the orted cmd - as specified by user - into our tmp array.
* The function returns the location where the actual orted command is
* located - usually in the final spot, but someone could
* have added options. For example, it should be legal for them to use
* "orted --debug-devel" so they get debug output from the orteds, but
* not from mpirun. Also, they may have a customized version of orted
* that takes arguments in addition to the std ones we already support
*/
orted_argc = 0;
orted_argv = NULL;
orted_index = orte_plm_base_setup_orted_cmd(&orted_argc, &orted_argv);
/* look at the returned orted cmd argv to check several cases:
*
* - only "orted" was given. This is the default and thus most common
* case. In this situation, there is nothing we need to do
*
* - something was given that doesn't include "orted" - i.e., someone
* has substituted their own daemon. There isn't anything we can
* do here, so we want to avoid adding prefixes to the cmd
*
* - something was given that precedes "orted". For example, someone
* may have specified "valgrind [options] orted". In this case, we
* need to separate out that "orted_prefix" section so it can be
* treated separately below
*
* - something was given that follows "orted". An example was given above.
* In this case, we need to construct the effective "orted_cmd" so it
* can be treated properly below
*
* Obviously, the latter two cases can be combined - just to make it
* even more interesting! Gotta love rsh/ssh...
*/
if (0 == orted_index) {
/* this is the default scenario, but there could be options specified
* so we need to account for that possibility
*/
orted_cmd = opal_argv_join(orted_argv, ' ');
orted_prefix = NULL;
} else if (0 > orted_index) {
/* no "orted" was included */
orted_cmd = NULL;
orted_prefix = opal_argv_join(orted_argv, ' ');
} else {
/* okay, so the "orted" cmd is somewhere in this array, with
* something preceding it and perhaps things following it.
*/
orted_prefix = opal_argv_join_range(orted_argv, 0, orted_index, ' ');
orted_cmd = opal_argv_join_range(orted_argv, orted_index, opal_argv_count(orted_argv), ' ');
}
opal_argv_free(orted_argv); /* done with this */
/* we now need to assemble the actual cmd that will be executed - this depends
* upon whether or not a prefix directory is being used
*/
if (NULL != prefix_dir) {
/* if we have a prefix directory, we need to set the PATH and
* LD_LIBRARY_PATH on the remote node, and prepend just the orted_cmd
* with the prefix directory
*/
char *opal_prefix = getenv("OPAL_PREFIX");
if (remote_sh) {
/* if there is nothing preceding orted, then we can just
* assemble the cmd with the orted_cmd at the end. Otherwise,
* we have to insert the orted_prefix in the right place
*/
asprintf (&final_cmd,
"%s%s%s PATH=%s/%s:$PATH ; export PATH ; "
"LD_LIBRARY_PATH=%s/%s:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; "
"%s %s/%s/%s",
(opal_prefix != NULL ? "OPAL_PREFIX=" : ""),
(opal_prefix != NULL ? opal_prefix : ""),
(opal_prefix != NULL ? " ;" : ""),
prefix_dir, bin_base,
prefix_dir, lib_base,
(orted_prefix != NULL ? orted_prefix : ""),
prefix_dir, bin_base,
orted_cmd);
} else if (remote_csh) {
/* [t]csh is a bit more challenging -- we
have to check whether LD_LIBRARY_PATH
is already set before we try to set it.
Must be very careful about obeying
[t]csh's order of evaluation and not
using a variable before it is defined.
See this thread for more details:
http://www.open-mpi.org/community/lists/users/2006/01/0517.php. */
/* if there is nothing preceding orted, then we can just
* assemble the cmd with the orted_cmd at the end. Otherwise,
* we have to insert the orted_prefix in the right place
*/
asprintf (&final_cmd,
"%s%s%s set path = ( %s/%s $path ) ; "
"if ( $?LD_LIBRARY_PATH == 1 ) "
"set OMPI_have_llp ; "
"if ( $?LD_LIBRARY_PATH == 0 ) "
"setenv LD_LIBRARY_PATH %s/%s ; "
"if ( $?OMPI_have_llp == 1 ) "
"setenv LD_LIBRARY_PATH %s/%s:$LD_LIBRARY_PATH ; "
"%s %s/%s/%s",
(opal_prefix != NULL ? "setenv OPAL_PREFIX " : ""),
(opal_prefix != NULL ? opal_prefix : ""),
(opal_prefix != NULL ? " ;" : ""),
prefix_dir, bin_base,
prefix_dir, lib_base,
prefix_dir, lib_base,
(orted_prefix != NULL ? orted_prefix : ""),
prefix_dir, bin_base,
orted_cmd);
}
} else {
/* no prefix directory, so just aggregate the result */
asprintf(&final_cmd, "%s %s",
(orted_prefix != NULL ? orted_prefix : ""),
(orted_cmd != NULL ? orted_cmd : ""));
}
/* now add the final cmd to the argv array */
opal_argv_append(&argc, &argv, final_cmd);
free(final_cmd); /* done with this */
if (NULL != orted_prefix) free(orted_prefix);
if (NULL != orted_cmd) free(orted_cmd);
/* if we are not tree launching or debugging, tell the daemon
* to daemonize so we can launch the next group
@ -492,36 +644,6 @@ static int setup_launch(int *argcptr, char ***argvptr,
if (NULL != param) free(param);
}
/* Figure out the basenames for the libdir and bindir. This
requires some explanation:
- Use opal_install_dirs.libdir and opal_install_dirs.bindir.
- After a discussion on the devel-core mailing list, the
developers decided that we should use the local directory
basenames as the basis for the prefix on the remote note.
This does not handle a few notable cases (e.g., if the
libdir/bindir is not simply a subdir under the prefix, if the
libdir/bindir basename is not the same on the remote node as
it is here on the local node, etc.), but we decided that
--prefix was meant to handle "the common case". If you need
something more complex than this, a) edit your shell startup
files to set PATH/LD_LIBRARY_PATH properly on the remove
node, or b) use some new/to-be-defined options that
explicitly allow setting the bindir/libdir on the remote
node. We decided to implement these options (e.g.,
--remote-bindir and --remote-libdir) to orterun when it
actually becomes a problem for someone (vs. a hypothetical
situation).
Hence, for now, we simply take the basename of this install's
libdir and bindir and use it to append this install's prefix
and use that on the remote node.
*/
*lib_base = opal_basename(opal_install_dirs.libdir);
*bin_base = opal_basename(opal_install_dirs.bindir);
/* all done */
*argcptr = argc;
*argvptr = argv;
@ -531,8 +653,6 @@ static int setup_launch(int *argcptr, char ***argvptr,
/* actually ssh the child */
static void ssh_child(int argc, char **argv,
orte_vpid_t vpid, int proc_vpid_index,
int local_exec_index, char *prefix_dir,
char *bin_base, char *lib_base,
bool remote_sh, bool remote_csh)
{
char** env;
@ -566,49 +686,6 @@ static void ssh_child(int argc, char **argv,
exec_argv = argv;
exec_path = strdup(mca_plm_rsh_component.agent_path);
if (NULL != prefix_dir) {
char *opal_prefix = getenv("OPAL_PREFIX");
if (remote_sh) {
asprintf (&argv[local_exec_index],
"%s%s%s PATH=%s/%s:$PATH ; export PATH ; "
"LD_LIBRARY_PATH=%s/%s:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; "
"%s/%s/%s",
(opal_prefix != NULL ? "OPAL_PREFIX=" : ""),
(opal_prefix != NULL ? opal_prefix : ""),
(opal_prefix != NULL ? " ;" : ""),
prefix_dir, bin_base,
prefix_dir, lib_base,
prefix_dir, bin_base,
mca_plm_rsh_component.orted);
} else if (remote_csh) {
/* [t]csh is a bit more challenging -- we
have to check whether LD_LIBRARY_PATH
is already set before we try to set it.
Must be very careful about obeying
[t]csh's order of evaluation and not
using a variable before it is defined.
See this thread for more details:
http://www.open-mpi.org/community/lists/users/2006/01/0517.php. */
asprintf (&argv[local_exec_index],
"%s%s%s set path = ( %s/%s $path ) ; "
"if ( $?LD_LIBRARY_PATH == 1 ) "
"set OMPI_have_llp ; "
"if ( $?LD_LIBRARY_PATH == 0 ) "
"setenv LD_LIBRARY_PATH %s/%s ; "
"if ( $?OMPI_have_llp == 1 ) "
"setenv LD_LIBRARY_PATH %s/%s:$LD_LIBRARY_PATH ; "
"%s/%s/%s",
(opal_prefix != NULL ? "setenv OPAL_PREFIX " : ""),
(opal_prefix != NULL ? opal_prefix : ""),
(opal_prefix != NULL ? " ;" : ""),
prefix_dir, bin_base,
prefix_dir, lib_base,
prefix_dir, lib_base,
prefix_dir, bin_base,
mca_plm_rsh_component.orted);
}
}
/* pass the vpid */
rc = orte_util_convert_vpid_to_string(&var, vpid);
if (ORTE_SUCCESS != rc) {
@ -702,13 +779,11 @@ static int remote_spawn(opal_buffer_t *launch)
int node_name_index1;
int node_name_index2;
int proc_vpid_index;
int local_exec_index;
char **argv = NULL;
char *prefix;
int argc;
int rc;
bool remote_sh = false, remote_csh = false;
char *lib_base = NULL, *bin_base = NULL;
bool failed_launch = true;
pid_t pid;
orte_std_cntr_t n;
@ -753,8 +828,7 @@ static int remote_spawn(opal_buffer_t *launch)
/* setup the launch */
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, orte_process_info.nodename, &node_name_index1, &node_name_index2,
&local_exec_index, &proc_vpid_index, &lib_base, &bin_base,
&remote_sh, &remote_csh))) {
&proc_vpid_index, prefix, &remote_sh, &remote_csh))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
@ -800,8 +874,7 @@ static int remote_spawn(opal_buffer_t *launch)
/* do the ssh launch - this will exit if it fails */
ssh_child(argc, argv, vpid,
proc_vpid_index, local_exec_index, prefix, bin_base,
lib_base, remote_sh, remote_csh);
proc_vpid_index, remote_sh, remote_csh);
} else { /* father */
OPAL_THREAD_LOCK(&mca_plm_rsh_component.lock);
@ -824,14 +897,7 @@ static int remote_spawn(opal_buffer_t *launch)
failed_launch = false;
cleanup:
if (NULL != lib_base) {
free(lib_base);
}
if (NULL != bin_base) {
free(bin_base);
}
cleanup:
if (NULL != argv) {
opal_argv_free(argv);
}
@ -869,13 +935,11 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
int node_name_index1;
int node_name_index2;
int proc_vpid_index;
int local_exec_index;
char **argv = NULL;
char *prefix_dir;
int argc;
int rc;
bool remote_sh = false, remote_csh = false;
char *lib_base = NULL, *bin_base = NULL;
bool failed_launch = true;
orte_app_context_t **apps;
orte_node_t **nodes;
@ -972,8 +1036,7 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
/* setup the launch */
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, nodes[0]->name, &node_name_index1, &node_name_index2,
&local_exec_index, &proc_vpid_index, &lib_base, &bin_base,
&remote_sh, &remote_csh))) {
&proc_vpid_index, prefix_dir, &remote_sh, &remote_csh))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
@ -1100,8 +1163,7 @@ launch:
/* do the ssh launch - this will exit if it fails */
ssh_child(argc, argv, nodes[nnode]->daemon->name.vpid,
proc_vpid_index, local_exec_index, prefix_dir, bin_base,
lib_base, remote_sh, remote_csh);
proc_vpid_index, remote_sh, remote_csh);
} else { /* father */
@ -1167,13 +1229,6 @@ launch_apps:
failed_launch = false;
cleanup:
if (NULL != lib_base) {
free(lib_base);
}
if (NULL != bin_base) {
free(bin_base);
}
if (NULL != argv) {
opal_argv_free(argv);
}

Просмотреть файл

@ -26,22 +26,18 @@
BEGIN_C_DECLS
struct orte_plm_slurm_component_t {
orte_plm_base_component_t super;
int priority;
char *orted;
char *custom_args;
};
typedef struct orte_plm_slurm_component_t orte_plm_slurm_component_t;
struct orte_plm_slurm_component_t {
orte_plm_base_component_t super;
char *custom_args;
};
typedef struct orte_plm_slurm_component_t orte_plm_slurm_component_t;
/*
* Globally exported variable
*/
ORTE_MODULE_DECLSPEC extern orte_plm_slurm_component_t
mca_plm_slurm_component;
ORTE_DECLSPEC extern orte_plm_base_module_t
orte_plm_slurm_module;
/*
* Globally exported variable
*/
ORTE_MODULE_DECLSPEC extern orte_plm_slurm_component_t mca_plm_slurm_component;
ORTE_DECLSPEC extern orte_plm_base_module_t orte_plm_slurm_module;
END_C_DECLS

Просмотреть файл

@ -91,15 +91,6 @@ static int plm_slurm_open(void)
{
mca_base_component_t *comp = &mca_plm_slurm_component.super.base_version;
mca_base_param_reg_int(comp, "priority", "Default selection priority",
false, false, 75,
&mca_plm_slurm_component.priority);
mca_base_param_reg_string(comp, "orted",
"Command to use to start proxy orted",
false, false, "orted",
&mca_plm_slurm_component.orted);
mca_base_param_reg_string(comp, "args",
"Custom arguments to srun",
false, false, NULL,
@ -113,10 +104,10 @@ static int orte_plm_slurm_component_query(mca_base_module_t **module, int *prior
/* Are we running under a SLURM job? */
if (NULL != getenv("SLURM_JOBID")) {
*priority = mca_plm_slurm_component.priority;
*priority = 75;
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slrum: available for selection",
"%s plm:slurm: available for selection",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
*module = (mca_base_module_t *)&orte_plm_slurm_module;
@ -131,10 +122,6 @@ static int orte_plm_slurm_component_query(mca_base_module_t **module, int *prior
static int plm_slurm_close(void)
{
if (NULL != mca_plm_slurm_component.orted) {
free(mca_plm_slurm_component.orted);
}
if (NULL != mca_plm_slurm_component.custom_args) {
free(mca_plm_slurm_component.custom_args);
}

Просмотреть файл

@ -135,7 +135,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
char **argv = NULL;
int argc;
int rc;
char *tmp, **tmpv;
char *tmp;
char** env = NULL;
char* var;
char *nodelist_flat;
@ -274,11 +274,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
*/
/* add the daemon command (as specified by user) */
tmpv = opal_argv_split(mca_plm_slurm_component.orted, ' ');
for (i = 0; NULL != tmpv && NULL != tmpv[i]; ++i) {
opal_argv_append(&argc, &argv, tmpv[i]);
}
opal_argv_free(tmpv);
orte_plm_base_setup_orted_cmd(&argc, &argv);
/* Add basic orted command line options, including debug flags */
orte_plm_base_orted_append_basic_args(&argc, &argv,

Просмотреть файл

@ -30,7 +30,6 @@ BEGIN_C_DECLS
struct orte_plm_tm_component_t {
orte_plm_base_component_t super;
bool want_path_check;
char *orted;
char **checked_paths;
};
typedef struct orte_plm_tm_component_t orte_plm_tm_component_t;

Просмотреть файл

@ -92,10 +92,6 @@ static int plm_tm_open(void)
int tmp;
mca_base_component_t *comp = &mca_plm_tm_component.super.base_version;
mca_base_param_reg_string(comp, "orted",
"Command to use to start proxy orted",
false, false, "orted",
&mca_plm_tm_component.orted);
mca_base_param_reg_int(comp, "want_path_check",
"Whether the launching process should check for the plm_tm_orted executable in the PATH before launching (the TM API does not give an indication of failure; this is a somewhat-lame workaround; non-zero values enable this check)",
false, false, (int) true, &tmp);

Просмотреть файл

@ -137,7 +137,7 @@ static int plm_tm_launch_job(orte_job_t *jdata)
char **env = NULL;
char *var;
char **argv = NULL;
int argc;
int argc = 0;
int rc;
bool connected = false;
orte_std_cntr_t launched = 0, i;
@ -199,8 +199,7 @@ static int plm_tm_launch_job(orte_job_t *jdata)
}
/* add the daemon command (as specified by user) */
argv = opal_argv_split(mca_plm_tm_component.orted, ' ');
argc = opal_argv_count(argv);
orte_plm_base_setup_orted_cmd(&argc, &argv);
/* Add basic orted command line options */
orte_plm_base_orted_append_basic_args(&argc, &argv, "env",

Просмотреть файл

@ -65,6 +65,7 @@ opal_pointer_array_t orte_daemonmap;
bool orte_hnp_is_allocated = false;
char *orte_launch_agent;
char **orted_cmd_line=NULL;
int orte_exit, orteds_exit;
int orte_exit_status = 0;

Просмотреть файл

@ -350,6 +350,7 @@ ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap;
ORTE_DECLSPEC extern bool orte_hnp_is_allocated;
ORTE_DECLSPEC extern char *orte_launch_agent;
ORTE_DECLSPEC extern char **orted_cmd_line;
ORTE_DECLSPEC extern int orte_exit, orteds_exit;
ORTE_DECLSPEC extern int orte_exit_status;

Просмотреть файл

@ -151,6 +151,11 @@ int orte_register_params(void)
false, false, (int) false, &value);
orte_hetero_apps = OPAL_INT_TO_BOOL(value);
/* allow specification of the launch agent */
mca_base_param_reg_string_name("orte", "launch_agent",
"Command used to start processes on remote nodes (default: orted)",
false, false, "orted", &orte_launch_agent);
#endif /* ORTE_DISABLE_FULL_SUPPORT */
return ORTE_SUCCESS;

Просмотреть файл

@ -147,6 +147,11 @@ static opal_cmd_line_init_t cmd_line_init[] = {
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Provide all output in XML format" },
/* Specify the launch agent to be used */
{ "orte", "launch", "agent", '\0', NULL, "launch-agent", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Command used to start processes on remote nodes (default: orted)" },
/* Preload the binary on the remote machine */
{ NULL, NULL, NULL, 's', NULL, "preload-binary", 0,
&orterun_globals.preload_binary, OPAL_CMD_LINE_TYPE_BOOL,