From a62b2a0150e3fc590de7ee0e450ccc30feab19e0 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 30 Jul 2008 18:26:24 +0000 Subject: [PATCH] Per the July technical meeting: Standardize the handling of the orte launch agent option across PLMs. This has been a consistent complaint I have received - each PLM would register its own MCA param to get input on the launch agent for remote nodes (in fact, one or two didn't, but most did). This would then get handled in various and contradictory ways. Some PLMs would accept only a one-word input. Others accepted multi-word args such as "valgrind orted", but then some would error by putting any prefix specified on the cmd line in front of the incorrect argument. For example, while using the rsh launcher, if you specified "valgrind orted" as your launch agent and had "--prefix foo" on you cmd line, you would attempt to execute "ssh foo/valgrind orted" - which obviously wouldn't work. This was all -very- confusing to users, who had to know which PLM was being used so they could even set the right mca param in the first place! And since we don't warn about non-recognized or non-used mca params, half of the time they would wind up not doing what they thought they were telling us to do. To solve this problem, we did the following: 1. removed all mca params from the individual plms for the launch agent 2. added a new mca param "orte_launch_agent" for this purpose. To further simplify for users, this comes with a new cmd line option "--launch-agent" that can take a multi-word string argument. The value of the param defaults to "orted". 3. added a PLM base function that processes the orte_launch_agent value and adds the contents to a provided argv array. This can subsequently be harvested at-will to handle multi-word values 4. modified the PLMs to use this new function. All the PLMs except for the rsh PLM required very minor change - just called the function and moved on. The rsh PLM required much larger changes as - because of the rsh/ssh cmd line limitations - we had to correctly prepend any provided prefix to the correct argv entry. 5. added a new opal_argv_join_range function that allows the caller to "join" argv entries between two specified indices Please let me know of any problems. I tried to make this as clean as possible, but cannot compile all PLMs to ensure all is correct. This commit was SVN r19097. --- opal/util/argv.c | 56 ++++ opal/util/argv.h | 12 +- orte/mca/plm/alps/plm_alps.h | 1 - orte/mca/plm/alps/plm_alps_component.c | 9 - orte/mca/plm/alps/plm_alps_module.c | 2 +- orte/mca/plm/base/plm_base_launch_support.c | 21 ++ orte/mca/plm/base/plm_private.h | 2 + orte/mca/plm/ccp/plm_ccp.h | 1 - orte/mca/plm/ccp/plm_ccp_component.c | 4 - orte/mca/plm/ccp/plm_ccp_module.c | 15 +- orte/mca/plm/lsf/plm_lsf.h | 2 - orte/mca/plm/lsf/plm_lsf_component.c | 12 +- orte/mca/plm/lsf/plm_lsf_module.c | 2 +- orte/mca/plm/rsh/plm_rsh_module.c | 269 ++++++++++++-------- orte/mca/plm/slurm/plm_slurm.h | 26 +- orte/mca/plm/slurm/plm_slurm_component.c | 17 +- orte/mca/plm/slurm/plm_slurm_module.c | 8 +- orte/mca/plm/tm/plm_tm.h | 1 - orte/mca/plm/tm/plm_tm_component.c | 4 - orte/mca/plm/tm/plm_tm_module.c | 5 +- orte/runtime/orte_globals.c | 1 + orte/runtime/orte_globals.h | 1 + orte/runtime/orte_mca_params.c | 5 + orte/tools/orterun/orterun.c | 5 + 24 files changed, 287 insertions(+), 194 deletions(-) diff --git a/opal/util/argv.c b/opal/util/argv.c index a5a024a5e1..817ab7cad1 100644 --- a/opal/util/argv.c +++ b/opal/util/argv.c @@ -269,6 +269,62 @@ char *opal_argv_join(char **argv, int delimiter) } +/* + * Join all the elements of an argv array from within a + * specified range into a single newly-allocated string. + */ +char *opal_argv_join_range(char **argv, size_t start, size_t end, int delimiter) +{ + char **p; + char *pp; + char *str; + size_t str_len = 0; + size_t i; + + /* Bozo case */ + + if (NULL == argv || NULL == argv[0] || (int)start > opal_argv_count(argv)) { + return strdup(""); + } + + /* Find the total string length in argv including delimiters. The + last delimiter is replaced by the NULL character. */ + + for (p = &argv[start], i=start; *p && i < end; ++p, ++i) { + str_len += strlen(*p) + 1; + } + + /* Allocate the string. */ + + if (NULL == (str = (char*) malloc(str_len))) + return NULL; + + /* Loop filling in the string. */ + + str[--str_len] = '\0'; + p = &argv[start]; + pp = *p; + + for (i = 0; i < str_len; ++i) { + if ('\0' == *pp) { + + /* End of a string, fill in a delimiter and go to the next + string. */ + + str[i] = (char) delimiter; + ++p; + pp = *p; + } else { + str[i] = *pp++; + } + } + + /* All done */ + + return str; +} + + /* * Return the number of bytes consumed by an argv array. */ diff --git a/opal/util/argv.h b/opal/util/argv.h index 899bc88f0c..3737d00bb6 100644 --- a/opal/util/argv.h +++ b/opal/util/argv.h @@ -35,9 +35,8 @@ #include #endif -#if defined(c_plusplus) || defined(__cplusplus) -extern "C" { -#endif +BEGIN_C_DECLS + /** * Append a string (by value) to an new or existing NULL-terminated * argv array. @@ -165,6 +164,8 @@ OPAL_DECLSPEC int opal_argv_count(char **argv); */ OPAL_DECLSPEC char *opal_argv_join(char **argv, int delimiter) __opal_attribute_malloc__; +OPAL_DECLSPEC char *opal_argv_join_range(char **argv, size_t start, size_t end, int delimiter) __opal_attribute_malloc__; + /** * Return the number of bytes consumed by an argv array. * @@ -240,8 +241,7 @@ OPAL_DECLSPEC int opal_argv_delete(int *argc, char ***argv, * target). */ OPAL_DECLSPEC int opal_argv_insert(char ***target, int start, char **source); -#if defined(c_plusplus) || defined(__cplusplus) -} -#endif + +END_C_DECLS #endif /* OPAL_ARGV_H */ diff --git a/orte/mca/plm/alps/plm_alps.h b/orte/mca/plm/alps/plm_alps.h index eea32f6600..d42b4f9920 100644 --- a/orte/mca/plm/alps/plm_alps.h +++ b/orte/mca/plm/alps/plm_alps.h @@ -31,7 +31,6 @@ struct orte_plm_alps_component_t { int priority; int debug; bool timing; - char *orted; char *custom_args; }; typedef struct orte_plm_alps_component_t orte_plm_alps_component_t; diff --git a/orte/mca/plm/alps/plm_alps_component.c b/orte/mca/plm/alps/plm_alps_component.c index 405433e74d..3b3e2b63cb 100644 --- a/orte/mca/plm/alps/plm_alps_component.c +++ b/orte/mca/plm/alps/plm_alps_component.c @@ -103,11 +103,6 @@ static int plm_alps_open(void) false, false, 75, &mca_plm_alps_component.priority); - mca_base_param_reg_string(comp, "orted", - "Command to use to start proxy orted", - false, false, "orted", - &mca_plm_alps_component.orted); - mca_plm_alps_component.timing = orte_timing; mca_base_param_reg_string(comp, "args", @@ -129,10 +124,6 @@ static int orte_plm_alps_component_query(mca_base_module_t **module, int *priori static int plm_alps_close(void) { - if (NULL != mca_plm_alps_component.orted) { - free(mca_plm_alps_component.orted); - } - if (NULL != mca_plm_alps_component.custom_args) { free(mca_plm_alps_component.custom_args); } diff --git a/orte/mca/plm/alps/plm_alps_module.c b/orte/mca/plm/alps/plm_alps_module.c index c832347191..84eefcad30 100644 --- a/orte/mca/plm/alps/plm_alps_module.c +++ b/orte/mca/plm/alps/plm_alps_module.c @@ -266,7 +266,7 @@ static int plm_alps_launch_job(orte_job_t *jdata) */ /* add the daemon command (as specified by user) */ - opal_argv_append(&argc, &argv, mca_plm_alps_component.orted); + orte_plm_base_setup_orted_cmd(&argc, &argv); /* Add basic orted command line options, including debug flags */ orte_plm_base_orted_append_basic_args(&argc, &argv, diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 863a073c95..e28f718379 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -700,6 +700,27 @@ static int orte_plm_base_report_launched(orte_jobid_t job) return ORTE_SUCCESS; } +int orte_plm_base_setup_orted_cmd(int *argc, char ***argv) +{ + int i, loc; + char **tmpv; + + /* set default location */ + loc = -1; + /* split the command apart in case it is multi-word */ + tmpv = opal_argv_split(orte_launch_agent, ' '); + for (i = 0; NULL != tmpv && NULL != tmpv[i]; ++i) { + if (0 == strcmp(tmpv[i], "orted")) { + loc = i; + } + opal_argv_append(argc, argv, tmpv[i]); + } + opal_argv_free(tmpv); + + return loc; +} + + int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, char *ess, int *proc_vpid_index, diff --git a/orte/mca/plm/base/plm_private.h b/orte/mca/plm/base/plm_private.h index f544b6ac3a..59a6bfe748 100644 --- a/orte/mca/plm/base/plm_private.h +++ b/orte/mca/plm/base/plm_private.h @@ -75,6 +75,8 @@ ORTE_DECLSPEC int orte_plm_base_set_hnp_name(void); ORTE_DECLSPEC int orte_plm_base_create_jobid(orte_jobid_t *jobid); +ORTE_DECLSPEC int orte_plm_base_setup_orted_cmd(int *argc, char ***argv); + /** * Heartbeat support */ diff --git a/orte/mca/plm/ccp/plm_ccp.h b/orte/mca/plm/ccp/plm_ccp.h index 490eed02a2..4d8e29aa29 100644 --- a/orte/mca/plm/ccp/plm_ccp.h +++ b/orte/mca/plm/ccp/plm_ccp.h @@ -29,7 +29,6 @@ struct orte_plm_ccp_component_t { int debug; int verbose; bool want_path_check; - char *orted; char **checked_paths; bool timing; }; diff --git a/orte/mca/plm/ccp/plm_ccp_component.c b/orte/mca/plm/ccp/plm_ccp_component.c index 37df23b4ab..1628ef3794 100644 --- a/orte/mca/plm/ccp/plm_ccp_component.c +++ b/orte/mca/plm/ccp/plm_ccp_component.c @@ -97,10 +97,6 @@ static int plm_ccp_open(void) mca_base_param_reg_int(comp, "priority", "Default selection priority", false, false, 75, &mca_plm_ccp_component.priority); - mca_base_param_reg_string(comp, "orted", - "Command to use to start proxy orted", - false, false, "orted", - &mca_plm_ccp_component.orted); mca_base_param_reg_int(comp, "want_path_check", "Whether the launching process should check for the plm_ccp_orted executable in the PATH before launching (the CCP API does not give an indication of failure; this is a somewhat-lame workaround; non-zero values enable this check)", false, false, (int) true, &tmp); diff --git a/orte/mca/plm/ccp/plm_ccp_module.c b/orte/mca/plm/ccp/plm_ccp_module.c index bc49f90a96..674a24a4b5 100644 --- a/orte/mca/plm/ccp/plm_ccp_module.c +++ b/orte/mca/plm/ccp/plm_ccp_module.c @@ -95,11 +95,11 @@ orte_plm_base_module_t orte_plm_ccp_module = { plm_ccp_finalize }; - -/* - * Local variables - */ -static orte_jobid_t active_job = ORTE_JOBID_INVALID; + +/* + * Local variables + */ +static orte_jobid_t active_job = ORTE_JOBID_INVALID; /** @@ -209,8 +209,9 @@ GETMAP: } /* add the daemon command (as specified by user) */ - argv = opal_argv_split(mca_plm_ccp_component.orted, ' '); - argc = opal_argv_count(argv); + argc = 0; + argv = NULL; + orte_plm_base_setup_orted_cmd(&argc, &argv); opal_argv_append(&argc, &argv, "--no-daemonize"); diff --git a/orte/mca/plm/lsf/plm_lsf.h b/orte/mca/plm/lsf/plm_lsf.h index 6d83e0e959..d4791d19dd 100644 --- a/orte/mca/plm/lsf/plm_lsf.h +++ b/orte/mca/plm/lsf/plm_lsf.h @@ -30,9 +30,7 @@ BEGIN_C_DECLS struct orte_plm_lsf_component_t { orte_plm_base_component_t super; - int priority; bool timing; - char *orted; }; typedef struct orte_plm_lsf_component_t orte_plm_lsf_component_t; diff --git a/orte/mca/plm/lsf/plm_lsf_component.c b/orte/mca/plm/lsf/plm_lsf_component.c index d8a79c7dba..1dbcb5dbab 100644 --- a/orte/mca/plm/lsf/plm_lsf_component.c +++ b/orte/mca/plm/lsf/plm_lsf_component.c @@ -93,16 +93,6 @@ orte_plm_lsf_component_t mca_plm_lsf_component = { static int plm_lsf_open(void) { - mca_base_component_t *comp = &mca_plm_lsf_component.super.base_version; - - mca_base_param_reg_int(comp, "priority", "Default selection priority", - false, false, 75, &mca_plm_lsf_component.priority); - - mca_base_param_reg_string(comp, "orted", - "Command to use to start proxy orted", - false, false, "orted", - &mca_plm_lsf_component.orted); - return ORTE_SUCCESS; } @@ -125,7 +115,7 @@ static int orte_plm_lsf_component_query(mca_base_module_t **module, int *priorit return ORTE_ERROR; } - *priority = mca_plm_lsf_component.priority; + *priority = 75; *module = (mca_base_module_t *) &orte_plm_lsf_module; return ORTE_SUCCESS; } diff --git a/orte/mca/plm/lsf/plm_lsf_module.c b/orte/mca/plm/lsf/plm_lsf_module.c index 2b10239509..348544001d 100644 --- a/orte/mca/plm/lsf/plm_lsf_module.c +++ b/orte/mca/plm/lsf/plm_lsf_module.c @@ -222,7 +222,7 @@ static int plm_lsf_launch_job(orte_job_t *jdata) */ /* add the daemon command (as specified by user) */ - opal_argv_append(&argc, &argv, mca_plm_lsf_component.orted); + orte_plm_base_setup_orted_cmd(&argc, &argv); /* Add basic orted command line options */ orte_plm_base_orted_append_basic_args(&argc, &argv, diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 6ccf3fe7e2..3887bfbd1d 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -343,8 +343,7 @@ static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata) static int setup_launch(int *argcptr, char ***argvptr, char *nodename, int *node_name_index1, int *node_name_index2, - int *local_exec_index, - int *proc_vpid_index, char **lib_base, char **bin_base, + int *proc_vpid_index, char *prefix_dir, bool *remote_sh, bool *remote_csh) { struct passwd *p; @@ -353,6 +352,11 @@ static int setup_launch(int *argcptr, char ***argvptr, char *param; orte_plm_rsh_shell_t shell; bool local_sh = false, local_csh = false; + char *lib_base, *bin_base; + int orted_argc; + char **orted_argv; + char *orted_cmd, *orted_prefix, *final_cmd; + int orted_index; int rc; /* What is our local shell? */ @@ -428,6 +432,36 @@ static int setup_launch(int *argcptr, char ***argvptr, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), *remote_csh, *remote_sh)); + /* Figure out the basenames for the libdir and bindir. This + requires some explanation: + + - Use opal_install_dirs.libdir and opal_install_dirs.bindir. + + - After a discussion on the devel-core mailing list, the + developers decided that we should use the local directory + basenames as the basis for the prefix on the remote note. + This does not handle a few notable cases (e.g., if the + libdir/bindir is not simply a subdir under the prefix, if the + libdir/bindir basename is not the same on the remote node as + it is here on the local node, etc.), but we decided that + --prefix was meant to handle "the common case". If you need + something more complex than this, a) edit your shell startup + files to set PATH/LD_LIBRARY_PATH properly on the remove + node, or b) use some new/to-be-defined options that + explicitly allow setting the bindir/libdir on the remote + node. We decided to implement these options (e.g., + --remote-bindir and --remote-libdir) to orterun when it + actually becomes a problem for someone (vs. a hypothetical + situation). + + Hence, for now, we simply take the basename of this install's + libdir and bindir and use it to append this install's prefix + and use that on the remote node. + */ + + lib_base = opal_basename(opal_install_dirs.libdir); + bin_base = opal_basename(opal_install_dirs.bindir); + /* * Build argv array */ @@ -436,9 +470,127 @@ static int setup_launch(int *argcptr, char ***argvptr, *node_name_index1 = argc; opal_argv_append(&argc, &argv, "