1
1

Clean up the orted cmd line - we don't need to pass nodenames, and shouldn't pass heartbeat unless the orted is going to use it. This helps shorten the cmd line for future use.

Clean up the logic for when an orted actually opens the PLM. Unfortunately, some unmentionable people are pushing head node environments out to remote nodes, causing the daemons to think they are the HNP. This helps prevent the confusion.

This commit was SVN r19518.
Этот коммит содержится в:
Ralph Castain 2008-09-08 15:45:11 +00:00
родитель 04ee20a880
Коммит 9b8473fdbf
14 изменённых файлов: 67 добавлений и 67 удалений

Просмотреть файл

@ -58,24 +58,39 @@
#include "orte/mca/ess/base/base.h"
static bool plm_in_use;
int orte_ess_base_orted_setup(void)
{
int ret;
char *error = NULL;
char *plm_to_use;
/* some environments allow remote launches - e.g., ssh - so
* open the PLM and select something
* open the PLM and select something -only- if we are given
* a specific module to use
*/
if (ORTE_SUCCESS != (ret = orte_plm_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_base_open";
goto error;
}
mca_base_param_reg_string_name("plm", NULL,
"Which plm component to use (empty = none)",
false, false,
NULL, &plm_to_use);
if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_base_select";
goto error;
if (NULL == plm_to_use) {
plm_in_use = false;
} else {
plm_in_use = true;
if (ORTE_SUCCESS != (ret = orte_plm_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_base_select";
goto error;
}
}
/* Setup the communication infrastructure */
@ -135,6 +150,21 @@ int orte_ess_base_orted_setup(void)
goto error;
}
/* Now provide a chance for the PLM
* to perform any module-specific init functions. This
* needs to occur AFTER the communications are setup
* as it may involve starting a non-blocking recv
* Do this only if a specific PLM was given to us - the
* orted has no need of the proxy PLM at all
*/
if (plm_in_use) {
if (ORTE_SUCCESS != (ret = orte_plm.init())) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_init";
goto error;
}
}
/* setup my session directory */
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
@ -275,7 +305,9 @@ int orte_ess_base_orted_finalize(void)
orte_iof_base_close();
/* finalize selected modules */
orte_plm_base_close();
if (plm_in_use) {
orte_plm_base_close();
}
orte_errmgr_base_close();
/* now can close the rml and its friendly group comm */

Просмотреть файл

@ -272,7 +272,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
orte_plm_base_orted_append_basic_args(&argc, &argv,
"alps",
&proc_vpid_index,
NULL, false);
false);
/* tell the new daemons the base of the name list so they can compute
* their own name on the other end

Просмотреть файл

@ -731,7 +731,6 @@ int orte_plm_base_setup_orted_cmd(int *argc, char ***argv)
int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
char *ess,
int *proc_vpid_index,
int *node_name_index,
bool heartbeat)
{
char *param = NULL;
@ -769,7 +768,7 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
opal_argv_append(argc, argv, param);
free(param);
}
if (heartbeat) {
if (heartbeat && 0 < orte_heartbeat_rate) {
/* tell the daemon to do a heartbeat */
opal_argv_append(argc, argv, "--heartbeat");
asprintf(&param, "%d", orte_heartbeat_rate);
@ -821,13 +820,6 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
opal_argv_append(argc, argv, param);
free(param);
/* Node Name */
if(NULL != node_name_index) {
opal_argv_append(argc, argv, "--nodename");
*node_name_index = *argc;
opal_argv_append(argc, argv, "<template>");
}
/* pass along any cmd line MCA params provided to mpirun,
* being sure to "purge" any that would cause problems
* on backend nodes
@ -839,11 +831,22 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
* have a generic way of passing it as some environments ignore
* any quotes we add, while others don't - so we ignore any
* such options. In most cases, this won't be a problem as
* they typically only apply to things of interest to the HNP
* they typically only apply to things of interest to the HNP.
* Individual environments can add these back into the cmd line
* as they know if it can be supported
*/
if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
continue;
}
/* The daemon will attempt to open the PLM on the remote
* end. Only a few environments allow this, so the daemon
* only opens the PLM -if- it is specifically told to do
* so by giving it a specific PLM module. To ensure we avoid
* confusion, do not include any directives here
*/
if (0 == strcmp(orted_cmd_line[i+1], "plm")) {
continue;
}
/* must be okay - pass it along */
opal_argv_append(argc, argv, orted_cmd_line[i]);
opal_argv_append(argc, argv, orted_cmd_line[i+1]);

Просмотреть файл

@ -71,7 +71,7 @@ int orte_plm_base_select(void)
orte_plm = *best_module;
orte_plm_base.selected_component = *best_component;
orte_plm_base.selected = true;
cleanup:
return exit_status;
}

Просмотреть файл

@ -106,7 +106,6 @@ ORTE_DECLSPEC void orte_plm_base_recv(int status, orte_process_name_t* sender,
ORTE_DECLSPEC int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
char *sds,
int *proc_vpid_index,
int *node_name_index,
bool heartbeat);
/*

Просмотреть файл

@ -130,7 +130,7 @@ static int plm_ccp_launch_job(orte_job_t *jdata)
orte_std_cntr_t launched = 0, i;
orte_job_map_t *map = NULL;
int argc, rc, node_name_index, proc_vpid_index, proc_name_index;
int argc, rc, proc_vpid_index, proc_name_index;
char *param, **env = NULL, *var, **argv = NULL;
bool connected = false;
char *bin_base = NULL, *lib_base = NULL, *command_line;
@ -222,7 +222,7 @@ GETMAP:
/* Add basic orted command line options */
orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
&proc_vpid_index,
&node_name_index, false);
false);
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
@ -383,10 +383,6 @@ GETMAP:
continue;
}
/* setup node name */
free(argv[node_name_index]);
argv[node_name_index] = strdup(node->name);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:ccp: launching on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -228,7 +228,7 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
orte_plm_base_orted_append_basic_args(&argc, &argv,
"lsf",
&proc_vpid_index,
NULL, false);
false);
/* tell the new daemons the base of the name list so they can compute
* their own name on the other end

Просмотреть файл

@ -447,7 +447,6 @@ static void orte_plm_process_wait_daemon(pid_t pid, int status, void* cbdata)
int orte_plm_process_launch(orte_job_t *jdata)
{
orte_job_map_t *map = NULL;
int node_name_index2;
int proc_vpid_index;
int local_exec_index;
char *vpid_string = NULL;
@ -564,7 +563,7 @@ int orte_plm_process_launch(orte_job_t *jdata)
orte_plm_base_orted_append_basic_args(&argc, &argv,
"env",
&proc_vpid_index,
&node_name_index2, false);
false);
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
@ -637,10 +636,6 @@ int orte_plm_process_launch(orte_job_t *jdata)
return ORTE_ERR_FATAL;
}
/* setup node name */
free(argv[node_name_index2]);
argv[node_name_index2] = strdup(nodes[nnode]->name);
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,

Просмотреть файл

@ -341,7 +341,7 @@ static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
static int setup_launch(int *argcptr, char ***argvptr,
char *nodename,
int *node_name_index1, int *node_name_index2,
int *node_name_index1,
int *proc_vpid_index, char *prefix_dir,
bool *remote_sh, bool *remote_csh)
{
@ -610,7 +610,6 @@ static int setup_launch(int *argcptr, char ***argvptr,
orte_plm_base_orted_append_basic_args(&argc, &argv,
"env",
proc_vpid_index,
node_name_index2,
true);
/* in the rsh environment, we can append multi-word arguments
@ -778,7 +777,6 @@ static int remote_spawn(opal_buffer_t *launch)
orte_vpid_t vpid;
orte_nid_t **nodes;
int node_name_index1;
int node_name_index2;
int proc_vpid_index;
char **argv = NULL;
char *prefix;
@ -828,7 +826,7 @@ static int remote_spawn(opal_buffer_t *launch)
}
/* setup the launch */
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, orte_process_info.nodename, &node_name_index1, &node_name_index2,
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, orte_process_info.nodename, &node_name_index1,
&proc_vpid_index, prefix, &remote_sh, &remote_csh))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -855,9 +853,6 @@ static int remote_spawn(opal_buffer_t *launch)
free(argv[node_name_index1]);
argv[node_name_index1] = strdup(nodes[vpid]->name);
free(argv[node_name_index2]);
argv[node_name_index2] = strdup(nodes[vpid]->name);
/* fork a child to exec the rsh/ssh session */
pid = fork();
if (pid < 0) {
@ -934,7 +929,6 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
{
orte_job_map_t *map = NULL;
int node_name_index1;
int node_name_index2;
int proc_vpid_index;
char **argv = NULL;
char *prefix_dir;
@ -1038,7 +1032,7 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
prefix_dir = apps[0]->prefix_dir;
/* setup the launch */
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, nodes[0]->name, &node_name_index1, &node_name_index2,
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, nodes[0]->name, &node_name_index1,
&proc_vpid_index, prefix_dir, &remote_sh, &remote_csh))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -1148,9 +1142,6 @@ launch:
argv[node_name_index1] = strdup(nodes[nnode]->name);
}
free(argv[node_name_index2]);
argv[node_name_index2] = strdup(nodes[nnode]->name);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rsh: launching on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -280,7 +280,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
orte_plm_base_orted_append_basic_args(&argc, &argv,
"slurm",
&proc_vpid_index,
NULL, false);
false);
/* tell the new daemons the base of the name list so they can compute
* their own name on the other end

Просмотреть файл

@ -341,7 +341,6 @@ int orte_plm_submit_launch(orte_job_t *jdata)
orte_job_map_t *map;
orte_std_cntr_t num_nodes;
int node_name_index1;
int node_name_index2;
int proc_vpid_index;
int local_exec_index, local_exec_index_end;
char *vpid_string = NULL;
@ -545,8 +544,7 @@ int orte_plm_submit_launch(orte_job_t *jdata)
*/
orte_plm_base_orted_append_basic_args(&argc, &argv,
"env",
&proc_vpid_index,
&node_name_index2);
&proc_vpid_index);
local_exec_index_end = argc;
if (mca_plm_submit_component.debug) {
@ -612,9 +610,6 @@ int orte_plm_submit_launch(orte_job_t *jdata)
argv[node_name_index1] = strdup(nodes[nnode]->name);
}
free(argv[node_name_index2]);
argv[node_name_index2] = strdup(nodes[nnode]->name);
/* fork a child to exec the submit/ssh session */
pid = fork();

Просмотреть файл

@ -130,7 +130,6 @@ static int plm_tm_launch_job(orte_job_t *jdata)
orte_job_map_t *map = NULL;
orte_app_context_t **apps;
orte_node_t **nodes;
int node_name_index;
int proc_vpid_index;
char *param;
char **env = NULL;
@ -204,7 +203,6 @@ static int plm_tm_launch_job(orte_job_t *jdata)
/* Add basic orted command line options */
orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
&proc_vpid_index,
&node_name_index,
true);
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
@ -289,10 +287,6 @@ static int plm_tm_launch_job(orte_job_t *jdata)
continue;
}
/* setup node name */
free(argv[node_name_index]);
argv[node_name_index] = strdup(node->name);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: launching on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -438,7 +438,6 @@ cleanup:
orte_plm_base_orted_append_basic_args(&argc, &argv,
"env",
NULL,
NULL,
true);
/* Note that capacity is a starting capacity, not max */

Просмотреть файл

@ -167,10 +167,6 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
&orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
"Direct the orted to separate from the current session"},
{ NULL, NULL, NULL, '\0', NULL, "nodename", 1,
&orte_process_info.nodename, OPAL_CMD_LINE_TYPE_STRING,
"Node name as specified by host/resource description." },
{ "tmpdir", "base", NULL, '\0', NULL, "tmpdir", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Set the root for the session directory tree" },