From 8b411a10bee2fea3ff59d34b0de851d14a2067b7 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Wed, 1 Jun 2005 19:30:05 +0000 Subject: [PATCH] - revert to resolving "localhost" to the contents of orte_system_info.nodename so that cleanup and the like occur correctly. Otherwise, the daemon on localhost and an MPI process can have different ideas on what the local nodename is, and that leads to all kinds of badness with both process killing and cleanup. Also fixes the annoying ssh keys problem when sshing to localhost. - modify the rsh pls to ssh to localhost if the target nodename is the same as orte_system_info.nodename AND is not resolvable (i.e., ssh to that nodename would fail). Otherwise, ssh to nodename. This should work around the issues Ralph was seeing with ssh failing on his laptop (since the above change undid the previous fix to this problem). - Small change to ompi_ifislocal() to squelch a warning message about unresolvable hostnames when checking to see if a name is, in fact, resolvable. - Force ORTE process to have same nodename field as its starting daemon (assuming it was started using the fork pls), so that the fork pls can properly kill the process, and clean up its session directory on abnormal exit. This commit was SVN r5914. 
--- src/mca/pls/fork/pls_fork_module.c | 5 ++++ src/mca/pls/rsh/pls_rsh_module.c | 23 +++++++++++++++++- src/mca/rds/hostfile/rds_hostfile.c | 36 +++-------------------------- src/util/if.c | 7 ++++++ src/util/sys_info.c | 7 +++++- 5 files changed, 43 insertions(+), 35 deletions(-) diff --git a/src/mca/pls/fork/pls_fork_module.c b/src/mca/pls/fork/pls_fork_module.c index c4752c8e77..7967961aed 100644 --- a/src/mca/pls/fork/pls_fork_module.c +++ b/src/mca/pls/fork/pls_fork_module.c @@ -218,6 +218,11 @@ static int orte_pls_fork_proc( free(param); free(uri); + /* use same nodename as the starting daemon (us) */ + param = mca_base_param_environ_variable("orte", "base", "nodename"); + ompi_setenv(param, orte_system_info.nodename, true, &environ_copy); + free(param); + /* push name into environment */ orte_ns_nds_env_put(&proc->proc_name, vpid_start, vpid_range, &environ_copy); diff --git a/src/mca/pls/rsh/pls_rsh_module.c b/src/mca/pls/rsh/pls_rsh_module.c index c38b10e4df..8734d30597 100644 --- a/src/mca/pls/rsh/pls_rsh_module.c +++ b/src/mca/pls/rsh/pls_rsh_module.c @@ -45,6 +45,8 @@ #include "mca/base/mca_base_param.h" #include "mca/ns/ns.h" +#include "util/sys_info.h" +#include "util/if.h" #include "mca/pls/pls.h" #include "mca/rml/rml.h" #include "mca/gpr/gpr.h" @@ -376,7 +378,26 @@ int orte_pls_rsh_launch(orte_jobid_t jobid) char **exec_argv; /* setup node name */ - argv[node_name_index1] = node->node_name; + /* this is a bit convoluted. If we are ssh'ing to the + * current machine, we normally want to use the current + * machine's nodename (aka hostname), so that ssh keys are + * found correctly. In some situations (laptops that have + * unresolveable names), we really want to use "localhost" + * instead of nodename. But we don't want to use + * "localhost" all the time, as it makes life difficult + * with ssh keys on shared filesystems. Generally, if you + * have a shared filesystem, you have a resolveable + * nodename, so all should be good. 
+ * + * ompi_ifislocal() will return false if the name isn't + * resolveable (since it needs to resolve the name). + */ + if (0 == strcmp(node->node_name, orte_system_info.nodename) && + ! ompi_ifislocal(node->node_name)) { + argv[node_name_index1] = "localhost"; + } else { + argv[node_name_index1] = node->node_name; + } argv[node_name_index2] = node->node_name; /* initialize daemons process name */ diff --git a/src/mca/rds/hostfile/rds_hostfile.c b/src/mca/rds/hostfile/rds_hostfile.c index 90a120590a..04fa37fb4f 100644 --- a/src/mca/rds/hostfile/rds_hostfile.c +++ b/src/mca/rds/hostfile/rds_hostfile.c @@ -34,10 +34,6 @@ #include "runtime/runtime_types.h" -/* - * Local global variable - */ -static bool localhost_found; static void orte_rds_hostfile_parse_error(void) { @@ -81,32 +77,9 @@ static int orte_rds_hostfile_parse_line(int token, ompi_list_t* existing, ompi_l if (ORTE_RDS_HOSTFILE_STRING == token) { char* node_name = orte_rds_hostfile_value.sval; - /* If a line for "localhost" was included, we do NOT allow - * any other hosts to be specified in the file. This is due to the - * vaguery of the "nodename" parameter returned by Linux system calls. - * See the man page for uname for a detailed explanation - */ - if (0 == strcmp(node_name, "localhost")) { - - /* If the size of the updates list == 1 and it only - contains localhost, or if the size of the updates list - == 0, we're ok. Otherwise, this is an error. The - positive logic test was a little clearer than a - negative logic check, so even though this results in - potentially re-setting localhost_found=true multiple - times (if "localhost" is included multiple times in the - file), the code is clearer this way. 
*/ - - if (0 == ompi_list_get_size(updates) || - (1 == ompi_list_get_size(updates) && localhost_found)) { - localhost_found = true; - } else { - ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); - return ORTE_ERR_VALUE_OUT_OF_BOUNDS; - } - } else if (localhost_found) { - ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); - return ORTE_ERR_VALUE_OUT_OF_BOUNDS; + /* convert this into something globally unique */ + if(strcmp(node_name, "localhost") == 0) { + node_name = orte_system_info.nodename; } /* Do we need to make a new node object? First check to see @@ -279,9 +252,6 @@ static int orte_rds_hostfile_query(void) ompi_list_item_t *item; int rc; - /* initialize the localhost_found flag */ - localhost_found = false; - OBJ_CONSTRUCT(&existing, ompi_list_t); OBJ_CONSTRUCT(&updates, ompi_list_t); rc = orte_ras_base_node_query(&existing); diff --git a/src/util/if.c b/src/util/if.c index 3622432e28..897b5a5c92 100644 --- a/src/util/if.c +++ b/src/util/if.c @@ -622,6 +622,13 @@ ompi_ifislocal(char *hostname) { char addrname[ADDRLEN - 1]; int ret; + struct hostent *h; + + /* ompi_ifaddrtoname will complain (rightly) if hostname is not + resolveable. check to make sure it's resolveable. If not, + definitely not local... 
*/ + h = gethostbyname(hostname); + if (NULL == h) return false; ret = ompi_ifaddrtoname(hostname, addrname, ADDRLEN); if (OMPI_SUCCESS == ret) return true; diff --git a/src/util/sys_info.c b/src/util/sys_info.c index e3aa14cbd1..861f66c14f 100644 --- a/src/util/sys_info.c +++ b/src/util/sys_info.c @@ -33,6 +33,8 @@ #include "mca/errmgr/errmgr.h" #include "util/output.h" #include "util/printf.h" +#include "mca/base/mca_base_param.h" + #include "util/sys_info.h" @@ -92,7 +94,10 @@ int orte_sys_info(void) } else { orte_system_info.sysname = strdup(sys_info.sysname); if (NULL == orte_system_info.nodename) { - orte_system_info.nodename = strdup(sys_info.nodename); + /* make sure we weren't given a nodename by environment */ + int id = mca_base_param_register_string("orte", "base", "nodename", + NULL, sys_info.nodename); + mca_base_param_lookup_string(id, &(orte_system_info.nodename)); } orte_system_info.release = strdup(sys_info.release); orte_system_info.version = strdup(sys_info.version);