1
1

- revert to resolving "localhost" to the contents of
  orte_system_info.nodename so that cleanup and the like occur
  correctly.  Otherwise, the daemon on localhost and an MPI process
  can have different ideas on what the local nodename is, and that
  leads to all kinds of badness with both process killing and cleanup.
  Also fixes the annoying ssh keys problem when sshing to localhost.
- modify the rsh pls to ssh to localhost if the target nodename is the
  same as orte_system_info.nodename AND is not resolvable (i.e., ssh to
  nodename would fail).  Otherwise, ssh to nodename.  This should work around
  the issues Ralph was seeing with ssh failing on his laptop (since
  the above change undid the previous fix to this problem).
- Small change to ompi_ifislocal() to squelch a warning message about
  unresolvable hostnames when checking to see if a name is, in fact,
  resolvable.
- Force ORTE process to have same nodename field as its starting
  daemon (assuming it was started using the fork pls), so that the
  fork pls can properly kill the process, and clean up its session
  directory on abnormal exit.

This commit was SVN r5914.
Этот коммит содержится в:
Brian Barrett 2005-06-01 19:30:05 +00:00
родитель 465b54a3f0
Коммит 8b411a10be
5 изменённых файлов: 43 добавлений и 35 удалений

Просмотреть файл

@ -218,6 +218,11 @@ static int orte_pls_fork_proc(
free(param); free(param);
free(uri); free(uri);
/* use same nodename as the starting daemon (us) */
param = mca_base_param_environ_variable("orte", "base", "nodename");
ompi_setenv(param, orte_system_info.nodename, true, &environ_copy);
free(param);
/* push name into environment */ /* push name into environment */
orte_ns_nds_env_put(&proc->proc_name, vpid_start, vpid_range, orte_ns_nds_env_put(&proc->proc_name, vpid_start, vpid_range,
&environ_copy); &environ_copy);

Просмотреть файл

@ -45,6 +45,8 @@
#include "mca/base/mca_base_param.h" #include "mca/base/mca_base_param.h"
#include "mca/ns/ns.h" #include "mca/ns/ns.h"
#include "util/sys_info.h"
#include "util/if.h"
#include "mca/pls/pls.h" #include "mca/pls/pls.h"
#include "mca/rml/rml.h" #include "mca/rml/rml.h"
#include "mca/gpr/gpr.h" #include "mca/gpr/gpr.h"
@ -376,7 +378,26 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
char **exec_argv; char **exec_argv;
/* setup node name */ /* setup node name */
argv[node_name_index1] = node->node_name; /* this is a bit convoluted. If we are ssh'ing to the
* current machine, we normally want to use the current
* machine's nodename (aka hostname), so that ssh keys are
* found correctly. In some situations (laptops that have
* unresolveable names), we really want to use "localhost"
* instead of nodename. But we don't want to use
* "localhost" all the time, as it makes life difficult
* with ssh keys on shared filesystems. Generally, if you
* have a shared filesystem, you have a resolveable
* nodename, so all should be good.
*
* ompi_ifislocal() will return false if the name isn't
* resolveable (since it needs to resolve the name).
*/
if (0 == strcmp(node->node_name, orte_system_info.nodename) &&
! ompi_ifislocal(node->node_name)) {
argv[node_name_index1] = "localhost";
} else {
argv[node_name_index1] = node->node_name;
}
argv[node_name_index2] = node->node_name; argv[node_name_index2] = node->node_name;
/* initialize daemons process name */ /* initialize daemons process name */

Просмотреть файл

@ -34,10 +34,6 @@
#include "runtime/runtime_types.h" #include "runtime/runtime_types.h"
/*
* Local global variable
*/
static bool localhost_found;
static void orte_rds_hostfile_parse_error(void) static void orte_rds_hostfile_parse_error(void)
{ {
@ -81,32 +77,9 @@ static int orte_rds_hostfile_parse_line(int token, ompi_list_t* existing, ompi_l
if (ORTE_RDS_HOSTFILE_STRING == token) { if (ORTE_RDS_HOSTFILE_STRING == token) {
char* node_name = orte_rds_hostfile_value.sval; char* node_name = orte_rds_hostfile_value.sval;
/* If a line for "localhost" was included, we do NOT allow /* convert this into something globally unique */
* any other hosts to be specified in the file. This is due to the if(strcmp(node_name, "localhost") == 0) {
* vaguery of the "nodename" parameter returned by Linux system calls. node_name = orte_system_info.nodename;
* See the man page for uname for a detailed explanation
*/
if (0 == strcmp(node_name, "localhost")) {
/* If the size of the updates list == 1 and it only
contains localhost, or if the size of the updates list
== 0, we're ok. Otherwise, this is an error. The
positive logic test was a little clearer than a
negative logic check, so even though this results in
potentially re-setting localhost_found=true multiple
times (if "localhost" is included multiple times in the
file), the code is clearer this way. */
if (0 == ompi_list_get_size(updates) ||
(1 == ompi_list_get_size(updates) && localhost_found)) {
localhost_found = true;
} else {
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
}
} else if (localhost_found) {
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
} }
/* Do we need to make a new node object? First check to see /* Do we need to make a new node object? First check to see
@ -279,9 +252,6 @@ static int orte_rds_hostfile_query(void)
ompi_list_item_t *item; ompi_list_item_t *item;
int rc; int rc;
/* initialize the localhost_found flag */
localhost_found = false;
OBJ_CONSTRUCT(&existing, ompi_list_t); OBJ_CONSTRUCT(&existing, ompi_list_t);
OBJ_CONSTRUCT(&updates, ompi_list_t); OBJ_CONSTRUCT(&updates, ompi_list_t);
rc = orte_ras_base_node_query(&existing); rc = orte_ras_base_node_query(&existing);

Просмотреть файл

@ -622,6 +622,13 @@ ompi_ifislocal(char *hostname)
{ {
char addrname[ADDRLEN - 1]; char addrname[ADDRLEN - 1];
int ret; int ret;
struct hostent *h;
/* ompi_ifaddrtoname will complain (rightly) if hostname is not
resolveable. check to make sure it's resolveable. If not,
definitely not local... */
h = gethostbyname(hostname);
if (NULL == h) return false;
ret = ompi_ifaddrtoname(hostname, addrname, ADDRLEN); ret = ompi_ifaddrtoname(hostname, addrname, ADDRLEN);
if (OMPI_SUCCESS == ret) return true; if (OMPI_SUCCESS == ret) return true;

Просмотреть файл

@ -33,6 +33,8 @@
#include "mca/errmgr/errmgr.h" #include "mca/errmgr/errmgr.h"
#include "util/output.h" #include "util/output.h"
#include "util/printf.h" #include "util/printf.h"
#include "mca/base/mca_base_param.h"
#include "util/sys_info.h" #include "util/sys_info.h"
@ -92,7 +94,10 @@ int orte_sys_info(void)
} else { } else {
orte_system_info.sysname = strdup(sys_info.sysname); orte_system_info.sysname = strdup(sys_info.sysname);
if (NULL == orte_system_info.nodename) { if (NULL == orte_system_info.nodename) {
orte_system_info.nodename = strdup(sys_info.nodename); /* make sure we weren't given a nodename by environment */
int id = mca_base_param_register_string("orte", "base", "nodename",
NULL, sys_info.nodename);
mca_base_param_lookup_string(id, &(orte_system_info.nodename));
} }
orte_system_info.release = strdup(sys_info.release); orte_system_info.release = strdup(sys_info.release);
orte_system_info.version = strdup(sys_info.version); orte_system_info.version = strdup(sys_info.version);