- Revert to resolving "localhost" to the contents of
  orte_system_info.nodename so that cleanup and the like occur correctly. Otherwise, the daemon on localhost and an MPI process can have different ideas of what the local nodename is, and that led to all kinds of badness with both process killing and cleanup. Also fixes the annoying ssh keys problem when sshing to localhost.
- Modify the rsh pls to ssh to localhost if the target nodename is the same as orte_system_info.nodename AND is not resolvable (i.e., ssh to it would fail). Otherwise, ssh to nodename. This should work around the issues Ralph was seeing with ssh failing on his laptop (since the above change undid the previous fix to this problem).
- Small change to ompi_ifislocal() to squelch a warning message about unresolvable hostnames when checking to see if a name is, in fact, resolvable.
- Force an ORTE process to have the same nodename field as its starting daemon (assuming it was started using the fork pls), so that the fork pls can properly kill the process and clean up its session directory on abnormal exit.

This commit was SVN r5914.
Этот коммит содержится в:
родитель
465b54a3f0
Коммит
8b411a10be
@ -218,6 +218,11 @@ static int orte_pls_fork_proc(
|
|||||||
free(param);
|
free(param);
|
||||||
free(uri);
|
free(uri);
|
||||||
|
|
||||||
|
/* use same nodename as the starting daemon (us) */
|
||||||
|
param = mca_base_param_environ_variable("orte", "base", "nodename");
|
||||||
|
ompi_setenv(param, orte_system_info.nodename, true, &environ_copy);
|
||||||
|
free(param);
|
||||||
|
|
||||||
/* push name into environment */
|
/* push name into environment */
|
||||||
orte_ns_nds_env_put(&proc->proc_name, vpid_start, vpid_range,
|
orte_ns_nds_env_put(&proc->proc_name, vpid_start, vpid_range,
|
||||||
&environ_copy);
|
&environ_copy);
|
||||||
|
@ -45,6 +45,8 @@
|
|||||||
#include "mca/base/mca_base_param.h"
|
#include "mca/base/mca_base_param.h"
|
||||||
|
|
||||||
#include "mca/ns/ns.h"
|
#include "mca/ns/ns.h"
|
||||||
|
#include "util/sys_info.h"
|
||||||
|
#include "util/if.h"
|
||||||
#include "mca/pls/pls.h"
|
#include "mca/pls/pls.h"
|
||||||
#include "mca/rml/rml.h"
|
#include "mca/rml/rml.h"
|
||||||
#include "mca/gpr/gpr.h"
|
#include "mca/gpr/gpr.h"
|
||||||
@ -376,7 +378,26 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
char **exec_argv;
|
char **exec_argv;
|
||||||
|
|
||||||
/* setup node name */
|
/* setup node name */
|
||||||
argv[node_name_index1] = node->node_name;
|
/* this is a bit convoluted. If we are ssh'ing to the
|
||||||
|
* current machine, we normally want to use the current
|
||||||
|
* machine's nodename (aka hostname), so that ssh keys are
|
||||||
|
* found correctly. In some situations (laptops that have
|
||||||
|
* unresolveable names), we really want to use "localhost"
|
||||||
|
* instead of nodename. But we don't want to use
|
||||||
|
* "localhost" all the time, as it makes life difficult
|
||||||
|
* with ssh keys on shared filesystems. Generally, if you
|
||||||
|
* have a shared filesystem, you have a resolveable
|
||||||
|
* nodename, so all should be good.
|
||||||
|
*
|
||||||
|
* ompi_ifislocal() will return false if the name isn't
|
||||||
|
* resolveable (since it needs to resolve the name).
|
||||||
|
*/
|
||||||
|
if (0 == strcmp(node->node_name, orte_system_info.nodename) &&
|
||||||
|
! ompi_ifislocal(node->node_name)) {
|
||||||
|
argv[node_name_index1] = "localhost";
|
||||||
|
} else {
|
||||||
|
argv[node_name_index1] = node->node_name;
|
||||||
|
}
|
||||||
argv[node_name_index2] = node->node_name;
|
argv[node_name_index2] = node->node_name;
|
||||||
|
|
||||||
/* initialize daemons process name */
|
/* initialize daemons process name */
|
||||||
|
@ -34,10 +34,6 @@
|
|||||||
|
|
||||||
#include "runtime/runtime_types.h"
|
#include "runtime/runtime_types.h"
|
||||||
|
|
||||||
/*
|
|
||||||
* Local global variable
|
|
||||||
*/
|
|
||||||
static bool localhost_found;
|
|
||||||
|
|
||||||
static void orte_rds_hostfile_parse_error(void)
|
static void orte_rds_hostfile_parse_error(void)
|
||||||
{
|
{
|
||||||
@ -81,32 +77,9 @@ static int orte_rds_hostfile_parse_line(int token, ompi_list_t* existing, ompi_l
|
|||||||
if (ORTE_RDS_HOSTFILE_STRING == token) {
|
if (ORTE_RDS_HOSTFILE_STRING == token) {
|
||||||
char* node_name = orte_rds_hostfile_value.sval;
|
char* node_name = orte_rds_hostfile_value.sval;
|
||||||
|
|
||||||
/* If a line for "localhost" was included, we do NOT allow
|
/* convert this into something globally unique */
|
||||||
* any other hosts to be specified in the file. This is due to the
|
if(strcmp(node_name, "localhost") == 0) {
|
||||||
* vaguery of the "nodename" parameter returned by Linux system calls.
|
node_name = orte_system_info.nodename;
|
||||||
* See the man page for uname for a detailed explanation
|
|
||||||
*/
|
|
||||||
if (0 == strcmp(node_name, "localhost")) {
|
|
||||||
|
|
||||||
/* If the size of the updates list == 1 and it only
|
|
||||||
contains localhost, or if the size of the updates list
|
|
||||||
== 0, we're ok. Otherwise, this is an error. The
|
|
||||||
positive logic test was a little clearer than a
|
|
||||||
negative logic check, so even though this results in
|
|
||||||
potentially re-setting localhost_found=true multiple
|
|
||||||
times (if "localhost" is included multiple times in the
|
|
||||||
file), the code is clearer this way. */
|
|
||||||
|
|
||||||
if (0 == ompi_list_get_size(updates) ||
|
|
||||||
(1 == ompi_list_get_size(updates) && localhost_found)) {
|
|
||||||
localhost_found = true;
|
|
||||||
} else {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
|
||||||
return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
|
||||||
}
|
|
||||||
} else if (localhost_found) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
|
||||||
return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Do we need to make a new node object? First check to see
|
/* Do we need to make a new node object? First check to see
|
||||||
@ -279,9 +252,6 @@ static int orte_rds_hostfile_query(void)
|
|||||||
ompi_list_item_t *item;
|
ompi_list_item_t *item;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
/* initialize the localhost_found flag */
|
|
||||||
localhost_found = false;
|
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&existing, ompi_list_t);
|
OBJ_CONSTRUCT(&existing, ompi_list_t);
|
||||||
OBJ_CONSTRUCT(&updates, ompi_list_t);
|
OBJ_CONSTRUCT(&updates, ompi_list_t);
|
||||||
rc = orte_ras_base_node_query(&existing);
|
rc = orte_ras_base_node_query(&existing);
|
||||||
|
@ -622,6 +622,13 @@ ompi_ifislocal(char *hostname)
|
|||||||
{
|
{
|
||||||
char addrname[ADDRLEN - 1];
|
char addrname[ADDRLEN - 1];
|
||||||
int ret;
|
int ret;
|
||||||
|
struct hostent *h;
|
||||||
|
|
||||||
|
/* ompi_ifaddrtoname will complain (rightly) if hostname is not
|
||||||
|
resolveable. check to make sure it's resolveable. If not,
|
||||||
|
definitely not local... */
|
||||||
|
h = gethostbyname(hostname);
|
||||||
|
if (NULL == h) return false;
|
||||||
|
|
||||||
ret = ompi_ifaddrtoname(hostname, addrname, ADDRLEN);
|
ret = ompi_ifaddrtoname(hostname, addrname, ADDRLEN);
|
||||||
if (OMPI_SUCCESS == ret) return true;
|
if (OMPI_SUCCESS == ret) return true;
|
||||||
|
@ -33,6 +33,8 @@
|
|||||||
#include "mca/errmgr/errmgr.h"
|
#include "mca/errmgr/errmgr.h"
|
||||||
#include "util/output.h"
|
#include "util/output.h"
|
||||||
#include "util/printf.h"
|
#include "util/printf.h"
|
||||||
|
#include "mca/base/mca_base_param.h"
|
||||||
|
|
||||||
|
|
||||||
#include "util/sys_info.h"
|
#include "util/sys_info.h"
|
||||||
|
|
||||||
@ -92,7 +94,10 @@ int orte_sys_info(void)
|
|||||||
} else {
|
} else {
|
||||||
orte_system_info.sysname = strdup(sys_info.sysname);
|
orte_system_info.sysname = strdup(sys_info.sysname);
|
||||||
if (NULL == orte_system_info.nodename) {
|
if (NULL == orte_system_info.nodename) {
|
||||||
orte_system_info.nodename = strdup(sys_info.nodename);
|
/* make sure we weren't given a nodename by environment */
|
||||||
|
int id = mca_base_param_register_string("orte", "base", "nodename",
|
||||||
|
NULL, sys_info.nodename);
|
||||||
|
mca_base_param_lookup_string(id, &(orte_system_info.nodename));
|
||||||
}
|
}
|
||||||
orte_system_info.release = strdup(sys_info.release);
|
orte_system_info.release = strdup(sys_info.release);
|
||||||
orte_system_info.version = strdup(sys_info.version);
|
orte_system_info.version = strdup(sys_info.version);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user