1
1

Handle the case where the allocator "fibs" to us about the node names. In some cases (ahem...you know who you are!), the allocator will tell us a node number (e.g., "16"). However, the daemon will return a node name (e.g., "nid0016"), leaving us unable to match the daemon to its node.

So provide a new parameter (can't have too many!) that handles this situation by stripping the prefix from the returned node name. Also do a little cleanup to ensure we cleanly exit from errors, without generating too many annoying messages.

This commit was SVN r25562.
Этот коммит содержится в:
Ralph Castain 2011-12-02 14:10:08 +00:00
родитель bdc7f7a4ef
Коммит 07655e2945
13 изменённых файлов: 118 добавлений и 32 удалений

Просмотреть файл

@ -77,6 +77,9 @@ orte_tmpdir_base = /var/tmp
## from inadvertent job executions
orte_allocation_required = 1
## Deal with the allocator
plm_base_strip_prefix_from_node_names = 1
## MPI behavior
## Do NOT specify mpi_leave_pinned so system
## can figure out for itself whether or not

Просмотреть файл

@ -77,6 +77,9 @@ orte_tmpdir_base = /var/tmp
## from inadvertent job executions
orte_allocation_required = 1
## Deal with the allocator
plm_base_strip_prefix_from_node_names = 1
## MPI behavior
## Do NOT specify mpi_leave_pinned so system
## can figure out for itself whether or not

Просмотреть файл

@ -1,5 +1,7 @@
/*
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow

Просмотреть файл

@ -241,6 +241,25 @@ static int update_state(orte_jobid_t job,
default_hnp_abort(jdata->jobid, sts);
}
break;
case ORTE_JOB_STATE_SILENT_ABORT:
failed_start(jdata);
check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
* NULL, then we need to tell everyone else to die
*/
if (NULL != (jdata = orte_get_job_data_object(job))) {
if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) {
/* set the flag indicating that a daemon failed so we use the proper
* methods for attempting to shutdown the rest of the system
*/
orte_abnormal_term_ordered = true;
}
default_hnp_abort(jdata->jobid, exit_code);
}
break;
case ORTE_JOB_STATE_RUNNING:
/* update all procs in job */
update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0);

Просмотреть файл

@ -326,10 +326,11 @@ static int plm_alps_launch_job(orte_job_t *jdata)
the ALPS plm) */
cur_prefix = NULL;
for (i=0; i < jdata->apps->size; i++) {
char *app_prefix_dir;
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
char * app_prefix_dir = app->prefix_dir;
app_prefix_dir = app->prefix_dir;
/* Check for already set cur_prefix_dir -- if different,
complain */
if (NULL != app_prefix_dir) {

Просмотреть файл

@ -23,6 +23,15 @@ This is an unusual error; it means that Open RTE was unable to find
any mechanism to launch processes, and therefore is unable to start the
process(es) required by your application.
#
[daemon-no-assigned-node]
A daemon has no recorded node:
Daemon: %s
Reported from nodename: %s
This usually indicates a difference between the names of nodes in the
allocation versus what is returned on the node by gethostname().
#
[daemon-died-no-signal]
A daemon died unexpectedly with status %d while attempting
to launch so we are aborting.

Просмотреть файл

@ -31,6 +31,7 @@
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include <ctype.h>
#include "opal/util/argv.h"
#include "opal/runtime/opal_progress.h"
@ -464,41 +465,47 @@ static void process_orted_launch_report(int fd, short event, void *data)
goto CLEANUP;
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:orted_report_launch from daemon %s on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer), nodename));
/* look this node up, if necessary */
if (!orte_plm_globals.daemon_nodes_assigned_at_launch) {
if (!orte_have_fqdn_allocation) {
/* remove any domain info */
if (NULL != (ptr = strchr(nodename, '.'))) {
*ptr = '\0';
ptr = strdup(nodename);
free(nodename);
nodename = ptr;
}
}
if (orte_plm_globals.strip_prefix_from_node_names) {
/* remove the leading run of alphabetic characters and zeroes (e.g., "nid0016" becomes "16") */
ptr = nodename;
while (idx < (int)strlen(nodename) &&
(isalpha(nodename[idx]) || '0' == nodename[idx])) {
idx++;
}
if (idx == (int)strlen(nodename)) {
ptr = strdup(nodename);
} else {
ptr = strdup(&nodename[idx]);
}
free(nodename);
nodename = ptr;
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:orted_report_launch attempting to assign daemon %s to node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer), nodename));
for (idx=0; idx < orte_node_pool->size; idx++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, idx))) {
continue;
}
if (NULL != node->daemon) {
/* already known */
if (0 == strcmp(nodename, node->name)) {
/* this shouldn't happen, but protect against it just in case */
opal_output(0, "%s Node %s already has daemon %s assigned to it - assigning daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
nodename, ORTE_NAME_PRINT(&node->daemon->name),
ORTE_NAME_PRINT(&daemon->name));
if (NULL != node->daemon->node) {
OBJ_RELEASE(node->daemon->node);
node->daemon->node = NULL;
}
OBJ_RELEASE(node->daemon);
node->daemon = daemon;
OBJ_RETAIN(daemon);
if (NULL != daemon->node) {
OBJ_RELEASE(daemon->node);
}
daemon->node = node;
daemon->nodename = node->name;
OBJ_RETAIN(node);
break;
}
/* already assigned */
continue;
}
if (0 == strcmp(nodename, node->name)) {
@ -531,10 +538,8 @@ static void process_orted_launch_report(int fd, short event, void *data)
/* this shouldn't happen - it indicates an error in the
* prior node matching logic, so report it and error out
*/
opal_output(0, "%s Daemon %s has no recorded node - returned nodename %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&daemon->name), nodename);
rc = ORTE_ERR_FATAL;
orte_show_help("help-plm-base.txt", "daemon-no-assigned-node", true,
ORTE_NAME_PRINT(&daemon->name), nodename);
orted_failed_launch = true;
goto CLEANUP;
}
@ -594,8 +599,8 @@ static void process_orted_launch_report(int fd, short event, void *data)
OBJ_RELEASE(mev);
if (orted_failed_launch) {
if( NULL != rml_uri ) free(rml_uri);
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_FAILED_TO_START,
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid,
ORTE_JOB_STATE_SILENT_ABORT,
NULL, ORTE_PROC_STATE_FAILED_TO_START,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
} else {
@ -656,9 +661,13 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
ORTE_PROGRESSED_WAIT(orted_failed_launch, orted_num_callback, num_daemons);
/* cancel the lingering recv */
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK))) {
ORTE_ERROR_LOG(rc);
return rc;
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK);
if (orted_failed_launch) {
/* we will have already emitted an error log or show
* help, so exit quietly from here
*/
return ORTE_ERR_SILENT;
}
#if OPAL_HAVE_HWLOC

Просмотреть файл

@ -89,6 +89,8 @@ orte_plm_base_module_t orte_plm = {
*/
int orte_plm_base_open(void)
{
int value;
/* Debugging / verbose output. Always have stream open, with
verbose set by the mca open system... */
orte_plm_globals.output = opal_output_open(NULL);
@ -113,6 +115,11 @@ int orte_plm_base_open(void)
/* default to assigning daemons to nodes at launch */
orte_plm_globals.daemon_nodes_assigned_at_launch = true;
mca_base_param_reg_int_name("plm", "base_strip_prefix_from_node_names",
"Whether to strip leading characters and zeroes from node names returned by daemons",
false, false, (int)false, &value);
orte_plm_globals.strip_prefix_from_node_names = OPAL_INT_TO_BOOL(value);
/* Open up all the components that we can find */
if (ORTE_SUCCESS !=

Просмотреть файл

@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -73,6 +75,8 @@ typedef struct {
opal_buffer_t tree_spawn_cmd;
/* daemon nodes assigned at launch */
bool daemon_nodes_assigned_at_launch;
/* handle allocator-to-actual nodename matches */
bool strip_prefix_from_node_names;
} orte_plm_globals_t;
/**
* Global instance of PLM framework data

Просмотреть файл

@ -101,6 +101,7 @@ typedef uint32_t orte_job_state_t;
#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
#define ORTE_JOB_STATE_PROCS_MIGRATING 0x00020000 /* procs waiting to migrate */
#define ORTE_JOB_STATE_NON_ZERO_TERM 0x00040000 /* at least one process exited with non-zero status */
#define ORTE_JOB_STATE_SILENT_ABORT 0x00080000 /* an error occurred and was reported elsewhere, so error out quietly */
/* the job never even attempted to launch due to an error earlier in the
* launch procedure

Просмотреть файл

@ -11,6 +11,8 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011 Los Alamos National Security, LLC.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -185,3 +187,13 @@ Unknown binding level:
Target: %s
Cache level: %u
#
[orte-rmaps-base:missing-daemon]
While attempting to build a map of this job, a node
was detected to be missing a daemon:
Node: %s
This usually indicates a mismatch between what the
allocation provided for the node name versus what was
actually found on the node.

Просмотреть файл

@ -87,6 +87,13 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
nd = NULL;
} else {
nd = (orte_node_t*)opal_list_get_last(allocated_nodes);
/* sanity check */
if (NULL == nd->daemon) {
orte_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:missing-daemon",
true, nd->name);
return ORTE_ERR_SILENT;
}
}
for (i=1; i < orte_node_pool->size; i++) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
@ -108,6 +115,13 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
*/
OBJ_RETAIN(node);
node->mapped = false;
/* quick sanity check */
if (NULL == node->daemon) {
orte_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:missing-daemon",
true, node->name);
return ORTE_ERR_SILENT;
}
if (NULL == nd || nd->daemon->name.vpid < node->daemon->name.vpid) {
/* just append to end */
opal_list_append(allocated_nodes, &node->super);

Просмотреть файл

@ -239,6 +239,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
return "PROCS MIGRATING";
case ORTE_JOB_STATE_NON_ZERO_TERM:
return "AT LEAST ONE PROCESS EXITED WITH NON-ZERO STATUS";
case ORTE_JOB_STATE_SILENT_ABORT:
return "ERROR REPORTED ELSEWHERE";
default:
return "UNKNOWN STATE!";
}