Handle the case where the allocator "fibs" to us about the node names. In some cases (ahem...you know who you are!), the allocator will tell us a node number (e.g., "16"). However, the daemon will return a node name (e.g., "nid0016"), leaving us unable to recognize its location.
So provide a new parameter (can't have too many!) that handles this situation by stripping the prefix (all leading letters and zeroes) from the returned node name, so that "nid0016" becomes "16". Also do a little cleanup to ensure we exit cleanly on errors, without generating too many annoying messages. This commit was SVN r25562.
Этот коммит содержится в:
родитель
bdc7f7a4ef
Коммит
07655e2945
@ -77,6 +77,9 @@ orte_tmpdir_base = /var/tmp
|
||||
## from inadvertent job executions
|
||||
orte_allocation_required = 1
|
||||
|
||||
## Deal with the allocator
|
||||
plm_base_strip_prefix_from_node_names = 1
|
||||
|
||||
## MPI behavior
|
||||
## Do NOT specify mpi_leave_pinned so system
|
||||
## can figure out for itself whether or not
|
||||
|
@ -77,6 +77,9 @@ orte_tmpdir_base = /var/tmp
|
||||
## from inadvertent job executions
|
||||
orte_allocation_required = 1
|
||||
|
||||
## Deal with the allocator
|
||||
plm_base_strip_prefix_from_node_names = 1
|
||||
|
||||
## MPI behavior
|
||||
## Do NOT specify mpi_leave_pinned so system
|
||||
## can figure out for itself whether or not
|
||||
|
@ -1,5 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
|
@ -241,6 +241,25 @@ static int update_state(orte_jobid_t job,
|
||||
default_hnp_abort(jdata->jobid, sts);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_JOB_STATE_SILENT_ABORT:
|
||||
failed_start(jdata);
|
||||
check_job_complete(jdata); /* set the local proc states */
|
||||
/* the job object for this job will have been NULL'd
|
||||
* in the array if the job was solely local. If it isn't
|
||||
* NULL, then we need to tell everyone else to die
|
||||
*/
|
||||
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
||||
if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) {
|
||||
/* set the flag indicating that a daemon failed so we use the proper
|
||||
* methods for attempting to shutdown the rest of the system
|
||||
*/
|
||||
orte_abnormal_term_ordered = true;
|
||||
}
|
||||
default_hnp_abort(jdata->jobid, exit_code);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_JOB_STATE_RUNNING:
|
||||
/* update all procs in job */
|
||||
update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0);
|
||||
|
@ -326,10 +326,11 @@ static int plm_alps_launch_job(orte_job_t *jdata)
|
||||
the ALPS plm) */
|
||||
cur_prefix = NULL;
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
char *app_prefix_dir;
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
continue;
|
||||
}
|
||||
char * app_prefix_dir = app->prefix_dir;
|
||||
app_prefix_dir = app->prefix_dir;
|
||||
/* Check for already set cur_prefix_dir -- if different,
|
||||
complain */
|
||||
if (NULL != app_prefix_dir) {
|
||||
|
@ -23,6 +23,15 @@ This is an unusual error; it means that Open RTE was unable to find
|
||||
any mechanism to launch processes, and therefore is unable to start the
|
||||
process(es) required by your application.
|
||||
#
|
||||
[daemon-no-assigned-node]
|
||||
A daemon has no recorded node:
|
||||
|
||||
Daemon: %s
|
||||
Reported from nodename: %s
|
||||
|
||||
This usually indicates a difference between the names of nodes in the
|
||||
allocation versus what is returned on the node by get_hostname.
|
||||
#
|
||||
[daemon-died-no-signal]
|
||||
A daemon died unexpectedly with status %d while attempting
|
||||
to launch so we are aborting.
|
||||
|
@ -31,6 +31,7 @@
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
#include <ctype.h>
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
@ -464,41 +465,47 @@ static void process_orted_launch_report(int fd, short event, void *data)
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:orted_report_launch from daemon %s on node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&peer), nodename));
|
||||
|
||||
/* look this node up, if necessary */
|
||||
if (!orte_plm_globals.daemon_nodes_assigned_at_launch) {
|
||||
if (!orte_have_fqdn_allocation) {
|
||||
/* remove any domain info */
|
||||
if (NULL != (ptr = strchr(nodename, '.'))) {
|
||||
*ptr = '\0';
|
||||
ptr = strdup(nodename);
|
||||
free(nodename);
|
||||
nodename = ptr;
|
||||
}
|
||||
}
|
||||
if (orte_plm_globals.strip_prefix_from_node_names) {
|
||||
/* remove all leading characters and zeroes */
|
||||
ptr = nodename;
|
||||
while (idx < (int)strlen(nodename) &&
|
||||
(isalpha(nodename[idx]) || '0' == nodename[idx])) {
|
||||
idx++;
|
||||
}
|
||||
if (idx == (int)strlen(nodename)) {
|
||||
ptr = strdup(nodename);
|
||||
} else {
|
||||
ptr = strdup(&nodename[idx]);
|
||||
}
|
||||
free(nodename);
|
||||
nodename = ptr;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:orted_report_launch attempting to assign daemon %s to node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&peer), nodename));
|
||||
for (idx=0; idx < orte_node_pool->size; idx++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, idx))) {
|
||||
continue;
|
||||
}
|
||||
if (NULL != node->daemon) {
|
||||
/* already known */
|
||||
if (0 == strcmp(nodename, node->name)) {
|
||||
/* this shouldn't happen, but protect against it just in case */
|
||||
opal_output(0, "%s Node %s already has daemon %s assigned to it - assigning daemon %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
nodename, ORTE_NAME_PRINT(&node->daemon->name),
|
||||
ORTE_NAME_PRINT(&daemon->name));
|
||||
if (NULL != node->daemon->node) {
|
||||
OBJ_RELEASE(node->daemon->node);
|
||||
node->daemon->node = NULL;
|
||||
}
|
||||
OBJ_RELEASE(node->daemon);
|
||||
node->daemon = daemon;
|
||||
OBJ_RETAIN(daemon);
|
||||
if (NULL != daemon->node) {
|
||||
OBJ_RELEASE(daemon->node);
|
||||
}
|
||||
daemon->node = node;
|
||||
daemon->nodename = node->name;
|
||||
OBJ_RETAIN(node);
|
||||
break;
|
||||
}
|
||||
/* already assigned */
|
||||
continue;
|
||||
}
|
||||
if (0 == strcmp(nodename, node->name)) {
|
||||
@ -531,10 +538,8 @@ static void process_orted_launch_report(int fd, short event, void *data)
|
||||
/* this shouldn't happen - it indicates an error in the
|
||||
* prior node matching logic, so report it and error out
|
||||
*/
|
||||
opal_output(0, "%s Daemon %s has no recorded node - returned nodename %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&daemon->name), nodename);
|
||||
rc = ORTE_ERR_FATAL;
|
||||
orte_show_help("help-plm-base.txt", "daemon-no-assigned-node", true,
|
||||
ORTE_NAME_PRINT(&daemon->name), nodename);
|
||||
orted_failed_launch = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
@ -594,8 +599,8 @@ static void process_orted_launch_report(int fd, short event, void *data)
|
||||
OBJ_RELEASE(mev);
|
||||
|
||||
if (orted_failed_launch) {
|
||||
if( NULL != rml_uri ) free(rml_uri);
|
||||
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_FAILED_TO_START,
|
||||
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid,
|
||||
ORTE_JOB_STATE_SILENT_ABORT,
|
||||
NULL, ORTE_PROC_STATE_FAILED_TO_START,
|
||||
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
} else {
|
||||
@ -656,9 +661,13 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
|
||||
ORTE_PROGRESSED_WAIT(orted_failed_launch, orted_num_callback, num_daemons);
|
||||
|
||||
/* cancel the lingering recv */
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK);
|
||||
|
||||
if (orted_failed_launch) {
|
||||
/* we will have already emitted an error log or show
|
||||
* help, so exit quietly from here
|
||||
*/
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
|
||||
#if OPAL_HAVE_HWLOC
|
||||
|
@ -89,6 +89,8 @@ orte_plm_base_module_t orte_plm = {
|
||||
*/
|
||||
int orte_plm_base_open(void)
|
||||
{
|
||||
int value;
|
||||
|
||||
/* Debugging / verbose output. Always have stream open, with
|
||||
verbose set by the mca open system... */
|
||||
orte_plm_globals.output = opal_output_open(NULL);
|
||||
@ -113,6 +115,11 @@ int orte_plm_base_open(void)
|
||||
/* default to assigning daemons to nodes at launch */
|
||||
orte_plm_globals.daemon_nodes_assigned_at_launch = true;
|
||||
|
||||
mca_base_param_reg_int_name("plm", "base_strip_prefix_from_node_names",
|
||||
"Whether to strip leading characters and zeroes from node names returned by daemons",
|
||||
false, false, (int)false, &value);
|
||||
orte_plm_globals.strip_prefix_from_node_names = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* Open up all the components that we can find */
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
|
@ -9,6 +9,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -73,6 +75,8 @@ typedef struct {
|
||||
opal_buffer_t tree_spawn_cmd;
|
||||
/* daemon nodes assigned at launch */
|
||||
bool daemon_nodes_assigned_at_launch;
|
||||
/* handle allocator-to-actual nodename matches */
|
||||
bool strip_prefix_from_node_names;
|
||||
} orte_plm_globals_t;
|
||||
/**
|
||||
* Global instance of PLM framework data
|
||||
|
@ -101,6 +101,7 @@ typedef uint32_t orte_job_state_t;
|
||||
#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
||||
#define ORTE_JOB_STATE_PROCS_MIGRATING 0x00020000 /* procs waiting to migrate */
|
||||
#define ORTE_JOB_STATE_NON_ZERO_TERM 0x00040000 /* at least one process exited with non-zero status */
|
||||
#define ORTE_JOB_STATE_SILENT_ABORT 0x00080000 /* an error occurred and was reported elsewhere, so error out quietly */
|
||||
|
||||
/* the job never even attempted to launch due to an error earlier in the
|
||||
* launch procedure
|
||||
|
@ -11,6 +11,8 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -185,3 +187,13 @@ Unknown binding level:
|
||||
|
||||
Target: %s
|
||||
Cache level: %u
|
||||
#
|
||||
[orte-rmaps-base:missing-daemon]
|
||||
While attempting to build a map of this job, a node
|
||||
was detected to be missing a daemon:
|
||||
|
||||
Node: %s
|
||||
|
||||
This usually indicates a mismatch between what the
|
||||
allocation provided for the node name versus what was
|
||||
actually found on the node.
|
||||
|
@ -87,6 +87,13 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
|
||||
nd = NULL;
|
||||
} else {
|
||||
nd = (orte_node_t*)opal_list_get_last(allocated_nodes);
|
||||
/* sanity check */
|
||||
if (NULL == nd->daemon) {
|
||||
orte_show_help("help-orte-rmaps-base.txt",
|
||||
"orte-rmaps-base:missing-daemon",
|
||||
true, nd->name);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
}
|
||||
for (i=1; i < orte_node_pool->size; i++) {
|
||||
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
||||
@ -108,6 +115,13 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
|
||||
*/
|
||||
OBJ_RETAIN(node);
|
||||
node->mapped = false;
|
||||
/* quick sanity check */
|
||||
if (NULL == node->daemon) {
|
||||
orte_show_help("help-orte-rmaps-base.txt",
|
||||
"orte-rmaps-base:missing-daemon",
|
||||
true, node->name);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
if (NULL == nd || nd->daemon->name.vpid < node->daemon->name.vpid) {
|
||||
/* just append to end */
|
||||
opal_list_append(allocated_nodes, &node->super);
|
||||
|
@ -239,6 +239,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
|
||||
return "PROCS MIGRATING";
|
||||
case ORTE_JOB_STATE_NON_ZERO_TERM:
|
||||
return "AT LEAST ONE PROCESS EXITED WITH NON-ZERO STATUS";
|
||||
case ORTE_JOB_STATE_SILENT_ABORT:
|
||||
return "ERROR REPORTED ELSEWHERE";
|
||||
default:
|
||||
return "UNKNOWN STATE!";
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user