2010-07-18 01:03:27 +04:00
|
|
|
/* -*- C -*-
|
|
|
|
*
|
|
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here:
https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement
The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation.
In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions:
1. I have at long last bowed my head in submission to the system admins of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior.
2. Both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation.
3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so.
As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes.
This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
|
|
|
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
2010-07-18 01:03:27 +04:00
|
|
|
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
|
2012-04-06 18:23:13 +04:00
|
|
|
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
2010-07-18 01:03:27 +04:00
|
|
|
* reserved.
|
2012-03-09 01:55:19 +04:00
|
|
|
* Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
|
2010-07-18 01:03:27 +04:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include "orte/constants.h"
|
|
|
|
|
|
|
|
#ifdef HAVE_STRING_H
|
|
|
|
#include <string.h>
|
|
|
|
#endif
|
|
|
|
#include <stdio.h>
|
|
|
|
#ifdef HAVE_UNISTD_H
|
|
|
|
#include <unistd.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_PARAM_H
|
|
|
|
#include <sys/param.h>
|
|
|
|
#endif
|
|
|
|
#include <errno.h>
|
|
|
|
#include <signal.h>
|
|
|
|
#include <ctype.h>
|
|
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
|
|
#include <sys/types.h>
|
|
|
|
#endif /* HAVE_SYS_TYPES_H */
|
|
|
|
#ifdef HAVE_SYS_WAIT_H
|
|
|
|
#include <sys/wait.h>
|
|
|
|
#endif /* HAVE_SYS_WAIT_H */
|
|
|
|
#ifdef HAVE_SYS_TIME_H
|
|
|
|
#include <sys/time.h>
|
|
|
|
#endif /* HAVE_SYS_TIME_H */
|
|
|
|
|
|
|
|
#include "orte/mca/plm/plm.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "orte/mca/routed/routed.h"
|
2012-04-06 18:23:13 +04:00
|
|
|
#include "orte/mca/state/state.h"
|
2010-07-18 01:03:27 +04:00
|
|
|
|
|
|
|
#include "orte/util/session_dir.h"
|
|
|
|
#include "orte/util/show_help.h"
|
|
|
|
|
|
|
|
#include "orte/runtime/runtime.h"
|
|
|
|
#include "orte/runtime/orte_globals.h"
|
|
|
|
#include "orte/runtime/orte_quit.h"
|
|
|
|
#include "orte/runtime/orte_locks.h"
|
|
|
|
#include "orte/runtime/orte_data_server.h"
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Globals
|
|
|
|
*/
|
2011-11-23 01:24:35 +04:00
|
|
|
#if !ORTE_DISABLE_FULL_SUPPORT
|
2010-07-18 01:03:27 +04:00
|
|
|
/*
 * Per-outcome process tallies. They are incremented by
 * dump_aborted_procs() and read by orte_quit() to decide whether a
 * "N total processes ..." follow-up line is printed.
 */
static int num_aborted = 0;       /* procs in state ABORTED */
static int num_killed = 0;        /* procs in state ABORTED_BY_SIG or SENSOR_BOUND_EXCEEDED */
static int num_failed_start = 0;  /* procs in state FAILED_TO_START or FAILED_TO_LAUNCH */

/* set by orte_quit() once the abnormal-termination report has been
 * produced, so that it is never produced a second time
 */
static bool errors_reported = false;

/* defined below; reports the first job found in an abnormal state */
static void dump_aborted_procs(void);
|
|
|
|
|
2012-04-06 18:23:13 +04:00
|
|
|
void orte_quit(int fd, short args, void *cbdata)
|
2010-07-18 01:03:27 +04:00
|
|
|
{
|
2012-04-06 18:23:13 +04:00
|
|
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
|
|
|
|
|
|
|
/* cleanup */
|
|
|
|
if (NULL != caddy) {
|
|
|
|
OBJ_RELEASE(caddy);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check one-time lock to protect against "bounce" */
|
|
|
|
if (opal_atomic_trylock(&orte_quit_lock)) { /* returns 1 if already locked */
|
2010-07-18 01:03:27 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2012-04-06 18:23:13 +04:00
|
|
|
/* if we are the hnp and haven't already reported it, then
|
|
|
|
* report any errors
|
2010-07-18 01:03:27 +04:00
|
|
|
*/
|
2012-04-06 18:23:13 +04:00
|
|
|
if (ORTE_PROC_IS_HNP && !errors_reported) {
|
|
|
|
if (0 != orte_exit_status && !orte_execute_quiet) {
|
|
|
|
errors_reported = true;
|
|
|
|
/* abnormal termination of some kind */
|
|
|
|
dump_aborted_procs();
|
|
|
|
/* If we showed more abort messages than were allowed,
|
|
|
|
show a followup message here */
|
|
|
|
if (num_failed_start > 1) {
|
|
|
|
if (orte_xml_output) {
|
|
|
|
fprintf(orte_xml_fp, "<stderr>");
|
|
|
|
}
|
|
|
|
fprintf(orte_xml_fp, "%d total process%s failed to start",
|
|
|
|
num_failed_start, ((num_failed_start > 1) ? "es" : ""));
|
|
|
|
if (orte_xml_output) {
|
|
|
|
fprintf(orte_xml_fp, "
</stderr>");
|
|
|
|
}
|
|
|
|
fprintf(orte_xml_fp, "\n");
|
2010-07-18 01:03:27 +04:00
|
|
|
}
|
2012-04-06 18:23:13 +04:00
|
|
|
if (num_aborted > 1) {
|
|
|
|
if (orte_xml_output) {
|
|
|
|
fprintf(orte_xml_fp, "<stderr>");
|
|
|
|
}
|
|
|
|
fprintf(orte_xml_fp, "%d total process%s aborted",
|
|
|
|
num_aborted, ((num_aborted > 1) ? "es" : ""));
|
|
|
|
if (orte_xml_output) {
|
|
|
|
fprintf(orte_xml_fp, "
</stderr>");
|
|
|
|
}
|
|
|
|
fprintf(orte_xml_fp, "\n");
|
2010-07-18 01:03:27 +04:00
|
|
|
}
|
2012-04-06 18:23:13 +04:00
|
|
|
if (num_killed > 1) {
|
|
|
|
if (orte_xml_output) {
|
|
|
|
fprintf(orte_xml_fp, "<stderr>");
|
|
|
|
}
|
|
|
|
fprintf(orte_xml_fp, "%d total process%s killed (some possibly by %s during cleanup)",
|
|
|
|
num_killed, ((num_killed > 1) ? "es" : ""), orte_basename);
|
|
|
|
if (orte_xml_output) {
|
|
|
|
fprintf(orte_xml_fp, "
</stderr>");
|
|
|
|
}
|
|
|
|
fprintf(orte_xml_fp, "\n");
|
2010-07-18 01:03:27 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2010-11-03 18:39:10 +03:00
|
|
|
|
2012-04-06 18:23:13 +04:00
|
|
|
/* flag that the event lib should no longer be looped
|
|
|
|
* so we will exit
|
|
|
|
*/
|
|
|
|
orte_event_base_active = false;
|
2010-07-18 01:03:27 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* On abnormal termination - dump the
|
|
|
|
* exit status of the aborted procs.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static void dump_aborted_procs(void)
|
|
|
|
{
|
|
|
|
orte_std_cntr_t i, n;
|
|
|
|
orte_proc_t *proc, *pptr;
|
|
|
|
orte_app_context_t *app, *approc;
|
|
|
|
orte_job_t *job;
|
|
|
|
orte_node_t *node;
|
|
|
|
|
|
|
|
/* find the job that caused the problem - be sure to start the loop
|
|
|
|
* at 1 as the daemons are in 0 and will clearly be "running", so no
|
|
|
|
* point in checking them
|
|
|
|
*/
|
|
|
|
for (n=1; n < orte_job_data->size; n++) {
|
|
|
|
if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) {
|
|
|
|
/* the array is no longer left-justified, so we have to continue */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (ORTE_JOB_STATE_UNDEF != job->state &&
|
|
|
|
ORTE_JOB_STATE_INIT != job->state &&
|
|
|
|
ORTE_JOB_STATE_RUNNING != job->state &&
|
|
|
|
ORTE_JOB_STATE_TERMINATED != job->state &&
|
|
|
|
ORTE_JOB_STATE_ABORT_ORDERED != job->state) {
|
|
|
|
/* this is a guilty party */
|
|
|
|
proc = job->aborted_proc;
|
|
|
|
/* always must be at least one app */
|
|
|
|
app = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, 0);
|
|
|
|
/* cycle through and count the number that were killed or aborted */
|
|
|
|
for (i=0; i < job->procs->size; i++) {
|
|
|
|
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(job->procs, i))) {
|
|
|
|
/* array is left-justfied - we are done */
|
|
|
|
continue;
|
|
|
|
}
|
2012-04-06 18:23:13 +04:00
|
|
|
if (ORTE_PROC_STATE_FAILED_TO_START == pptr->state ||
|
|
|
|
ORTE_PROC_STATE_FAILED_TO_LAUNCH == pptr->state) {
|
2010-07-18 01:03:27 +04:00
|
|
|
++num_failed_start;
|
|
|
|
} else if (ORTE_PROC_STATE_ABORTED == pptr->state) {
|
|
|
|
++num_aborted;
|
|
|
|
} else if (ORTE_PROC_STATE_ABORTED_BY_SIG == pptr->state) {
|
|
|
|
++num_killed;
|
2011-06-30 18:11:56 +04:00
|
|
|
} else if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == pptr->state) {
|
|
|
|
++num_killed;
|
2010-07-18 01:03:27 +04:00
|
|
|
}
|
|
|
|
}
|
2011-06-24 00:38:02 +04:00
|
|
|
|
|
|
|
if (NULL == proc) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2010-07-18 01:03:27 +04:00
|
|
|
approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx);
|
|
|
|
node = proc->node;
|
2012-04-06 18:23:13 +04:00
|
|
|
if (ORTE_JOB_STATE_FAILED_TO_START == job->state ||
|
|
|
|
ORTE_JOB_STATE_FAILED_TO_LAUNCH == job->state) {
|
2010-07-18 01:03:27 +04:00
|
|
|
if (NULL == proc) {
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status-no-node", true,
|
|
|
|
orte_basename);
|
|
|
|
return;
|
|
|
|
}
|
2011-10-14 22:46:03 +04:00
|
|
|
switch (proc->exit_code) {
|
2012-03-24 04:49:38 +04:00
|
|
|
case ORTE_ERR_SILENT:
|
|
|
|
/* say nothing - it was already reported */
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_SYS_LIMITS_PIPES:
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
|
|
|
|
orte_basename, proc->node->name,
|
|
|
|
(unsigned long)proc->name.vpid);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_PIPE_SETUP_FAILURE:
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
|
|
|
|
orte_basename, proc->node->name,
|
|
|
|
(unsigned long)proc->name.vpid);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_SYS_LIMITS_CHILDREN:
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
|
|
|
|
orte_basename, proc->node->name,
|
|
|
|
(unsigned long)proc->name.vpid);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_FAILED_GET_TERM_ATTRS:
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
|
|
|
|
orte_basename, proc->node->name,
|
|
|
|
(unsigned long)proc->name.vpid);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_WDIR_NOT_FOUND:
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
|
|
|
|
orte_basename, approc->cwd,
|
|
|
|
proc->node->name, (unsigned long)proc->name.vpid);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_EXE_NOT_FOUND:
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:exe-not-found", true,
|
|
|
|
orte_basename,
|
|
|
|
(unsigned long)proc->name.vpid,
|
|
|
|
orte_basename,
|
|
|
|
orte_basename,
|
|
|
|
proc->node->name,
|
|
|
|
approc->app);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_EXE_NOT_ACCESSIBLE:
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
|
|
|
|
orte_basename, approc->app, proc->node->name,
|
|
|
|
(unsigned long)proc->name.vpid);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_MULTIPLE_AFFINITIES:
|
|
|
|
orte_show_help("help-orterun.txt",
|
|
|
|
"orterun:multiple-paffinity-schemes", true, NULL);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
|
|
|
|
orte_show_help("help-orterun.txt",
|
|
|
|
"orterun:topo-not-supported",
|
|
|
|
true, orte_process_info.nodename, "rankfile containing a slot_list of ",
|
|
|
|
NULL, approc->app);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_INVALID_NODE_RANK:
|
|
|
|
orte_show_help("help-orterun.txt",
|
|
|
|
"orterun:invalid-node-rank", true);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_INVALID_LOCAL_RANK:
|
|
|
|
orte_show_help("help-orterun.txt",
|
|
|
|
"orterun:invalid-local-rank", true);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_NOT_ENOUGH_CORES:
|
|
|
|
orte_show_help("help-orterun.txt",
|
|
|
|
"orterun:not-enough-resources", true,
|
|
|
|
"sockets", node->name,
|
|
|
|
"bind-to-core", approc->app);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED:
|
|
|
|
orte_show_help("help-orterun.txt",
|
|
|
|
"orterun:topo-not-supported",
|
|
|
|
true, node->name, "bind-to-core", "",
|
|
|
|
approc->app);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_INVALID_PHYS_CPU:
|
|
|
|
orte_show_help("help-orterun.txt",
|
|
|
|
"orterun:invalid-phys-cpu", true);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_NOT_ENOUGH_SOCKETS:
|
|
|
|
orte_show_help("help-orterun.txt",
|
|
|
|
"orterun:not-enough-resources", true,
|
|
|
|
"sockets", node->name,
|
|
|
|
"bind-to-socket", approc->app);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED:
|
|
|
|
orte_show_help("help-orterun.txt",
|
|
|
|
"orterun:topo-not-supported",
|
|
|
|
true, node->name, "bind-to-socket", "",
|
|
|
|
approc->app);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_MODULE_NOT_FOUND:
|
|
|
|
orte_show_help("help-orterun.txt",
|
|
|
|
"orterun:paffinity-missing-module",
|
|
|
|
true, node->name);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_SLOT_LIST_RANGE:
|
|
|
|
orte_show_help("help-orterun.txt",
|
|
|
|
"orterun:invalid-slot-list-range",
|
|
|
|
true, node->name, NULL);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_PIPE_READ_FAILURE:
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
|
|
|
|
orte_basename, node->name, (unsigned long)proc->name.vpid);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_SOCKET_NOT_AVAILABLE:
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
|
|
|
|
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
|
|
|
(unsigned long)proc->name.vpid);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
if (0 != proc->exit_code) {
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
|
2010-07-18 01:03:27 +04:00
|
|
|
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
|
|
|
(unsigned long)proc->name.vpid);
|
2012-03-24 04:49:38 +04:00
|
|
|
} else {
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
|
|
|
|
orte_basename, node->name);
|
|
|
|
}
|
|
|
|
break;
|
2010-07-18 01:03:27 +04:00
|
|
|
}
|
|
|
|
} else if (ORTE_JOB_STATE_ABORTED == job->state) {
|
|
|
|
if (NULL == proc) {
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-aborted-unknown", true,
|
|
|
|
orte_basename);
|
|
|
|
} else {
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-ordered-abort", true,
|
|
|
|
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
|
|
|
node->name, orte_basename);
|
|
|
|
}
|
|
|
|
} else if (ORTE_JOB_STATE_ABORTED_BY_SIG == job->state) { /* aborted by signal */
|
|
|
|
if (NULL == proc) {
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-aborted-signal-unknown", true,
|
|
|
|
orte_basename);
|
|
|
|
} else {
|
|
|
|
#ifdef HAVE_STRSIGNAL
|
|
|
|
if (NULL != strsignal(WTERMSIG(proc->exit_code))) {
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", true,
|
|
|
|
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
|
|
|
node->name, WTERMSIG(proc->exit_code),
|
|
|
|
strsignal(WTERMSIG(proc->exit_code)));
|
|
|
|
} else {
|
|
|
|
#endif
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-aborted", true,
|
|
|
|
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
|
|
|
node->name, WTERMSIG(proc->exit_code));
|
|
|
|
#ifdef HAVE_STRSIGNAL
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
} else if (ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state) { /* proc exited w/o finalize */
|
|
|
|
if (NULL == proc) {
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync-unknown", true,
|
|
|
|
orte_basename, orte_basename);
|
|
|
|
} else {
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true,
|
|
|
|
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
|
|
|
node->name, orte_basename, orte_basename);
|
|
|
|
}
|
|
|
|
} else if (ORTE_JOB_STATE_COMM_FAILED == job->state) {
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-comm-failed", true,
|
2012-03-09 01:55:19 +04:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
2010-07-18 01:03:27 +04:00
|
|
|
ORTE_NAME_PRINT(&proc->name), node->name);
|
|
|
|
} else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) {
|
|
|
|
switch (proc->exit_code) {
|
2012-03-24 04:49:38 +04:00
|
|
|
case ORTE_ERR_MEM_LIMIT_EXCEEDED:
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true,
|
|
|
|
ORTE_NAME_PRINT(&proc->name), node->name);
|
|
|
|
break;
|
|
|
|
case ORTE_ERR_PROC_STALLED:
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-stalled", true);
|
|
|
|
break;
|
2010-07-18 01:03:27 +04:00
|
|
|
|
2012-03-24 04:49:38 +04:00
|
|
|
default:
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true);
|
|
|
|
break;
|
2010-07-18 01:03:27 +04:00
|
|
|
}
|
|
|
|
} else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) {
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true,
|
|
|
|
orte_basename, ORTE_NAME_PRINT(&proc->name), node->name);
|
2011-04-14 19:04:21 +04:00
|
|
|
} else if (orte_abort_non_zero_exit &&
|
|
|
|
ORTE_JOB_STATE_NON_ZERO_TERM == job->state) {
|
|
|
|
orte_show_help("help-orterun.txt", "orterun:non-zero-exit", true,
|
|
|
|
orte_basename, ORTE_NAME_PRINT(&proc->name), proc->exit_code);
|
2010-07-18 01:03:27 +04:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2010-08-18 01:51:38 +04:00
|
|
|
#endif
|