1
1

Modify orterun to provide more user-friendly reporting on jobs that fail to start

This commit was SVN r14496.
Этот коммит содержится в:
Ralph Castain 2007-04-24 19:19:14 +00:00
родитель 19767802de
Коммит c774f641fb
2 изменённых файлов: 228 добавлений и 54 удалений

Просмотреть файл

@ -86,6 +86,13 @@ Returned value %d instead of ORTE_SUCCESS.
[orterun:attr-failed]
%s was unable to define an attribute
Returned value %d instead of ORTE_SUCCESS.
#
[orterun:proc-ordered-abort]
%s has exited due to process rank %lu with PID %lu on
node %s calling "abort". This will have caused other processes
in the application to be terminated by signals sent by %s
(as reported here).
#
[orterun:proc-aborted]
%s noticed that job rank %lu with PID %lu on node %s exited on signal %d.
[orterun:proc-aborted-strsignal]
@ -137,4 +144,93 @@ Things to check:
#
[orterun:daemon-die]
%s was unable to cleanly terminate the daemons for this job. Returned value %s instead of ORTE_SUCCESS.
#
[orterun:daemon-die]
%s was unable to cleanly terminate the daemons for this job. Returned value %s instead of ORTE_SUCCESS.
#
[orterun:sys-limit-pipe]
%s was unable to launch the specified application as it encountered an error:
Error: system limit exceeded on number of pipes that can be open
Node: %s
when attempting to start process rank %lu.
This can be resolved by either asking the system administrator for that node to
increase the system limit, or by rearranging your processes to place fewer of them
on that node.
#
[orterun:pipe-setup-failure]
%s was unable to launch the specified application as it encountered an error:
Error: pipe function call failed when setting up I/O forwarding subsystem
Node: %s
while attempting to start process rank %lu.
#
[orterun:sys-limit-children]
%s was unable to launch the specified application as it encountered an error:
Error: system limit exceeded on number of processes that can be started
Node: %s
when attempting to start process rank %lu.
This can be resolved by either asking the system administrator for that node to
increase the system limit, or by rearranging your processes to place fewer of them
on that node.
#
[orterun:failed-term-attrs]
%s was unable to launch the specified application as it encountered an error:
Error: reading tty attributes function call failed while setting up I/O forwarding system
Node: %s
while attempting to start process rank %lu.
#
[orterun:wdir-not-found]
%s was unable to launch the specified application as it could not change to the
specified working directory:
Working directory: %s
Node: %s
while attempting to start process rank %lu.
#
[orterun:exe-not-found]
%s was unable to launch the specified application as it could not find an executable:
Executable: %s
Node: %s
while attempting to start process rank %lu.
#
[orterun:exe-not-accessible]
%s was unable to launch the specified application as it could not access
or execute an executable:
Executable: %s
Node: %s
while attempting to start process rank %lu.
#
[orterun:pipe-read-failure]
%s was unable to launch the specified application as it encountered an error:
Error: reading from a pipe function call failed while spawning a local process
Node: %s
while attempting to start process rank %lu.
#
[orterun:proc-failed-to-start]
%s was unable to start the specified application as it encountered an error:
Error name: %s
Node: %s
when attempting to start process rank %lu.
#
[orterun:proc-failed-to-start-no-status]
%s was unable to start the specified application as it encountered an error
on node %s. More information may be available above.

Просмотреть файл

@ -70,6 +70,7 @@
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/rmgr/rmgr.h"
@ -326,7 +327,7 @@ static int parse_globals(int argc, char* argv[]);
static int parse_locals(int argc, char* argv[]);
static int parse_appfile(char *filename, char ***env);
static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state);
static void dump_aborted_procs(orte_jobid_t jobid);
static void dump_aborted_procs(orte_jobid_t jobid, orte_app_context_t **apps, orte_job_state_t state);
int orterun(int argc, char *argv[])
@ -506,7 +507,7 @@ int orterun(int argc, char *argv[])
}
if (ORTE_JOB_STATE_TERMINATED != exit_state) {
/* abnormal termination of some kind */
dump_aborted_procs(jobid);
dump_aborted_procs(jobid, apps, exit_state);
/* If we showed more abort messages than were allowed,
show a followup message here */
if (num_aborted > max_display_aborted) {
@ -522,6 +523,9 @@ int orterun(int argc, char *argv[])
/* Make sure we propagate the exit code */
if (WIFEXITED(orterun_globals.exit_status)) {
rc = WEXITSTATUS(orterun_globals.exit_status);
} else if (ORTE_JOB_STATE_FAILED_TO_START == exit_state) {
/* ensure we don't treat this like a signal */
rc = orterun_globals.exit_status;
} else {
/* If a process was killed by a signal, then make the
* exit code of orterun be "signo + 128" so that "prog"
@ -531,21 +535,16 @@ int orterun(int argc, char *argv[])
}
/* the job is complete - now tell the orteds that it is
* okay to finalize and exit, we are done with them
* be sure to include any descendants so nothing is
* left hanging
* okay to finalize and exit, we are done with them.
* Issue this as a "soft kill" so the daemons won't die
* if they are part of a virtual machine - since that is
* the default mode, we can just leave the attributes as NULL
*/
if (ORTE_JOBID_INVALID != jobid) {
OBJ_CONSTRUCT(&attributes, opal_list_t);
orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) {
if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(&orte_abort_timeout, NULL))) {
opal_show_help("help-orterun.txt", "orterun:daemon-die", true,
orterun_basename, ORTE_ERROR_NAME(ret));
}
while (NULL != (item = opal_list_remove_first(&attributes))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&attributes);
}
OPAL_THREAD_UNLOCK(&orterun_globals.lock);
@ -575,7 +574,7 @@ DONE:
* exit status of the aborted procs.
*/
static void dump_aborted_procs(orte_jobid_t jobid)
static void dump_aborted_procs(orte_jobid_t jobid, orte_app_context_t **apps, orte_job_state_t state)
{
char *segment;
orte_gpr_value_t** values = NULL;
@ -583,46 +582,49 @@ static void dump_aborted_procs(orte_jobid_t jobid)
int rc;
int32_t exit_status = 0;
bool exit_status_set;
bool abort_reported=false;
char *keys[] = {
ORTE_PROC_NAME_KEY,
ORTE_PROC_LOCAL_PID_KEY,
ORTE_PROC_RANK_KEY,
ORTE_PROC_EXIT_CODE_KEY,
ORTE_NODE_NAME_KEY,
ORTE_PROC_APP_CONTEXT_KEY,
ORTE_PROC_STATE_KEY,
NULL
};
OPAL_TRACE_ARG1(1, jobid);
/* query the job segment on the registry */
if(ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {
ORTE_ERROR_LOG(rc);
return;
}
rc = orte_gpr.get(
ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
segment,
NULL,
keys,
&num_values,
&values
);
rc = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
segment,
NULL,
keys,
&num_values,
&values
);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
free(segment);
return;
}
for (i = 0; i < num_values; i++) {
orte_gpr_value_t* value = values[i];
orte_process_name_t name, *nptr;
pid_t pid = 0, *pidptr;
orte_std_cntr_t rank = 0, *sptr;
orte_std_cntr_t rank = 0, *sptr, app_idx=0;
bool rank_found=false;
char* node_name = NULL;
orte_exit_code_t *ecptr;
orte_proc_state_t *pst_ptr, pst;
exit_status = 0;
exit_status_set = false;
for(k=0; k < value->cnt; k++) {
@ -665,44 +667,120 @@ static void dump_aborted_procs(orte_jobid_t jobid)
node_name = (char*)(keyval->value->data);
continue;
}
if(strcmp(keyval->key, ORTE_PROC_APP_CONTEXT_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
continue;
}
app_idx = *sptr;
continue;
}
if(strcmp(keyval->key, ORTE_PROC_STATE_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&pst_ptr, keyval->value, ORTE_PROC_STATE))) {
ORTE_ERROR_LOG(rc);
continue;
}
pst = *pst_ptr;
continue;
}
}
if (rank_found) {
if (WIFSIGNALED(exit_status)) {
if (9 == WTERMSIG(exit_status)) {
++num_killed;
} else {
if (num_aborted < max_display_aborted) {
#ifdef HAVE_STRSIGNAL
if (NULL != strsignal(WTERMSIG(exit_status))) {
opal_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", false,
if (ORTE_JOB_STATE_FAILED_TO_START == state) {
if (num_aborted < max_display_aborted) {
if (ORTE_ERR_SYS_LIMITS_PIPES == exit_status) {
opal_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
orterun_basename, node_name, (unsigned long)rank);
} else if (ORTE_ERR_PIPE_SETUP_FAILURE == exit_status) {
opal_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
orterun_basename, node_name, (unsigned long)rank);
} else if (ORTE_ERR_SYS_LIMITS_CHILDREN == exit_status) {
opal_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
orterun_basename, node_name, (unsigned long)rank);
} else if (ORTE_ERR_FAILED_GET_TERM_ATTRS == exit_status) {
opal_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
orterun_basename, node_name, (unsigned long)rank);
} else if (ORTE_ERR_WDIR_NOT_FOUND == exit_status) {
opal_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
orterun_basename, apps[app_idx]->cwd, node_name, (unsigned long)rank);
} else if (ORTE_ERR_EXE_NOT_FOUND == exit_status) {
opal_show_help("help-orterun.txt", "orterun:exe-not-found", true,
orterun_basename, apps[app_idx]->app, node_name, (unsigned long)rank);
} else if (ORTE_ERR_EXE_NOT_ACCESSIBLE == exit_status) {
opal_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
orterun_basename, apps[app_idx]->app, node_name, (unsigned long)rank);
} else if (ORTE_ERR_PIPE_READ_FAILURE == exit_status) {
opal_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
orterun_basename, node_name, (unsigned long)rank);
} else if (0 != exit_status) {
opal_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
orterun_basename, ORTE_ERROR_NAME(exit_status), node_name,
(unsigned long)rank);
} else {
opal_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
orterun_basename, node_name);
}
}
++num_aborted;
} else {
if (ORTE_PROC_STATE_ABORTED == pst) {
if (!abort_reported) {
opal_show_help("help-orterun.txt", "orterun:proc-ordered-abort", true,
orterun_basename, (unsigned long)rank, (unsigned long)pid,
node_name, WTERMSIG(exit_status),
strsignal(WTERMSIG(exit_status)));
} else {
#endif
opal_show_help("help-orterun.txt", "orterun:proc-aborted", false,
orterun_basename, (unsigned long)rank, (unsigned long)pid,
node_name, WTERMSIG(exit_status));
#ifdef HAVE_STRSIGNAL
}
#endif
node_name, orterun_basename);
abort_reported = true;
}
++num_aborted;
} else if (WIFSIGNALED(exit_status)) {
if (9 == WTERMSIG(exit_status)) {
++num_killed;
} else {
if (num_aborted < max_display_aborted) {
#ifdef HAVE_STRSIGNAL
if (NULL != strsignal(WTERMSIG(exit_status))) {
opal_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", false,
orterun_basename, (unsigned long)rank, (unsigned long)pid,
node_name, WTERMSIG(exit_status),
strsignal(WTERMSIG(exit_status)));
} else {
#endif
opal_show_help("help-orterun.txt", "orterun:proc-aborted", false,
orterun_basename, (unsigned long)rank, (unsigned long)pid,
node_name, WTERMSIG(exit_status));
#ifdef HAVE_STRSIGNAL
}
#endif
}
++num_aborted;
}
}
}
}
/* If we haven't done so already, hold the exit_status so we
can return it when exiting. Specifically, keep the first
non-zero entry. If they all return zero, we'll return
zero. We already have the globals.lock (from
job_state_callback), so don't try to get it again. */
if (0 == orterun_globals.exit_status && exit_status_set) {
can return it when exiting. Specifically, keep the first
non-zero entry. If they all return zero, we'll return
zero. We already have the globals.lock (from
job_state_callback), so don't try to get it again. */
if (ORTE_JOB_STATE_FAILED_TO_START == state) {
/* if the job failed to start, then there cannot be
* an exit state set, so we force the exit state
* to be 1 so that scripts can tell we failed. Keep
* this BEFORE the exit_status_set "if" so that we
* can detect some procs failing to start while
* others did.
*
* Any exit state we find is actually just the ORTE error
* code we set so that orterun can output an intelligible
* error message. Hence, there is no sense in trying to
* propagate any reported exit states - just set it to "1"
*/
orterun_globals.exit_status = 1;
} else if (0 == orterun_globals.exit_status && exit_status_set) {
orterun_globals.exit_status = exit_status;
}
OBJ_RELEASE(value);
}
if (NULL != values) {