1
1

Bring some sanity to the exit code returned by mpirun. Ensure that we provide a non-zero code if something goes wrong, including a process exiting after calling MPI_Init without calling MPI_Finalize.

Jeff is preparing an (undoubtedly lengthy) explanation/matrix of how these codes are determined for the OMPI FAQ.

This commit was SVN r17879.
Этот коммит содержится в:
Ralph Castain 2008-03-19 19:00:51 +00:00
родитель 80ac7c87cd
Коммит 2ed0e60321
16 изменённых файлов: 167 добавлений и 39 удалений

Просмотреть файл

@ -83,7 +83,8 @@ void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
break;
}
if (ORTE_JOB_STATE_ABORTED != jobs[i]->state &&
ORTE_JOB_STATE_ABORTED_BY_SIG != jobs[i]->state) {
ORTE_JOB_STATE_ABORTED_BY_SIG != jobs[i]->state &&
ORTE_JOB_STATE_ABORTED_WO_SYNC != jobs[i]->state) {
jobs[i]->state = ORTE_JOB_STATE_ABORT_ORDERED;
}
}
@ -93,8 +94,10 @@ void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
ORTE_ERROR_LOG(rc);
}
/* wakeup orterun so we can exit */
if (ORTE_SUCCESS != (rc = orte_wakeup(exit_code))) {
/* wakeup orterun so we can exit - the appropriate exit status
* for orterun will have been set by whomever called us
*/
if (ORTE_SUCCESS != (rc = orte_wakeup())) {
ORTE_ERROR_LOG(rc);
}
}
@ -130,8 +133,10 @@ void orte_errmgr_default_incomplete_start(orte_jobid_t job, int exit_code)
ORTE_ERROR_LOG(rc);
}
/* wakeup orterun so we can exit */
if (ORTE_SUCCESS != (rc = orte_wakeup(exit_code))) {
/* wakeup orterun so we can exit - the appropriate exit status
* for orterun will have been set by whomever called us
*/
if (ORTE_SUCCESS != (rc = orte_wakeup())) {
ORTE_ERROR_LOG(rc);
}
}

Просмотреть файл

@ -189,14 +189,16 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende
/* get the job data object for this proc */
if (NULL == (jdata = orte_get_job_data_object(name.jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
orte_wakeup(1);
ORTE_UPDATE_EXIT_STATUS(1);
orte_wakeup();
goto CLEANUP;
}
/* get the proc object for it */
procs = (orte_proc_t**)jdata->procs->addr;
if (NULL == procs[name.vpid] || NULL == procs[name.vpid]->node) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
orte_wakeup(1);
ORTE_UPDATE_EXIT_STATUS(1);
orte_wakeup();
goto CLEANUP;
}
@ -205,7 +207,8 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende
*/
if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &(procs[name.vpid]->node->name), 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
orte_wakeup(1);
ORTE_UPDATE_EXIT_STATUS(1);
orte_wakeup();
goto CLEANUP;
}
@ -290,12 +293,14 @@ static void filem_base_process_get_remote_path_cmd(orte_process_name_t* sender,
*/
if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &tmp_name, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
orte_wakeup(1);
ORTE_UPDATE_EXIT_STATUS(1);
orte_wakeup();
goto CLEANUP;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &file_type, 1, OPAL_INT))) {
ORTE_ERROR_LOG(rc);
orte_wakeup(1);
ORTE_UPDATE_EXIT_STATUS(1);
orte_wakeup();
goto CLEANUP;
}

Просмотреть файл

@ -1521,7 +1521,7 @@ GOTCHILD:
* is considered an abnormal termination and treated accordingly
*/
aborted = true;
child->state = ORTE_PROC_STATE_ABORTED;
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:wait_local_proc child process %s terminated normally "

Просмотреть файл

@ -229,8 +229,9 @@ void orte_plm_base_launch_failed(orte_jobid_t job, bool daemons_launching, pid_t
jdata->state = state;
WAKEUP:
/* wakeup so orterun can exit */
orte_wakeup(status);
/* set orterun's exit code and wakeup so it can exit */
ORTE_UPDATE_EXIT_STATUS(status);
orte_wakeup();
}
@ -737,6 +738,7 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
/* point to the lowest rank to cause the problem */
jdata->aborted_proc = procs[i];
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(procs[i]->exit_code);
}
break;
} else if (ORTE_PROC_STATE_ABORTED == procs[i]->state) {
@ -745,6 +747,7 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
/* point to the lowest rank to cause the problem */
jdata->aborted_proc = procs[i];
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(procs[i]->exit_code);
}
break;
} else if (ORTE_PROC_STATE_ABORTED_BY_SIG == procs[i]->state) {
@ -753,9 +756,28 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
/* point to the lowest rank to cause the problem */
jdata->aborted_proc = procs[i];
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(procs[i]->exit_code);
}
break;
} else if (ORTE_PROC_STATE_TERM_WO_SYNC == procs[i]->state) {
jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
if (!jdata->abort) {
/* point to the lowest rank to cause the problem */
jdata->aborted_proc = procs[i];
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(procs[i]->exit_code);
/* now treat a special case - if the proc exit'd without a required
* sync, it may have done so with a zero exit code. We want to ensure
* that the user realizes there was an error, so in this -one- case,
* we overwrite the process' exit code with a '1'
*/
if (ORTE_PROC_STATE_TERM_WO_SYNC == procs[i]->state) {
ORTE_UPDATE_EXIT_STATUS(1);
}
}
break;
}
}
}
@ -771,12 +793,14 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
orte_errmgr.incomplete_start(jdata->jobid, jdata->aborted_proc->exit_code);
goto CHECK_ALL_JOBS;
} else if (ORTE_JOB_STATE_ABORTED == jdata->state ||
ORTE_JOB_STATE_ABORTED_BY_SIG == jdata->state) {
ORTE_JOB_STATE_ABORTED_BY_SIG == jdata->state ||
ORTE_JOB_STATE_ABORTED_WO_SYNC == jdata->state) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:check_job_completed declared job %s aborted by proc %s",
"%s plm:base:check_job_completed declared job %s aborted by proc %s with code %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
ORTE_NAME_PRINT(&(jdata->aborted_proc->name))));
ORTE_NAME_PRINT(&(jdata->aborted_proc->name)),
jdata->aborted_proc->exit_code));
/* report this to the errmgr */
orte_errmgr.proc_aborted(&(jdata->aborted_proc->name), jdata->aborted_proc->exit_code);
goto CHECK_ALL_JOBS;
@ -849,7 +873,7 @@ CHECK_ALL_JOBS:
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:check_job_completed all jobs terminated - waking up",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
orte_wakeup(0);
orte_wakeup();
}
}

Просмотреть файл

@ -219,9 +219,9 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive got update_proc_state for vpid %lu state %lu exit_code %d",
"%s plm:base:receive got update_proc_state for vpid %lu state %x exit_code %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(unsigned long)vpid, (unsigned long)state, (int)exit_code));
(unsigned long)vpid, (unsigned int)state, (int)exit_code));
/* update the termination counter IFF the state is changing to something
* indicating terminated
@ -233,6 +233,10 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
/* update the data */
procs[vpid]->state = state;
procs[vpid]->exit_code = exit_code;
/* update orte's exit status if it is non-zero */
ORTE_UPDATE_EXIT_STATUS(exit_code);
}
count = 1;
}
@ -261,7 +265,7 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
/* see if an error occurred - if so, wakeup so we can exit */
if (ORTE_SUCCESS != rc) {
orte_wakeup(1);
orte_wakeup();
}
}

Просмотреть файл

@ -410,7 +410,7 @@ cleanup:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
orte_wakeup(1);
orte_wakeup();
}
return rc;
@ -495,7 +495,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
if (NULL == (jdata = orte_get_job_data_object(active_job))) {
/* bad jobid */
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
orte_wakeup(status);
orte_wakeup();
return;
}
@ -522,7 +522,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
jdata->state = ORTE_JOB_STATE_TERMINATED;
}
orte_wakeup(status);
orte_wakeup();
}

Просмотреть файл

@ -59,6 +59,7 @@ typedef uint16_t orte_proc_state_t;
#define ORTE_PROC_STATE_ABORTED 0x0100 /* process aborted */
#define ORTE_PROC_STATE_FAILED_TO_START 0x0200 /* process failed to start */
#define ORTE_PROC_STATE_ABORTED_BY_SIG 0x0400 /* process aborted by signal */
#define ORTE_PROC_STATE_TERM_WO_SYNC 0x0800 /* process exit'd w/o required sync */
/*
* Job state codes
@ -82,9 +83,10 @@ typedef uint16_t orte_job_state_t;
#define ORTE_JOB_STATE_ABORTED 0x0100 /* at least one process aborted, causing job to abort */
#define ORTE_JOB_STATE_FAILED_TO_START 0x0200 /* at least one process failed to start */
#define ORTE_JOB_STATE_ABORTED_BY_SIG 0x0400 /* job was killed by a signal */
#define ORTE_JOB_STATE_ABORTED_WO_SYNC 0x0800 /* job was aborted because proc exit'd w/o required sync */
/* the processes in this job have been ordered to "die", but may not have completed it yet. Don't order it again */
#define ORTE_JOB_STATE_ABORT_ORDERED 0x0800
#define ORTE_JOB_STATE_ABORT_ORDERED 0x8000
/**
* Node State, corresponding to the ORTE_NODE_STATE_* #defines,

Просмотреть файл

@ -401,7 +401,7 @@ cleanup:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
orte_wakeup(1);
orte_wakeup();
}
return rc;
@ -486,7 +486,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
if (NULL == (jdata = orte_get_job_data_object(active_job))) {
/* bad jobid */
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
orte_wakeup(status);
orte_wakeup();
return;
}
@ -513,7 +513,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
jdata->state = ORTE_JOB_STATE_TERMINATED;
}
orte_wakeup(status);
orte_wakeup();
}

Просмотреть файл

@ -169,6 +169,7 @@ opal_output(0, "checking job completion");
procs = (orte_proc_t**)active_jdata->procs->addr;
for (v=0; v < active_jdata->num_procs; v++) {
procs[v]->exit_code = exit_codes[v];
ORTE_UPDATE_EXIT_STATUS(exit_codes[v]);
opal_output(0, "rank %d ecode %d", (int)v, exit_codes[v]);
if (WIFEXITED(exit_codes[v])) {
if (procs[v]->state < ORTE_PROC_STATE_TERMINATED) {
@ -194,7 +195,7 @@ cleanup:
/* check for completion */
if (active_jdata->num_terminated >= active_jdata->num_procs) {
active_jdata->state = ORTE_JOB_STATE_TERMINATED;
orte_wakeup(0);
orte_wakeup();
} else if (active_jdata->state == ORTE_JOB_STATE_ABORTED &&
!orte_abnormal_term_ordered && !orte_abort_in_progress) {
orte_errmgr.proc_aborted(&(active_jdata->aborted_proc->name),

Просмотреть файл

@ -446,7 +446,7 @@ static int process_commands(orte_process_name_t* sender,
* NOTE: this event will fire -after- any zero-time events
* so any pending relays -do- get sent first
*/
orte_wakeup(0);
orte_wakeup();
return ORTE_SUCCESS;
break;
@ -460,7 +460,7 @@ static int process_commands(orte_process_name_t* sender,
* NOTE: this event will fire -after- any zero-time events
* so any pending relays -do- get sent first
*/
orte_wakeup(0);
orte_wakeup();
return ORTE_SUCCESS;
break;

Просмотреть файл

@ -35,7 +35,7 @@
#include "orte/runtime/orte_wakeup.h"
int orte_wakeup(int exit_status)
int orte_wakeup(void)
{
/* trigger the exit procedure - the exit status is no longer
* set here; callers record it before waking us up
@ -44,7 +44,6 @@ int orte_wakeup(int exit_status)
return ORTE_SUCCESS;
}
orte_exit_status = exit_status;
orte_trigger_event(orte_exit);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -27,12 +27,47 @@
#include "orte_config.h"
#include "orte/types.h"
#include "opal/util/output.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
BEGIN_C_DECLS
/**
 * Define a macro for updating the orte_exit_status.
 * The macro provides a convenient way of doing this
 * so that we can add thread locking at some point
 * since the orte_exit_status is a global variable.
 *
 * The update is "first non-zero wins": the status is only
 * written when orte_exit_status is still 0 and the new value
 * is non-zero. If we don't make this check, then different
 * parts of the code could overwrite each other's exit status
 * in the case of abnormal termination.
 *
 * For example, if a process aborts, we would record the initial
 * exit code from the aborted process. However, subsequent processes
 * will have been aborted by signal as we kill the job. We don't want
 * the subsequent processes to overwrite the original exit code so
 * we can tell the user the exit code from the process that caused
 * the whole thing to happen.
 *
 * @param newstatus  Integer expression giving the candidate exit
 *                   status. It is evaluated exactly once, so
 *                   arguments with side effects are safe.
 *
 * The expansion is a single statement with no trailing semicolon,
 * so the caller supplies the ';' and the macro can be used as the
 * body of an unbraced if/else.
 */
#define ORTE_UPDATE_EXIT_STATUS(newstatus)                                  \
    do {                                                                    \
        /* capture once: the argument may have side effects */              \
        const int orte_update_exit_status_tmp_ = (newstatus);               \
        if (0 == orte_exit_status && 0 != orte_update_exit_status_tmp_) {   \
            OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                      \
                                 "%s:%s(%d) updating exit status to %d",    \
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),        \
                                 __FILE__, __LINE__,                        \
                                 orte_update_exit_status_tmp_));            \
            orte_exit_status = orte_update_exit_status_tmp_;                \
        }                                                                   \
    } while (0)
/**
* Wakeup orterun by reporting the termination of all processes
*/
ORTE_DECLSPEC int orte_wakeup(int exit_status);
ORTE_DECLSPEC int orte_wakeup(void);
END_C_DECLS

Просмотреть файл

@ -1,4 +1,4 @@
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv
all: $(PROGS)

29
orte/test/mpi/segv.c Обычный файл
Просмотреть файл

@ -0,0 +1,29 @@
/* -*- C -*-
*
* $HEADER$
*
* A deliberately faulting MPI application: rank 1 writes through a
* NULL pointer after MPI_Init and before reaching MPI_Finalize
*/
#include <stdio.h>
#include <unistd.h>
#include "mpi.h"
/*
 * Entry point of the segv test program.
 *
 * Every rank initializes MPI, queries its rank and the world size, and
 * prints a greeting. Rank 1 then sleeps for two seconds and stores
 * through a NULL pointer; this fault is intentional and is the point of
 * the program. All other ranks proceed to MPI_Finalize and return 0.
 */
int main(int argc, char* argv[])
{
    int rank, size;
    /* volatile: the NULL store below is deliberate, and without the
     * qualifier an optimizing compiler is free to delete or rewrite a
     * store through a pointer it can prove is null
     */
    volatile char *foo = NULL;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    printf("Hello, World, I am %d of %d\n", rank, size);

    if (1 == rank) {
        /* delay the fault by two seconds after the greeting */
        sleep(2);
        /* intentional invalid write */
        *foo = 42;
    }

    MPI_Finalize();
    return 0;
}

Просмотреть файл

@ -93,6 +93,18 @@ node %s calling "abort". This may have caused other processes
in the application to be terminated by signals sent by %s
(as reported here).
#
[orterun:proc-exit-no-sync]
%s has exited due to process rank %lu with PID %lu on
node %s exiting without calling "finalize". This may
have caused other processes in the application to be
terminated by signals sent by %s (as reported here).
#
[orterun:proc-exit-no-sync-unknown]
%s has exited due to a process exiting without calling "finalize",
but has no info as to the process that caused that situation. This
may have caused other processes in the application to be
terminated by signals sent by %s (as reported here).
#
[orterun:proc-aborted]
%s noticed that process rank %lu with PID %lu on node %s exited on signal %d.
#

Просмотреть файл

@ -511,7 +511,7 @@ int orterun(int argc, char *argv[])
#endif /* __WINDOWS__ */
orte_totalview_init_before_spawn();
/* setup an event we can wait for to tell
/* setup an event we can wait for that will tell
* us to terminate - both normal and abnormal
* termination will call us here. Use the
* same exit fd as the daemon does so that orted_comm
@ -520,7 +520,7 @@ int orterun(int argc, char *argv[])
if (ORTE_SUCCESS != (rc = orte_wait_event(&orterun_event, &orte_exit, job_completed))) {
opal_show_help("help-orterun.txt", "orterun:event-def-failed", true,
orterun_basename, ORTE_ERROR_NAME(rc));
orte_exit_status = ORTE_ERROR_DEFAULT_EXIT_CODE;
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
goto DONE;
}
@ -583,7 +583,8 @@ static void job_completed(int trigpipe, short event, void *arg)
/* Make sure we propagate the exit code */
if (WIFEXITED(orte_exit_status)) {
orte_exit_status = WEXITSTATUS(orte_exit_status);
} else if (ORTE_JOB_STATE_FAILED_TO_START == exit_state) {
} else if (ORTE_JOB_STATE_FAILED_TO_START == exit_state ||
ORTE_JOB_STATE_ABORTED_WO_SYNC == exit_state) {
/* ensure we don't treat this like a signal */
} else {
/* If a process was killed by a signal, then make the
@ -836,6 +837,15 @@ static void dump_aborted_procs(void)
}
#endif
}
} else if (ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state) { /* proc exited w/o finalize */
if (NULL == proc) {
opal_show_help("help-orterun.txt", "orterun:proc-exit-no-sync-unknown", true,
orterun_basename, orterun_basename);
} else {
opal_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true,
orterun_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
proc->node->name, orterun_basename);
}
}
return;
}
@ -899,7 +909,8 @@ static void abort_exit_callback(int fd, short ign, void *arg)
/* If we failed the terminate_job() above, then we
* need to explicitly wake ourselves up to exit
*/
orte_wakeup(ret);
ORTE_UPDATE_EXIT_STATUS(ret);
orte_wakeup();
}
} else {
/* if the jobid is invalid, then we didn't get to
@ -916,7 +927,8 @@ static void abort_exit_callback(int fd, short ign, void *arg)
orte_finalize();
free(orterun_basename);
exit(1);
ORTE_UPDATE_EXIT_STATUS(1);
exit(orte_exit_status);
}
}