Bring some sanity to the exit code returned by mpirun. Ensure that we provide a non-zero code if something goes wrong, including someone exiting after calling mpi_init without calling mpi_finalize.
Jeff is preparing an (undoubtedly lengthy) explanation/matrix of how these codes are determined for the OMPI FAQ. This commit was SVN r17879.
Этот коммит содержится в:
родитель
80ac7c87cd
Коммит
2ed0e60321
@ -83,7 +83,8 @@ void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
|
||||
break;
|
||||
}
|
||||
if (ORTE_JOB_STATE_ABORTED != jobs[i]->state &&
|
||||
ORTE_JOB_STATE_ABORTED_BY_SIG != jobs[i]->state) {
|
||||
ORTE_JOB_STATE_ABORTED_BY_SIG != jobs[i]->state &&
|
||||
ORTE_JOB_STATE_ABORTED_WO_SYNC != jobs[i]->state) {
|
||||
jobs[i]->state = ORTE_JOB_STATE_ABORT_ORDERED;
|
||||
}
|
||||
}
|
||||
@ -93,8 +94,10 @@ void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* wakeup orterun so we can exit */
|
||||
if (ORTE_SUCCESS != (rc = orte_wakeup(exit_code))) {
|
||||
/* wakeup orterun so we can exit - the appropriate exit status
|
||||
* for orterun will have been set by whomever called us
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_wakeup())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
@ -130,8 +133,10 @@ void orte_errmgr_default_incomplete_start(orte_jobid_t job, int exit_code)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* wakeup orterun so we can exit */
|
||||
if (ORTE_SUCCESS != (rc = orte_wakeup(exit_code))) {
|
||||
/* wakeup orterun so we can exit - the appropriate exit status
|
||||
* for orterun will have been set by whomever called us
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_wakeup())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
|
@ -189,14 +189,16 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende
|
||||
/* get the job data object for this proc */
|
||||
if (NULL == (jdata = orte_get_job_data_object(name.jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
orte_wakeup(1);
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
orte_wakeup();
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* get the proc object for it */
|
||||
procs = (orte_proc_t**)jdata->procs->addr;
|
||||
if (NULL == procs[name.vpid] || NULL == procs[name.vpid]->node) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
orte_wakeup(1);
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
orte_wakeup();
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
@ -205,7 +207,8 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &(procs[name.vpid]->node->name), 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_wakeup(1);
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
orte_wakeup();
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
@ -290,12 +293,14 @@ static void filem_base_process_get_remote_path_cmd(orte_process_name_t* sender,
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &tmp_name, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_wakeup(1);
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
orte_wakeup();
|
||||
goto CLEANUP;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &file_type, 1, OPAL_INT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_wakeup(1);
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
orte_wakeup();
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
|
@ -1521,7 +1521,7 @@ GOTCHILD:
|
||||
* is considered an abnormal termination and treated accordingly
|
||||
*/
|
||||
aborted = true;
|
||||
child->state = ORTE_PROC_STATE_ABORTED;
|
||||
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:wait_local_proc child process %s terminated normally "
|
||||
|
@ -229,8 +229,9 @@ void orte_plm_base_launch_failed(orte_jobid_t job, bool daemons_launching, pid_t
|
||||
jdata->state = state;
|
||||
|
||||
WAKEUP:
|
||||
/* wakeup so orterun can exit */
|
||||
orte_wakeup(status);
|
||||
/* set orterun's exit code and wakeup so it can exit */
|
||||
ORTE_UPDATE_EXIT_STATUS(status);
|
||||
orte_wakeup();
|
||||
}
|
||||
|
||||
|
||||
@ -737,6 +738,7 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
/* point to the lowest rank to cause the problem */
|
||||
jdata->aborted_proc = procs[i];
|
||||
jdata->abort = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(procs[i]->exit_code);
|
||||
}
|
||||
break;
|
||||
} else if (ORTE_PROC_STATE_ABORTED == procs[i]->state) {
|
||||
@ -745,6 +747,7 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
/* point to the lowest rank to cause the problem */
|
||||
jdata->aborted_proc = procs[i];
|
||||
jdata->abort = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(procs[i]->exit_code);
|
||||
}
|
||||
break;
|
||||
} else if (ORTE_PROC_STATE_ABORTED_BY_SIG == procs[i]->state) {
|
||||
@ -753,9 +756,28 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
/* point to the lowest rank to cause the problem */
|
||||
jdata->aborted_proc = procs[i];
|
||||
jdata->abort = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(procs[i]->exit_code);
|
||||
}
|
||||
break;
|
||||
} else if (ORTE_PROC_STATE_TERM_WO_SYNC == procs[i]->state) {
|
||||
jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
|
||||
if (!jdata->abort) {
|
||||
/* point to the lowest rank to cause the problem */
|
||||
jdata->aborted_proc = procs[i];
|
||||
jdata->abort = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(procs[i]->exit_code);
|
||||
/* now treat a special case - if the proc exit'd without a required
|
||||
* sync, it may have done so with a zero exit code. We want to ensure
|
||||
* that the user realizes there was an error, so in this -one- case,
|
||||
* we overwrite the process' exit code with a '1'
|
||||
*/
|
||||
if (ORTE_PROC_STATE_TERM_WO_SYNC == procs[i]->state) {
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -771,12 +793,14 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
orte_errmgr.incomplete_start(jdata->jobid, jdata->aborted_proc->exit_code);
|
||||
goto CHECK_ALL_JOBS;
|
||||
} else if (ORTE_JOB_STATE_ABORTED == jdata->state ||
|
||||
ORTE_JOB_STATE_ABORTED_BY_SIG == jdata->state) {
|
||||
ORTE_JOB_STATE_ABORTED_BY_SIG == jdata->state ||
|
||||
ORTE_JOB_STATE_ABORTED_WO_SYNC == jdata->state) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:check_job_completed declared job %s aborted by proc %s",
|
||||
"%s plm:base:check_job_completed declared job %s aborted by proc %s with code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid),
|
||||
ORTE_NAME_PRINT(&(jdata->aborted_proc->name))));
|
||||
ORTE_NAME_PRINT(&(jdata->aborted_proc->name)),
|
||||
jdata->aborted_proc->exit_code));
|
||||
/* report this to the errmgr */
|
||||
orte_errmgr.proc_aborted(&(jdata->aborted_proc->name), jdata->aborted_proc->exit_code);
|
||||
goto CHECK_ALL_JOBS;
|
||||
@ -849,7 +873,7 @@ CHECK_ALL_JOBS:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:check_job_completed all jobs terminated - waking up",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
orte_wakeup(0);
|
||||
orte_wakeup();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -219,9 +219,9 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive got update_proc_state for vpid %lu state %lu exit_code %d",
|
||||
"%s plm:base:receive got update_proc_state for vpid %lu state %x exit_code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(unsigned long)vpid, (unsigned long)state, (int)exit_code));
|
||||
(unsigned long)vpid, (unsigned int)state, (int)exit_code));
|
||||
|
||||
/* update the termination counter IFF the state is changing to something
|
||||
* indicating terminated
|
||||
@ -233,6 +233,10 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
|
||||
/* update the data */
|
||||
procs[vpid]->state = state;
|
||||
procs[vpid]->exit_code = exit_code;
|
||||
|
||||
/* update orte's exit status if it is non-zero */
|
||||
ORTE_UPDATE_EXIT_STATUS(exit_code);
|
||||
|
||||
}
|
||||
count = 1;
|
||||
}
|
||||
@ -261,7 +265,7 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
|
||||
|
||||
/* see if an error occurred - if so, wakeup so we can exit */
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
orte_wakeup(1);
|
||||
orte_wakeup();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -410,7 +410,7 @@ cleanup:
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
orte_wakeup(1);
|
||||
orte_wakeup();
|
||||
}
|
||||
|
||||
return rc;
|
||||
@ -495,7 +495,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
if (NULL == (jdata = orte_get_job_data_object(active_job))) {
|
||||
/* bad jobid */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
orte_wakeup(status);
|
||||
orte_wakeup();
|
||||
return;
|
||||
}
|
||||
|
||||
@ -522,7 +522,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
}
|
||||
|
||||
orte_wakeup(status);
|
||||
orte_wakeup();
|
||||
}
|
||||
|
||||
|
||||
|
@ -59,6 +59,7 @@ typedef uint16_t orte_proc_state_t;
|
||||
#define ORTE_PROC_STATE_ABORTED 0x0100 /* process aborted */
|
||||
#define ORTE_PROC_STATE_FAILED_TO_START 0x0200 /* process failed to start */
|
||||
#define ORTE_PROC_STATE_ABORTED_BY_SIG 0x0400 /* process aborted by signal */
|
||||
#define ORTE_PROC_STATE_TERM_WO_SYNC 0x0800 /* process exit'd w/o required sync */
|
||||
|
||||
/*
|
||||
* Job state codes
|
||||
@ -82,9 +83,10 @@ typedef uint16_t orte_job_state_t;
|
||||
#define ORTE_JOB_STATE_ABORTED 0x0100 /* at least one process aborted, causing job to abort */
|
||||
#define ORTE_JOB_STATE_FAILED_TO_START 0x0200 /* at least one process failed to start */
|
||||
#define ORTE_JOB_STATE_ABORTED_BY_SIG 0x0400 /* job was killed by a signal */
|
||||
#define ORTE_JOB_STATE_ABORTED_WO_SYNC 0x0800 /* job was aborted because proc exit'd w/o required sync */
|
||||
|
||||
/* the processes in this job have been ordered to "die", but may not have completed it yet. Don't order it again */
|
||||
#define ORTE_JOB_STATE_ABORT_ORDERED 0x0800
|
||||
#define ORTE_JOB_STATE_ABORT_ORDERED 0x8000
|
||||
|
||||
/**
|
||||
* Node State, corresponding to the ORTE_NODE_STATE_* #defines,
|
||||
|
@ -401,7 +401,7 @@ cleanup:
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
orte_wakeup(1);
|
||||
orte_wakeup();
|
||||
}
|
||||
|
||||
return rc;
|
||||
@ -486,7 +486,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
if (NULL == (jdata = orte_get_job_data_object(active_job))) {
|
||||
/* bad jobid */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
orte_wakeup(status);
|
||||
orte_wakeup();
|
||||
return;
|
||||
}
|
||||
|
||||
@ -513,7 +513,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
}
|
||||
|
||||
orte_wakeup(status);
|
||||
orte_wakeup();
|
||||
}
|
||||
|
||||
|
||||
|
@ -169,6 +169,7 @@ opal_output(0, "checking job completion");
|
||||
procs = (orte_proc_t**)active_jdata->procs->addr;
|
||||
for (v=0; v < active_jdata->num_procs; v++) {
|
||||
procs[v]->exit_code = exit_codes[v];
|
||||
ORTE_UPDATE_EXIT_STATUS(exit_codes[v]);
|
||||
opal_output(0, "rank %d ecode %d", (int)v, exit_codes[v]);
|
||||
if (WIFEXITED(exit_codes[v])) {
|
||||
if (procs[v]->state < ORTE_PROC_STATE_TERMINATED) {
|
||||
@ -194,7 +195,7 @@ cleanup:
|
||||
/* check for completion */
|
||||
if (active_jdata->num_terminated >= active_jdata->num_procs) {
|
||||
active_jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
orte_wakeup(0);
|
||||
orte_wakeup();
|
||||
} else if (active_jdata->state == ORTE_JOB_STATE_ABORTED &&
|
||||
!orte_abnormal_term_ordered && !orte_abort_in_progress) {
|
||||
orte_errmgr.proc_aborted(&(active_jdata->aborted_proc->name),
|
||||
|
@ -446,7 +446,7 @@ static int process_commands(orte_process_name_t* sender,
|
||||
* NOTE: this event will fire -after- any zero-time events
|
||||
* so any pending relays -do- get sent first
|
||||
*/
|
||||
orte_wakeup(0);
|
||||
orte_wakeup();
|
||||
return ORTE_SUCCESS;
|
||||
break;
|
||||
|
||||
@ -460,7 +460,7 @@ static int process_commands(orte_process_name_t* sender,
|
||||
* NOTE: this event will fire -after- any zero-time events
|
||||
* so any pending relays -do- get sent first
|
||||
*/
|
||||
orte_wakeup(0);
|
||||
orte_wakeup();
|
||||
return ORTE_SUCCESS;
|
||||
break;
|
||||
|
||||
|
@ -35,7 +35,7 @@
|
||||
|
||||
#include "orte/runtime/orte_wakeup.h"
|
||||
|
||||
int orte_wakeup(int exit_status)
|
||||
int orte_wakeup(void)
|
||||
{
|
||||
/* set the exit status and trigger the
|
||||
* exit procedure
|
||||
@ -44,7 +44,6 @@ int orte_wakeup(int exit_status)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
orte_exit_status = exit_status;
|
||||
orte_trigger_event(orte_exit);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -27,12 +27,47 @@
|
||||
#include "orte_config.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/**
 * Update the global orte_exit_status.
 *
 * All updates of orte_exit_status should go through this macro so
 * that thread locking can be added in one place at some point, since
 * orte_exit_status is a global variable.
 *
 * The status is only written when it is currently zero and the new
 * value is non-zero, i.e. the first non-zero status wins. Without this
 * check, different parts of the code could overwrite each other's exit
 * status in the case of abnormal termination.
 *
 * For example, if a process aborts, we record the exit code from the
 * aborted process. Subsequent processes will then be aborted by signal
 * as we kill the job. We don't want those subsequent processes to
 * overwrite the original exit code, so that we can tell the user the
 * exit code from the process that caused the whole thing to happen.
 *
 * @param newstatus  Candidate exit status (converted to int). The
 *                   argument is evaluated exactly once.
 *
 * The expansion is a single statement with no trailing semicolon, so
 * the caller supplies the ';' and the macro is safe in an unbraced
 * if/else.
 */
#define ORTE_UPDATE_EXIT_STATUS(newstatus)                                   \
    do {                                                                     \
        const int orte_update_exit_status_tmp = (newstatus);                 \
        if (0 == orte_exit_status && 0 != orte_update_exit_status_tmp) {     \
            OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                       \
                                 "%s:%s(%d) updating exit status to %d",     \
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),         \
                                 __FILE__, __LINE__,                         \
                                 orte_update_exit_status_tmp));              \
            orte_exit_status = orte_update_exit_status_tmp;                  \
        }                                                                    \
    } while (0)
|
||||
|
||||
|
||||
/**
|
||||
* Wakeup orterun by reporting the termination of all processes
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_wakeup(int exit_status);
|
||||
ORTE_DECLSPEC int orte_wakeup(void);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier
|
||||
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv
|
||||
|
||||
all: $(PROGS)
|
||||
|
||||
|
29
orte/test/mpi/segv.c
Обычный файл
29
orte/test/mpi/segv.c
Обычный файл
@ -0,0 +1,29 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* MPI test program in which rank 1 deliberately writes through a null pointer
|
||||
*/
|
||||
|
||||
#include <stdio.h>
#include <unistd.h>
#include "mpi.h"
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int rank, size;
|
||||
char *foo=0;
|
||||
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &size);
|
||||
|
||||
printf("Hello, World, I am %d of %d\n", rank, size);
|
||||
|
||||
if (1 == rank) {
|
||||
sleep(2);
|
||||
*foo = 42;
|
||||
}
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
@ -93,6 +93,18 @@ node %s calling "abort". This may have caused other processes
|
||||
in the application to be terminated by signals sent by %s
|
||||
(as reported here).
|
||||
#
|
||||
[orterun:proc-exit-no-sync]
|
||||
%s has exited due to process rank %lu with PID %lu on
|
||||
node %s exiting without calling "finalize". This may
|
||||
have caused other processes in the application to be
|
||||
terminated by signals sent by %s (as reported here).
|
||||
#
|
||||
[orterun:proc-exit-no-sync-unknown]
|
||||
%s has exited due to a process exiting without calling "finalize",
|
||||
but has no info as to the process that caused that situation. This
|
||||
may have caused other processes in the application to be
|
||||
terminated by signals sent by %s (as reported here).
|
||||
#
|
||||
[orterun:proc-aborted]
|
||||
%s noticed that process rank %lu with PID %lu on node %s exited on signal %d.
|
||||
#
|
||||
|
@ -511,7 +511,7 @@ int orterun(int argc, char *argv[])
|
||||
#endif /* __WINDOWS__ */
|
||||
orte_totalview_init_before_spawn();
|
||||
|
||||
/* setup an event we can wait for to tell
|
||||
/* setup an event we can wait for that will tell
|
||||
* us to terminate - both normal and abnormal
|
||||
* termination will call us here. Use the
|
||||
* same exit fd as the daemon does so that orted_comm
|
||||
@ -520,7 +520,7 @@ int orterun(int argc, char *argv[])
|
||||
if (ORTE_SUCCESS != (rc = orte_wait_event(&orterun_event, &orte_exit, job_completed))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:event-def-failed", true,
|
||||
orterun_basename, ORTE_ERROR_NAME(rc));
|
||||
orte_exit_status = ORTE_ERROR_DEFAULT_EXIT_CODE;
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
@ -583,7 +583,8 @@ static void job_completed(int trigpipe, short event, void *arg)
|
||||
/* Make sure we propagate the exit code */
|
||||
if (WIFEXITED(orte_exit_status)) {
|
||||
orte_exit_status = WEXITSTATUS(orte_exit_status);
|
||||
} else if (ORTE_JOB_STATE_FAILED_TO_START == exit_state) {
|
||||
} else if (ORTE_JOB_STATE_FAILED_TO_START == exit_state ||
|
||||
ORTE_JOB_STATE_ABORTED_WO_SYNC == exit_state) {
|
||||
/* ensure we don't treat this like a signal */
|
||||
} else {
|
||||
/* If a process was killed by a signal, then make the
|
||||
@ -836,6 +837,15 @@ static void dump_aborted_procs(void)
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state) { /* proc exited w/o finalize */
|
||||
if (NULL == proc) {
|
||||
opal_show_help("help-orterun.txt", "orterun:proc-exit-no-sync-unknown", true,
|
||||
orterun_basename, orterun_basename);
|
||||
} else {
|
||||
opal_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true,
|
||||
orterun_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
proc->node->name, orterun_basename);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
@ -899,7 +909,8 @@ static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
/* If we failed the terminate_job() above, then we
|
||||
* need to explicitly wake ourselves up to exit
|
||||
*/
|
||||
orte_wakeup(ret);
|
||||
ORTE_UPDATE_EXIT_STATUS(ret);
|
||||
orte_wakeup();
|
||||
}
|
||||
} else {
|
||||
/* if the jobid is invalid, then we didn't get to
|
||||
@ -916,7 +927,8 @@ static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
|
||||
orte_finalize();
|
||||
free(orterun_basename);
|
||||
exit(1);
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
exit(orte_exit_status);
|
||||
}
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user