Add a new test that checks behavior when we call exit with a non-zero return code after calling finalize - don't ask why.
Modify the check_complete code so it finds the first non-zero exit status (i.e., the one from the lowest rank) in a job that terminates normally, and sets the mpirun exit code to that status. This commit was SVN r23071.
Этот коммит содержится в:
родитель
71cbe1a69f
Коммит
c62418d76d
@ -580,6 +580,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
orte_job_map_t *map;
|
||||
orte_std_cntr_t index;
|
||||
bool one_still_alive;
|
||||
orte_exit_code_t first_non_zero=0;
|
||||
|
||||
#if 0
|
||||
/* Check if FileM is active. If so then keep processing. */
|
||||
@ -594,6 +595,10 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (0 == first_non_zero && 0 != proc->exit_code) {
|
||||
first_non_zero = proc->exit_code;
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine how the process state affects the job state
|
||||
*/
|
||||
@ -727,6 +732,8 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
/* turn off any sensor monitors on this job */
|
||||
orte_sensor.stop(jdata->jobid);
|
||||
#endif
|
||||
/* update our exit code */
|
||||
ORTE_UPDATE_EXIT_STATUS(first_non_zero);
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
|
@ -1,4 +1,4 @@
|
||||
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv orte_spin segfault sysinfo
|
||||
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv orte_spin segfault sysinfo orte_exit
|
||||
|
||||
all: $(PROGS)
|
||||
|
||||
|
47
orte/test/system/orte_exit.c
Обычный файл
47
orte/test/system/orte_exit.c
Обычный файл
@ -0,0 +1,47 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* A program that spins briefly, calls orte_finalize, and then calls exit with its
|
||||
* vpid as the status - provides a mechanism for testing a non-zero exit after finalize
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
|
||||
int i, rc;
|
||||
double pi;
|
||||
pid_t pid;
|
||||
char hostname[500];
|
||||
|
||||
if (0 > (rc = orte_init(&argc, &argv, ORTE_PROC_NON_MPI))) {
|
||||
fprintf(stderr, "orte_abort: couldn't init orte - error code %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
pid = getpid();
|
||||
gethostname(hostname, 500);
|
||||
|
||||
printf("orte_abort: Name %s Host: %s Pid %ld\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
hostname, (long)pid);
|
||||
|
||||
i = 0;
|
||||
while (1) {
|
||||
i++;
|
||||
pi = i / 3.14159256;
|
||||
if (i == 9995) {
|
||||
orte_finalize();
|
||||
exit(ORTE_PROC_MY_NAME->vpid);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
Загрузка…
Ссылка в новой задаче
Block a user