1
1

Add a new test that checks behavior when we call exit with a non-zero return code after calling finalize - don't ask why.

Modify the check_complete code so it finds the first non-zero exit status (i.e., the one from the lowest rank) in a job that terminates normally, and sets the mpirun exit code to that status.

This commit was SVN r23071.
Этот коммит содержится в:
Ralph Castain 2010-04-29 19:58:44 +00:00
родитель 71cbe1a69f
Коммит c62418d76d
3 изменённых файлов: 55 добавлений и 1 удалений

Просмотреть файл

@ -580,6 +580,7 @@ static void check_job_complete(orte_job_t *jdata)
orte_job_map_t *map;
orte_std_cntr_t index;
bool one_still_alive;
orte_exit_code_t first_non_zero=0;
#if 0
/* Check if FileM is active. If so then keep processing. */
@ -594,6 +595,10 @@ static void check_job_complete(orte_job_t *jdata)
continue;
}
if (0 == first_non_zero && 0 != proc->exit_code) {
first_non_zero = proc->exit_code;
}
/*
* Determine how the process state affects the job state
*/
@ -727,6 +732,8 @@ static void check_job_complete(orte_job_t *jdata)
/* turn off any sensor monitors on this job */
orte_sensor.stop(jdata->jobid);
#endif
/* update our exit code */
ORTE_UPDATE_EXIT_STATUS(first_non_zero);
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -1,4 +1,4 @@
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv orte_spin segfault sysinfo
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv orte_spin segfault sysinfo orte_exit
all: $(PROGS)

47
orte/test/system/orte_exit.c Обычный файл
Просмотреть файл

@ -0,0 +1,47 @@
/* -*- C -*-
*
* $HEADER$
*
* A program that spins briefly, calls orte_finalize, and then exits with its own vpid
* as the exit status - provides a mechanism for testing non-zero exit codes after finalize
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "orte/runtime/runtime.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
/*
 * Test driver: initialize ORTE, report this process's name/host/pid, spin for
 * a fixed number of iterations, then finalize ORTE and exit with this
 * process's vpid as the exit status.  Every process except vpid 0 therefore
 * terminates with a non-zero status *after* orte_finalize has been called.
 *
 * Returns the orte_init error code if initialization fails; otherwise the
 * process leaves via exit() and never returns normally.
 */
int main(int argc, char* argv[])
{
    /* number of busy-loop iterations performed before finalizing */
    enum { SPIN_ITERATIONS = 9995 };

    int i, rc;
    double pi = 0.0;
    pid_t pid;
    char hostname[500];

    if (0 > (rc = orte_init(&argc, &argv, ORTE_PROC_NON_MPI))) {
        fprintf(stderr, "orte_exit: couldn't init orte - error code %d\n", rc);
        return rc;
    }

    pid = getpid();

    /* gethostname may fail, and on truncation it is not guaranteed to
     * NUL-terminate the buffer - make sure we always print a valid string */
    if (0 != gethostname(hostname, sizeof(hostname))) {
        snprintf(hostname, sizeof(hostname), "%s", "unknown");
    }
    hostname[sizeof(hostname) - 1] = '\0';

    printf("orte_exit: Name %s Host: %s Pid %ld\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
           hostname, (long)pid);

    /* burn a little time so the processes do not all exit immediately */
    for (i = 1; i <= SPIN_ITERATIONS; i++) {
        pi = i / 3.14159256;
    }
    (void)pi;  /* the value is only busy work; silence set-but-unused warnings */

    /* finalize first, then exit with a rank-dependent status - this ordering
     * is the behavior under test */
    orte_finalize();
    exit((int)ORTE_PROC_MY_NAME->vpid);

    return 0;  /* not reached */
}