1
1

ORTE defaults to killing the entire job when any process exits with a
nonzero status (we polled other MPI implementations since no one in
the OMPI community had a concrete opinion on what behavior to adopt here
-- all other MPIs seem to adhere to this behavior, too).

This commit adds an MCA parameter that allows us to tell ORTE to
''not'' kill jobs when a process exits with a status of 77, which the
GNU testing standard defines as "this test was skipped".  In all the OMPI
tests, all procs will either return 77 or not.  So if they all return
77, mpirun won't consider it an error, but will still return an exit
status of 77 (so that MTT can know that the test was cleanly skipped).

This commit was SVN r26413.
Этот коммит содержится в:
Jeff Squyres 2012-05-08 21:49:05 +00:00
родитель c00d1d4f91
Коммит 02aa36f2e5
3 изменённых файлов: 21 добавлений и 3 удалений

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -2073,6 +2073,17 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
state = ORTE_PROC_STATE_CALLED_ABORT;
goto MOVEON;
}
/* If the exit status of this proc was 77 and the
odls_base_exit_status_77_fatal MCA param was set to false,
then don't kill the whole job. The rationale is that the
GNU testing standards specify that an exit status of 77
indicates that a test was skipped -- it should not be
treated as a fatal error (to the whole job). */
if (!orte_odls_globals.is_exit_status_77_fatal && 77 == proc->exit_code) {
state = ORTE_PROC_STATE_WAITPID_FIRED;
goto MOVEON;
}
/* check to see if a sync was required and if it was received */
if (proc->registered) {

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
@ -103,6 +103,11 @@ int orte_odls_base_open(void)
"Time to wait for a process to die after issuing a kill signal to it",
false, false, 1, &orte_odls_globals.timeout_before_sigkill);
mca_base_param_reg_int_name("odls", "base_exit_status_77_fatal",
"Whether to kill an entire job if any process in that job exits normally with a status of 77 (exit status 77 in the GNU testing standards means \"this test was skipped\", and therefore we wouldn't want to kill the entire job)",
false, false, 1, &i);
orte_odls_globals.is_exit_status_77_fatal = OPAL_INT_TO_BOOL(i);
/* initialize the global array of local children */
orte_local_children = OBJ_NEW(opal_pointer_array_t);
if (OPAL_SUCCESS != (rc = opal_pointer_array_init(orte_local_children,

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
@ -62,6 +62,8 @@ typedef struct {
opal_list_t xterm_ranks;
/* the xterm cmd to be used */
char **xtermcmd;
/* whether to consider an exit code of 77 fatal to a job or not */
bool is_exit_status_77_fatal;
} orte_odls_globals_t;
ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals;