Add two MCA parameters to the MPI level to control behavior during
MPI_ABORT. From the ompi_info output:

MCA mpi: parameter "mpi_abort_delay" (current value: "0")
    If nonzero, print out an identifying message when MPI_ABORT is invoked (hostname, PID of the process that called MPI_ABORT) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.

MCA mpi: parameter "mpi_abort_print_stack" (current value: "0")
    If nonzero, print out a stack trace when MPI_ABORT is invoked

This commit was SVN r9487.
Этот коммит содержится в:
родитель
1d67917b69
Коммит
fd61d78599
@ -9,6 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -21,16 +22,29 @@
|
|||||||
#ifdef HAVE_UNISTD_H
|
#ifdef HAVE_UNISTD_H
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef HAVE_EXECINFO_H
|
||||||
|
#include <execinfo.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_SYS_TYPES_H
|
||||||
|
#include <sys/types.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_SYS_PARAM_H
|
||||||
|
#include <sys/param.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_NETDB_H
|
||||||
|
#include <netdb.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "ompi/communicator/communicator.h"
|
#include "opal/event/event.h"
|
||||||
#include "opal/util/show_help.h"
|
#include "opal/util/show_help.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "ompi/runtime/mpiruntime.h"
|
|
||||||
#include "orte/runtime/runtime.h"
|
#include "orte/runtime/runtime.h"
|
||||||
#include "orte/mca/ns/ns.h"
|
#include "orte/mca/ns/ns.h"
|
||||||
#include "orte/mca/rmgr/rmgr.h"
|
#include "orte/mca/rmgr/rmgr.h"
|
||||||
|
#include "ompi/communicator/communicator.h"
|
||||||
#include "ompi/proc/proc.h"
|
#include "ompi/proc/proc.h"
|
||||||
#include "opal/event/event.h"
|
#include "ompi/runtime/mpiruntime.h"
|
||||||
|
#include "ompi/runtime/params.h"
|
||||||
|
|
||||||
#if HAVE_SIGNAL_H
|
#if HAVE_SIGNAL_H
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
@ -67,7 +81,9 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
|||||||
bool kill_remote_of_intercomm)
|
bool kill_remote_of_intercomm)
|
||||||
{
|
{
|
||||||
orte_jobid_t my_jobid;
|
orte_jobid_t my_jobid;
|
||||||
int ret=OMPI_SUCCESS;
|
int ret = OMPI_SUCCESS;
|
||||||
|
char hostname[MAXHOSTNAMELEN];
|
||||||
|
pid_t pid;
|
||||||
|
|
||||||
/* Corner case: if we're being called as a result of the
|
/* Corner case: if we're being called as a result of the
|
||||||
OMPI_ERR_INIT_FINALIZE macro (meaning that this is before
|
OMPI_ERR_INIT_FINALIZE macro (meaning that this is before
|
||||||
@ -78,6 +94,54 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
|||||||
exit(errcode);
|
exit(errcode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* If we're going to print anything, get the hostname and PID of
|
||||||
|
this process */
|
||||||
|
|
||||||
|
if (ompi_mpi_abort_print_stack ||
|
||||||
|
0 != ompi_mpi_abort_delay) {
|
||||||
|
gethostname(hostname, sizeof(hostname));
|
||||||
|
pid = getpid();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Should we print a stack trace? */
|
||||||
|
|
||||||
|
if (ompi_mpi_abort_print_stack) {
|
||||||
|
#if OMPI_WANT_PRETTY_PRINT_STACKTRACE && ! defined(__WINDOWS__) && defined(HAVE_BACKTRACE)
|
||||||
|
int i;
|
||||||
|
int trace_size;
|
||||||
|
void *trace[32];
|
||||||
|
char **messages = (char **)NULL;
|
||||||
|
|
||||||
|
trace_size = backtrace(trace, 32);
|
||||||
|
messages = backtrace_symbols(trace, trace_size);
|
||||||
|
|
||||||
|
for (i = 0; i < trace_size; ++i) {
|
||||||
|
fprintf(stderr, "[%s:%d] [%d] func:%s\n", hostname, (int) pid,
|
||||||
|
i, messages[i]);
|
||||||
|
fflush(stderr);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Should we wait for a while before aborting? */
|
||||||
|
|
||||||
|
if (0 != ompi_mpi_abort_delay) {
|
||||||
|
if (ompi_mpi_abort_delay < 0) {
|
||||||
|
fprintf(stderr ,"[%s:%d] Looping forever in MPI abort\n",
|
||||||
|
hostname, (int) pid);
|
||||||
|
fflush(stderr);
|
||||||
|
while (1) {
|
||||||
|
sleep(5);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "[%s:%d] Delaying for %d seconds in MPI_abort\n",
|
||||||
|
hostname, (int) pid, ompi_mpi_abort_delay);
|
||||||
|
do {
|
||||||
|
sleep(1);
|
||||||
|
} while (--ompi_mpi_abort_delay > 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* BWB - XXX - Should probably publish the error code somewhere */
|
/* BWB - XXX - Should probably publish the error code somewhere */
|
||||||
|
|
||||||
/* Kill everyone in the job. We may make this better someday to
|
/* Kill everyone in the job. We may make this better someday to
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -43,7 +44,8 @@ bool ompi_debug_no_free_handles = false;
|
|||||||
bool ompi_mpi_show_mca_params = false;
|
bool ompi_mpi_show_mca_params = false;
|
||||||
char *ompi_mpi_show_mca_params_file = NULL;
|
char *ompi_mpi_show_mca_params_file = NULL;
|
||||||
bool ompi_mpi_paffinity_alone = false;
|
bool ompi_mpi_paffinity_alone = false;
|
||||||
|
/* Whether MPI_ABORT prints a stack trace (off by default).  Named
   ompi_mpi_abort_print_stack to match the flag that
   ompi_mpi_register_params() assigns and ompi_mpi_abort() tests; the
   previous spelling, ompi_mpi_abort_generate_stack_trace, is not
   referenced by any of the code shown in this change. */
bool ompi_mpi_abort_print_stack = false;
|
||||||
|
int ompi_mpi_abort_delay = 0;
|
||||||
|
|
||||||
int ompi_mpi_register_params(void)
|
int ompi_mpi_register_params(void)
|
||||||
{
|
{
|
||||||
@ -136,6 +138,36 @@ int ompi_mpi_register_params(void)
|
|||||||
true, false,
|
true, false,
|
||||||
-1, NULL);
|
-1, NULL);
|
||||||
|
|
||||||
|
/* MPI_ABORT controls */
|
||||||
|
|
||||||
|
mca_base_param_reg_int_name("mpi", "abort_delay",
|
||||||
|
"If nonzero, print out an identifying message when MPI_ABORT is invoked (hostname, PID of the process that called MPI_ABORT) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
|
||||||
|
false, false,
|
||||||
|
ompi_mpi_abort_delay,
|
||||||
|
&ompi_mpi_abort_delay);
|
||||||
|
|
||||||
|
mca_base_param_reg_int_name("mpi", "abort_print_stack",
|
||||||
|
"If nonzero, print out a stack trace when MPI_ABORT is invoked",
|
||||||
|
false,
|
||||||
|
/* If we do not have stack trace
|
||||||
|
capability, make this a read-only
|
||||||
|
MCA param */
|
||||||
|
#if OMPI_WANT_PRETTY_PRINT_STACKTRACE && ! defined(__WINDOWS__) && defined(HAVE_BACKTRACE)
|
||||||
|
false,
|
||||||
|
#else
|
||||||
|
true,
|
||||||
|
#endif
|
||||||
|
(int) ompi_mpi_abort_print_stack,
|
||||||
|
&value);
|
||||||
|
#if OMPI_WANT_PRETTY_PRINT_STACKTRACE && ! defined(__WINDOWS__) && defined(HAVE_BACKTRACE)
|
||||||
|
/* Only take the value if we have stack trace capability */
|
||||||
|
ompi_mpi_abort_print_stack = (bool) value;
|
||||||
|
#else
|
||||||
|
/* If we do not have stack trace capability, ensure that this is
|
||||||
|
hard-coded to false */
|
||||||
|
ompi_mpi_abort_print_stack = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
/* The ddt engine has a few parameters */
|
/* The ddt engine has a few parameters */
|
||||||
|
|
||||||
return ompi_ddt_register_params();
|
return ompi_ddt_register_params();
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -88,6 +89,22 @@ OMPI_DECLSPEC extern char * ompi_mpi_show_mca_params_file;
|
|||||||
*/
|
*/
|
||||||
OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone;
|
OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether an MPI_ABORT should print out a stack trace or not.
|
||||||
|
*/
|
||||||
|
OMPI_DECLSPEC bool ompi_mpi_abort_print_stack;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether MPI_ABORT should print out an identifying message
|
||||||
|
* (e.g., hostname and PID) and loop waiting for a debugger to
|
||||||
|
* attach. The value of the integer is how many seconds to wait:
|
||||||
|
*
|
||||||
|
* 0 = do not print the message and do not loop
|
||||||
|
* negative value = print the message and loop forever
|
||||||
|
* positive value = print the message and delay for that many seconds
|
||||||
|
*/
|
||||||
|
OMPI_DECLSPEC int ompi_mpi_abort_delay;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Register MCA parameters used by the MPI layer.
|
* Register MCA parameters used by the MPI layer.
|
||||||
*
|
*
|
||||||
@ -98,6 +115,7 @@ OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone;
|
|||||||
*/
|
*/
|
||||||
OMPI_DECLSPEC int ompi_mpi_register_params(void);
|
OMPI_DECLSPEC int ompi_mpi_register_params(void);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Display all MCA parameters used
|
* Display all MCA parameters used
|
||||||
*
|
*
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user