Add in the standardized way to notify a debugger if the MPI job is
about to abort. Fixes trac:1509. This commit was SVN r19596. The following Trac tickets were found above: Ticket 1509 --> https://svn.open-mpi.org/trac/ompi/ticket/1509
This commit is contained in:
parent
16561fa297
commit
5fd742e769
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -35,6 +35,11 @@ BEGIN_C_DECLS
|
||||
*/
|
||||
OMPI_DECLSPEC void ompi_wait_for_debugger(void);
|
||||
|
||||
/**
|
||||
* Notify a debugger that we're about to abort
|
||||
*/
|
||||
OMPI_DECLSPEC void ompi_debugger_notify_abort(char *string);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* OMPI_DEBUGGERS_H */
|
||||
|
@ -90,6 +90,13 @@ OMPI_DECLSPEC int MPIR_debug_typedefs_sizeof[] = {
|
||||
sizeof(size_t)
|
||||
};
|
||||
|
||||
/*
|
||||
* Values defined by the standardized interface; do not change these
|
||||
* values
|
||||
*/
|
||||
#define MPIR_DEBUG_SPAWNED 1
|
||||
#define MPIR_DEBUG_ABORTING 2
|
||||
|
||||
/**
|
||||
* There is an issue with the debugger running on different architectures
|
||||
* compared with the debugged program. We need to know the sizes of the types
|
||||
@ -114,6 +121,8 @@ OMPI_DECLSPEC ompi_datatype_t* ompi_datatype_t_type_inclusion = NULL;
|
||||
|
||||
OMPI_DECLSPEC volatile int MPIR_debug_gate = 0;
|
||||
OMPI_DECLSPEC volatile int MPIR_being_debugged = 0;
|
||||
OMPI_DECLSPEC volatile int MPIR_debug_state = 0;
|
||||
OMPI_DECLSPEC char *MPIR_debug_abort_string = "";
|
||||
|
||||
/* Check for a file in a few direct ways for portability */
|
||||
static void check(char *dir, char *file, char **locations)
|
||||
@ -238,3 +247,30 @@ void ompi_wait_for_debugger(void)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Breakpoint hook for parallel debuggers.  The body intentionally does
 * nothing; the function only exists so that it can be called.  orterun
 * defines a function with the same name for the starter; the two
 * should never collide, and keeping this copy static makes sure of it.
 *
 * NOTE(review): a static, empty function may be inlined or discarded
 * by an optimizing compiler, which would leave a debugger nothing to
 * stop on -- confirm that the build preserves it.
 */
static void *MPIR_Breakpoint(void)
{
    void *nothing = NULL;

    return nothing;
}
|
||||
|
||||
/*
|
||||
* Tell the debugger that we are about to abort
|
||||
*/
|
||||
void ompi_debugger_notify_abort(char *reason)
|
||||
{
|
||||
MPIR_debug_state = MPIR_DEBUG_ABORTING;
|
||||
|
||||
if (NULL != reason && strlen(reason) > 0) {
|
||||
MPIR_debug_abort_string = reason;
|
||||
} else {
|
||||
MPIR_debug_abort_string = "Unknown";
|
||||
}
|
||||
|
||||
/* Now tell the debugger */
|
||||
MPIR_Breakpoint();
|
||||
}
|
||||
|
@ -10,6 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -213,5 +214,5 @@ static void backend_fatal(char *type, struct ompi_communicator_t *comm,
|
||||
comm = &ompi_mpi_comm_self;
|
||||
}
|
||||
|
||||
ompi_mpi_abort(comm, 1, false);
|
||||
ompi_mpi_abort(comm, *error_code, false);
|
||||
}
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -42,6 +42,8 @@
|
||||
#include "ompi/proc/proc.h"
|
||||
#include "ompi/runtime/mpiruntime.h"
|
||||
#include "ompi/runtime/params.h"
|
||||
#include "ompi/debuggers/debuggers.h"
|
||||
#include "ompi/errhandler/errcode.h"
|
||||
|
||||
static bool have_been_invoked = false;
|
||||
|
||||
@ -51,7 +53,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
bool kill_remote_of_intercomm)
|
||||
{
|
||||
int count = 0, i;
|
||||
char *host, hostname[MAXHOSTNAMELEN];
|
||||
char *msg, *host, hostname[MAXHOSTNAMELEN];
|
||||
pid_t pid = 0;
|
||||
orte_process_name_t *abort_procs;
|
||||
orte_std_cntr_t nabort_procs;
|
||||
@ -94,6 +96,19 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
}
|
||||
}
|
||||
|
||||
/* Notify the debugger that we're about to abort */
|
||||
|
||||
if (asprintf(&msg, "[%s:%d] aborting with MPI error %s%s",
|
||||
host, (int) pid, ompi_mpi_errnum_get_string(errcode),
|
||||
ompi_mpi_abort_print_stack ?
|
||||
" (stack trace available on stderr)" : "") < 0) {
|
||||
msg = NULL;
|
||||
}
|
||||
ompi_debugger_notify_abort(msg);
|
||||
if (NULL != msg) {
|
||||
free(msg);
|
||||
}
|
||||
|
||||
/* Should we wait for a while before aborting? */
|
||||
|
||||
if (0 != ompi_mpi_abort_delay) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user