1
1

Add a stacktrace feature that reports which signal was raised, and where,
after MPI startup.
For this a new mpirun-parameter "mpi_signal" is added, one may specify a
comma-separated list of signals to grab, e.g. mpirun --mca mpi_signal 8,11
will check for SIGFPE and SIGSEGV.
Only the first fault is reported: the handler is installed with SA_ONESHOT,
because once the handler returns the same fault will occur again.

As printout, the data provided by siginfo_t is printed to STDOUT (yes,
it calls printf ,-]).
Additionally, with glibc, it uses backtrace and backtrace_symbols to 
print the calling stack up to the function in which the signal was raised:

(Rank:0) Going to write to RD_ONLY mmaped shared mem
Signal:11 info.si_errno:0(Success) si_code:2(SEGV_ACCERR)
Failing at addr:0x4020c000
[0] func:/home/rusraink/ompi-gcc/lib/libmpi.so.0 [0x40121afe]
[1] func:./t0 [0x42029180]
[2] func:./t0(__libc_start_main+0x95) [0x42017589]
[3] func:./t0(__libc_start_main+0x49) [0x8048691]

This commit was SVN r4170.
Этот коммит содержится в:
Rainer Keller 2005-01-26 19:11:46 +00:00
родитель 46c2c11680
Коммит 6ee5a29c2f
6 изменённых файлов: 337 добавлений и 2 удалений

Просмотреть файл

@ -835,9 +835,9 @@ AC_DEFINE_UNQUOTED(OMPI_ALIGNMENT_F90_COMPLEX32, $OMPI_ALIGNMENT_F90_COMPLEX32,
ompi_show_title "Header file tests"
AC_CHECK_HEADERS([alloca.h aio.h arpa/inet.h dirent.h dlfcn.h \
AC_CHECK_HEADERS([alloca.h aio.h arpa/inet.h dirent.h dlfcn.h execinfo.h \
sys/fcntl.h inttypes.h libgen.h net/if.h netdb.h netinet/in.h netinet/tcp.h \
sys/stat.h poll.h pthread.h pwd.h sched.h stdint.h strings.h stropts.h \
sys/stat.h poll.h pthread.h pwd.h sched.h stdint.h string.h strings.h stropts.h \
sys/types.h sys/ipc.h sys/mman.h sys/resource.h sys/select.h sys/socket.h \
sys/ioctl.h err.h sys/statvfs.h sys/time.h sys/uio.h sys/utsname.h sys/wait.h \
syslog.h termios.h ulimit.h unistd.h sys/param.h sys/tree.h sys/queue.h])

Просмотреть файл

@ -27,6 +27,7 @@
#include "group/group.h"
#include "info/info.h"
#include "util/show_help.h"
#include "util/stacktrace.h"
#include "errhandler/errcode.h"
#include "errhandler/errclass.h"
#include "request/request.h"
@ -125,6 +126,11 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
goto error;
}
if (OMPI_SUCCESS != (ret = ompi_util_register_stackhandlers ())) {
error = "util_register_stackhandlers() failed";
goto error;
}
/* initialize ompi procs */
if (OMPI_SUCCESS != (ret = ompi_proc_init())) {
error = "mca_proc_init() failed";

Просмотреть файл

@ -61,6 +61,11 @@ int ompi_mpi_register_params(void)
}
}
/*
* This string ("mpi_signal") is going to be used in src/util/stacktrace.c
*/
mca_base_param_register_string("mpi", NULL, "signal", NULL, NULL);
/* Whether or not to show MPI handle leaks */
show_leaks_param =

Просмотреть файл

@ -44,6 +44,7 @@ headers = \
session_dir.h \
show_help.h \
show_help_lex.h \
stacktrace.h \
daemon_init.h \
universe_setup_file_io.h \
strncpy.h
@ -69,6 +70,7 @@ libutil_la_SOURCES = \
session_dir.c \
show_help.c \
show_help_lex.l \
stacktrace.c \
daemon_init.c \
universe_setup_file_io.c \
strncpy.c

269
src/util/stacktrace.c Обычный файл
Просмотреть файл

@ -0,0 +1,269 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "include/constants.h"
#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#include "util/stacktrace.h"
#include "mca/base/mca_base_param.h"
#ifndef _NSIG
#define _NSIG 32
#endif
/**
* This function is being called as a signal-handler in response
* to a user-specified signal (e.g. SIGFPE or SIGSEGV).
* For Linux/Glibc, it then uses backtrace and backtrace_symbols
* to figure the current stack and then prints that out to stdout.
* Yes, printf and malloc are not signal-safe per se, but should be
* on Linux?
*
* @param signo with the signal number raised
* @param info with information regarding the reason/send of the signal
* @param p
*
*/
#ifndef WIN32
static void ompi_show_stackframe (int signo, siginfo_t * info, void * p)
{
#ifdef __GLIBC__
int i;
int trace_size;
void * trace[32];
char ** messages = (char **)NULL;
#endif
char print_buffer[1024];
char * tmp = print_buffer;
int size = sizeof (print_buffer);
int ret;
char * str;
/*
* Yes, we are doing printf inside a signal-handler.
* However, backtrace itself calls malloc (which may not be signal-safe,
* under linux, printf and malloc are)
*
* We could use backtrace_symbols_fd and write directly into an
* filedescriptor, however, without formatting -- also this fd
* should be opened in a sensible way...
*/
memset (print_buffer, 0, sizeof (print_buffer));
switch (signo)
{
case SIGILL:
switch (info->si_code)
{
case ILL_ILLOPC: str = "ILL_ILLOPC"; break;
case ILL_ILLOPN: str = "ILL_ILLOPN"; break;
case ILL_ILLADR: str = "ILL_ILLADR"; break;
case ILL_ILLTRP: str = "ILL_ILLTRP"; break;
case ILL_PRVOPC: str = "ILL_PRVOPC"; break;
case ILL_PRVREG: str = "ILL_PRVREG"; break;
case ILL_COPROC: str = "ILL_COPROC"; break;
case ILL_BADSTK: str = "ILL_BADSTK"; break;
}
break;
case SIGFPE:
switch (info->si_code)
{
case FPE_INTDIV: str = "FPE_INTDIV"; break;
case FPE_INTOVF: str = "FPE_INTOVF"; break;
case FPE_FLTDIV: str = "FPE_FLTDIV"; break;
case FPE_FLTOVF: str = "FPE_FLTOVF"; break;
case FPE_FLTUND: str = "FPE_FLTUND"; break;
case FPE_FLTRES: str = "FPE_FLTRES"; break;
case FPE_FLTINV: str = "FPE_FLTINV"; break;
case FPE_FLTSUB: str = "FPE_FLTSUB"; break;
}
break;
case SIGSEGV:
switch (info->si_code)
{
case SEGV_MAPERR: str = "SEGV_MAPERR"; break;
case SEGV_ACCERR: str = "SEGV_ACCERR"; break;
}
break;
case SIGBUS:
switch (info->si_code)
{
case BUS_ADRALN: str = "BUS_ADRALN"; break;
case BUS_ADRERR: str = "BUS_ADRERR"; break;
case BUS_OBJERR: str = "BUS_OBJERR"; break;
}
break;
case SIGTRAP:
switch (info->si_code)
{
case TRAP_BRKPT: str = "TRAP_BRKPT"; break;
case TRAP_TRACE: str = "TRAP_TRACE"; break;
}
break;
case SIGCHLD:
switch (info->si_code)
{
case CLD_EXITED: str = "CLD_EXITED"; break;
case CLD_KILLED: str = "CLD_KILLED"; break;
case CLD_DUMPED: str = "CLD_DUMPED"; break;
case CLD_TRAPPED: str = "CLD_TRAPPED"; break;
case CLD_STOPPED: str = "CLD_STOPPED"; break;
case CLD_CONTINUED: str = "CLD_CONTINUED"; break;
}
break;
case SIGPOLL:
switch (info->si_code)
{
case POLL_IN: str = "POLL_IN"; break;
case POLL_OUT: str = "POLL_OUT"; break;
case POLL_MSG: str = "POLL_MSG"; break;
case POLL_ERR: str = "POLL_ERR"; break;
case POLL_PRI: str = "POLL_PRI"; break;
case POLL_HUP: str = "POLL_HUP"; break;
}
break;
default:
switch (info->si_code)
{
case SI_ASYNCNL: str = "SI_ASYNCNL"; break;
case SI_SIGIO: str = "SI_SIGIO"; break;
case SI_ASYNCIO: str = "SI_ASYNCIO"; break;
case SI_MESGQ: str = "SI_MESGQ"; break;
case SI_TIMER: str = "SI_TIMER"; break;
case SI_QUEUE: str = "SI_QUEUE"; break;
case SI_USER: str = "SI_USER"; break;
case SI_KERNEL: str = "SI_KERNEL"; break;
}
}
ret = snprintf (tmp, size, "Signal:%d info.si_errno:%d(%s) si_code:%d(%s)\n",
signo, info->si_errno, strerror (info->si_errno),
info->si_code, str);
size -= ret;
tmp += ret;
switch (signo)
{
case SIGILL:
case SIGFPE:
case SIGSEGV:
case SIGBUS:
{
ret = snprintf (tmp, size, "Failing at addr:%p\n",
info->si_addr);
size -= ret;
tmp += ret;
break;
}
case SIGCHLD: {
ret = snprintf (tmp, size, "si_pid:%d si_uid:%d si_status:%d si_utime:%d, si_stime:%d\n",
info->si_pid, info->si_uid, info->si_status,
info->si_utime, info->si_stime);
size -= ret;
tmp += ret;
break;
}
case SIGPOLL: {
ret = snprintf (tmp, size, "si_band:%ld si_fd:%d\n",
info->si_band, info->si_fd);
size -= ret;
tmp += ret;
break;
}
}
printf ("%s", print_buffer);
#ifdef __GLIBC__
trace_size = backtrace (trace, 32);
messages = backtrace_symbols (trace, trace_size);
for (i = 0; i < trace_size; i++)
printf ("[%d] func:%s\n", i, messages[i]);
#endif
}
#endif /* WIN32 */
/**
 * Here we register the ompi_show_stackframe function for signals
 * passed to OpenMPI by the mpi_signal-parameter passed to mpirun
 * by the user.
 *
 * The parameter value is a comma-separated list of decimal signal
 * numbers, e.g. "8,11". An unset or empty value registers nothing.
 * Handlers are installed one-shot, so only the first occurrence of
 * each signal is reported.
 *
 * @returnvalue OMPI_SUCCESS
 * @returnvalue OMPI_ERR_BAD_PARAM if the value in the signal-list
 *              is not a valid signal-number, or the list is malformed
 * @returnvalue OMPI_ERR_IN_ERRNO if sigaction() rejected a signal
 *
 */
int ompi_util_register_stackhandlers (void)
{
#ifndef WIN32
    struct sigaction act;
    char * string_value = NULL;
    char * tmp;
    char * next;
    int param;

    param = mca_base_param_find ("mpi", NULL, "signal");
    /*
     * string_value is pre-set to NULL so that a lookup which does not
     * store a result is treated like an unset parameter.
     * NOTE(review): if the lookup hands back an allocated copy, it is
     * never released here -- confirm ownership with the MCA param API.
     */
    mca_base_param_lookup_string (param, &string_value);

    if ((NULL == string_value) || ('\0' == *string_value)) {
        return OMPI_SUCCESS;
    }

    memset (&act, 0, sizeof (act));
    sigemptyset (&act.sa_mask);
    act.sa_sigaction = ompi_show_stackframe;
    /* SA_RESETHAND is the POSIX name; SA_ONESHOT is the Linux alias */
#ifdef SA_RESETHAND
    act.sa_flags = SA_SIGINFO | SA_RESETHAND;
#else
    act.sa_flags = SA_SIGINFO | SA_ONESHOT;
#endif

    tmp = string_value;
    for (;;) {
        long sig;

        sig = strtol (tmp, &next, 10);

        /*
         * If there is no sensible number in the string, exit.
         * Similarly for any number which is not in the signal-number
         * range; keeping the value as long makes sure out-of-range
         * input cannot wrap into a valid signal number.
         */
        if ((tmp == next) || (sig <= 0) || (sig >= _NSIG)) {
            return OMPI_ERR_BAD_PARAM;
        }

        /* A number must be followed by a separator or the end of list */
        if ((',' != *next) && ('\0' != *next)) {
            return OMPI_ERR_BAD_PARAM;
        }

        if (0 != sigaction ((int) sig, &act, NULL)) {
            return OMPI_ERR_IN_ERRNO;
        }

        if ('\0' == *next) {
            break;
        }
        tmp = next + 1;               /* step over the ',' */
    }
#endif /* WIN32 */
    return OMPI_SUCCESS;
}

53
src/util/stacktrace.h Обычный файл
Просмотреть файл

@ -0,0 +1,53 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_STACKTRACE_H
#define OMPI_STACKTRACE_H
#include "ompi_config.h"
#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
/**
 * This function is being called as a signal-handler in response
 * to a user-specified signal (e.g. SIGFPE or SIGSEGV).
 * For Linux/Glibc, it then uses backtrace and backtrace_symbols
 * to figure the current stack and then prints that out to stdout.
 * Yes, printf and malloc are not signal-safe per se, but should be
 * on Linux?
 *
 * @param signo with the signal number raised
 * @param info with information regarding the reason/sender of the signal
 * @param p third handler argument; not used by the implementation
 *
 * NOTE(review): this prototype is static, so every file that includes
 * this header declares its own file-local function; only stacktrace.c
 * defines one -- consider moving the prototype into stacktrace.c.
 */
#ifndef WIN32
static void ompi_show_stackframe (int signo, siginfo_t * info, void * p);
#endif
/**
 * Here we register the ompi_show_stackframe function for signals
 * passed to OpenMPI by the mpi_signal-parameter passed to mpirun
 * by the user (a comma-separated list of signal numbers).
 *
 * @returnvalue OMPI_SUCCESS
 * @returnvalue OMPI_ERR_BAD_PARAM if the value in the signal-list
 *              is not a valid signal-number
 * @returnvalue OMPI_ERR_IN_ERRNO if sigaction() fails for a signal
 *
 */
OMPI_DECLSPEC int ompi_util_register_stackhandlers (void);
#endif /* OMPI_STACKTRACE_H */