diff --git a/opal/mca/backtrace/backtrace.h b/opal/mca/backtrace/backtrace.h index 7f18fd8493..9ca5658cde 100644 --- a/opal/mca/backtrace/backtrace.h +++ b/opal/mca/backtrace/backtrace.h @@ -12,6 +12,7 @@ * All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -26,6 +27,7 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" +#include "opal/util/stacktrace.h" BEGIN_C_DECLS @@ -39,6 +41,8 @@ BEGIN_C_DECLS /* * Print back trace to FILE file with a prefix for each line. * First strip lines are not printed. + * If 'file' is NULL then the component should try to use the file descriptor + * saved in opal_stacktrace_output_fileno * * \note some attempts made to be signal safe. */ diff --git a/opal/mca/backtrace/execinfo/backtrace_execinfo.c b/opal/mca/backtrace/execinfo/backtrace_execinfo.c index faa30be983..0f17c514c1 100644 --- a/opal/mca/backtrace/execinfo/backtrace_execinfo.c +++ b/opal/mca/backtrace/execinfo/backtrace_execinfo.c @@ -10,6 +10,7 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -34,12 +35,16 @@ int opal_backtrace_print(FILE *file, char *prefix, int strip) { - int i, fd, len; + int i, len; int trace_size; void * trace[32]; char buf[6]; + int fd = opal_stacktrace_output_fileno; + + if( NULL != file ) { + fd = fileno(file); + } - fd = fileno (file); if (-1 == fd) { return OPAL_ERR_BAD_PARAM; } diff --git a/opal/mca/backtrace/printstack/backtrace_printstack.c b/opal/mca/backtrace/printstack/backtrace_printstack.c index 699c839efe..214cacfb14 100644 --- a/opal/mca/backtrace/printstack/backtrace_printstack.c +++ b/opal/mca/backtrace/printstack/backtrace_printstack.c @@ -10,6 +10,7 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -27,7 +28,13 @@ int opal_backtrace_print(FILE *file, char *prefix, int strip) { - printstack(fileno(file)); + int fd = opal_stacktrace_output_fileno; + + if( NULL != file ) { + fd = fileno(file); + } + + printstack(fd); return OPAL_SUCCESS; } diff --git a/opal/runtime/opal_params.c b/opal/runtime/opal_params.c index 0292e1c4c7..a44ebc816e 100644 --- a/opal/runtime/opal_params.c +++ b/opal/runtime/opal_params.c @@ -21,6 +21,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2015 Mellanox Technologies, Inc. * All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -48,6 +49,7 @@ #include "opal/util/timings.h" char *opal_signal_string = NULL; +char *opal_stacktrace_output_filename = NULL; char *opal_net_private_ipv4 = NULL; char *opal_set_max_sys_limits = NULL; @@ -76,6 +78,7 @@ static bool opal_register_done = false; int opal_register_params(void) { int ret; + char *string = NULL; if (opal_register_done) { return OPAL_SUCCESS; @@ -87,7 +90,6 @@ int opal_register_params(void) * This string is going to be used in opal/util/stacktrace.c */ { - char *string = NULL; int j; int signals[] = { #ifdef SIGABRT @@ -127,6 +129,28 @@ int opal_register_params(void) } } + /* + * Where should the stack trace output be directed + * This string is going to be used in opal/util/stacktrace.c + */ + string = strdup("stderr"); + opal_stacktrace_output_filename = string; + ret = mca_base_var_register ("opal", "opal", NULL, "stacktrace_output", + "Specifies where the stack trace output stream goes. " + "Accepts one of the following: none (disabled), stderr (default), stdout, file[:filename]. " + "If 'filename' is not specified, a default filename of 'stacktrace' is used. " + "The 'filename' is appended with either '.PID' or '.RANK.PID', if RANK is available. " + "The 'filename' can be an absolute path or a relative path to the current working directory.", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, + &opal_stacktrace_output_filename); + free (string); + if (0 > ret) { + return ret; + } + + #if defined(HAVE_SCHED_YIELD) opal_progress_yield_when_idle = false; ret = mca_base_var_register ("opal", "opal", "progress", "yield_when_idle", diff --git a/opal/runtime/opal_params.h b/opal/runtime/opal_params.h index 8e810ecf4b..e90bf73fb2 100644 --- a/opal/runtime/opal_params.h +++ b/opal/runtime/opal_params.h @@ -18,6 +18,7 @@ * Copyright (c) 2014 Hochschule Esslingen. All rights reserved. * Copyright (c) 2015 Mellanox Technologies, Inc. 
* All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -29,6 +30,7 @@ #define OPAL_PARAMS_H extern char *opal_signal_string; +extern char *opal_stacktrace_output_filename; extern char *opal_net_private_ipv4; extern char *opal_set_max_sys_limits; diff --git a/opal/util/stacktrace.c b/opal/util/stacktrace.c index 356c69f9f8..58f3c924b4 100644 --- a/opal/util/stacktrace.c +++ b/opal/util/stacktrace.c @@ -25,6 +25,15 @@ #ifdef HAVE_UNISTD_H #include <unistd.h> #endif +#ifdef HAVE_SYS_TYPES_H +#include <sys/types.h> +#endif +#ifdef HAVE_SYS_STAT_H +#include <sys/stat.h> +#endif +#ifdef HAVE_SYS_FCNTL_H +#include <sys/fcntl.h> +#endif #include <string.h> #include <signal.h> @@ -35,6 +44,7 @@ #include "opal/util/output.h" #include "opal/util/show_help.h" #include "opal/util/argv.h" +#include "opal/util/proc.h" #include "opal/runtime/opal_params.h" #ifndef _NSIG @@ -43,9 +53,35 @@ #define HOSTFORMAT "[%s:%05d] " +int opal_stacktrace_output_fileno = -1; +static char *opal_stacktrace_output_filename_base = NULL; +static size_t opal_stacktrace_output_filename_max_len = 0; static char stacktrace_hostname[OPAL_MAXHOSTNAMELEN]; static char *unable_to_print_msg = "Unable to print stack trace!\n"; +/* + * Set the stacktrace filename: + * stacktrace.PID + * -or, if VPID is available- + * stacktrace.VPID.PID + */ +static void set_stacktrace_filename(void) { + opal_proc_t *my_proc = opal_proc_local_get(); + + if( NULL == my_proc ) { + snprintf(opal_stacktrace_output_filename, opal_stacktrace_output_filename_max_len, + "%s.%lu", + opal_stacktrace_output_filename_base, (unsigned long)getpid()); + } + else { + snprintf(opal_stacktrace_output_filename, opal_stacktrace_output_filename_max_len, + "%s.%lu.%lu", + opal_stacktrace_output_filename_base, (unsigned long)my_proc->proc_name.vpid, (unsigned long)getpid()); + } + + return; +} + /** * This function is being called as a signal-handler in response * to a user-specified signal (e.g. SIGFPE or SIGSEGV). 
@@ -69,12 +105,37 @@ static void show_stackframe (int signo, siginfo_t * info, void * p) int ret; char *si_code_str = ""; + /* Do not print the stack trace */ + if( 0 > opal_stacktrace_output_fileno && 0 == opal_stacktrace_output_filename_max_len ) { + /* Raise the signal again, so we don't accidentally mask critical signals. + * For critical signals, it is preferred that we call 'raise' instead of + * 'exit' or 'abort' so that the return status is set properly for this + * process. + */ + signal(signo, SIG_DFL); + raise(signo); + + return; + } + + /* Update the file name with the RANK, if available */ + if( 0 < opal_stacktrace_output_filename_max_len ) { + set_stacktrace_filename(); + opal_stacktrace_output_fileno = open(opal_stacktrace_output_filename, + O_CREAT|O_WRONLY|O_TRUNC, S_IRUSR|S_IWUSR); + if( 0 > opal_stacktrace_output_fileno ) { + opal_output(0, "Error: Failed to open the stacktrace output file. Default: stderr\n\tFilename: %s\n\tErrno: %s", + opal_stacktrace_output_filename, strerror(errno)); + opal_stacktrace_output_fileno = fileno(stderr); + } + } + /* write out the footer information */ memset (print_buffer, 0, sizeof (print_buffer)); ret = snprintf(print_buffer, sizeof(print_buffer), HOSTFORMAT "*** Process received signal ***\n", stacktrace_hostname, getpid()); - write(fileno(stderr), print_buffer, ret); + write(opal_stacktrace_output_fileno, print_buffer, ret); memset (print_buffer, 0, sizeof (print_buffer)); @@ -324,14 +385,14 @@ static void show_stackframe (int signo, siginfo_t * info, void * p) } /* write out the signal information generated above */ - write(fileno(stderr), print_buffer, sizeof(print_buffer)-size); + write(opal_stacktrace_output_fileno, print_buffer, sizeof(print_buffer)-size); /* print out the stack trace */ snprintf(print_buffer, sizeof(print_buffer), HOSTFORMAT, stacktrace_hostname, getpid()); - ret = opal_backtrace_print(stderr, print_buffer, 2); + ret = opal_backtrace_print(NULL, print_buffer, 2); if (OPAL_SUCCESS != 
ret) { - write(fileno(stderr), unable_to_print_msg, strlen(unable_to_print_msg)); + write(opal_stacktrace_output_fileno, unable_to_print_msg, strlen(unable_to_print_msg)); } /* write out the footer information */ @@ -340,9 +401,15 @@ static void show_stackframe (int signo, siginfo_t * info, void * p) HOSTFORMAT "*** End of error message ***\n", stacktrace_hostname, getpid()); if (ret > 0) { - write(fileno(stderr), print_buffer, ret); + write(opal_stacktrace_output_fileno, print_buffer, ret); } else { - write(fileno(stderr), unable_to_print_msg, strlen(unable_to_print_msg)); + write(opal_stacktrace_output_fileno, unable_to_print_msg, strlen(unable_to_print_msg)); + } + + if( fileno(stdout) != opal_stacktrace_output_fileno && + fileno(stderr) != opal_stacktrace_output_fileno ) { + close(opal_stacktrace_output_fileno); + opal_stacktrace_output_fileno = -1; } /* Raise the signal again, so we don't accidentally mask critical signals. @@ -373,7 +440,30 @@ void opal_stackframe_output(int stream) opal_output(stream, "%s", traces[i]); } } else { - opal_backtrace_print(stderr, NULL, 2); + /* Do not print the stack trace */ + if( 0 > opal_stacktrace_output_fileno && 0 == opal_stacktrace_output_filename_max_len ) { + return; + } + + /* Update the file name with the RANK, if available */ + if( 0 < opal_stacktrace_output_filename_max_len ) { + set_stacktrace_filename(); + opal_stacktrace_output_fileno = open(opal_stacktrace_output_filename, + O_CREAT|O_WRONLY|O_TRUNC, S_IRUSR|S_IWUSR); + if( 0 > opal_stacktrace_output_fileno ) { + opal_output(0, "Error: Failed to open the stacktrace output file. 
Default: stderr\n\tFilename: %s\n\tErrno: %s", + opal_stacktrace_output_filename, strerror(errno)); + opal_stacktrace_output_fileno = fileno(stderr); + } + } + + opal_backtrace_print(NULL, NULL, 2); + + if( fileno(stdout) != opal_stacktrace_output_fileno && + fileno(stderr) != opal_stacktrace_output_fileno ) { + close(opal_stacktrace_output_fileno); + opal_stacktrace_output_fileno = -1; + } } } @@ -444,6 +534,50 @@ int opal_util_register_stackhandlers (void) } } + /* Setup the output stream to use */ + if( NULL == opal_stacktrace_output_filename || + 0 == strcasecmp(opal_stacktrace_output_filename, "none") ) { + opal_stacktrace_output_fileno = -1; + } + else if( 0 == strcasecmp(opal_stacktrace_output_filename, "stdout") ) { + opal_stacktrace_output_fileno = fileno(stdout); + } + else if( 0 == strcasecmp(opal_stacktrace_output_filename, "stderr") ) { + opal_stacktrace_output_fileno = fileno(stderr); + } + else if( 0 == strcasecmp(opal_stacktrace_output_filename, "file" ) || + 0 == strcasecmp(opal_stacktrace_output_filename, "file:") ) { + opal_stacktrace_output_filename_base = strdup("stacktrace"); + + free(opal_stacktrace_output_filename); + // Magic number: 8 = space for .PID and .RANK (allow 7 digits each), +1 for the terminating NUL + opal_stacktrace_output_filename_max_len = strlen("stacktrace") + 8 + 8 + 1; + opal_stacktrace_output_filename = (char*)malloc(sizeof(char) * opal_stacktrace_output_filename_max_len); + set_stacktrace_filename(); + opal_stacktrace_output_fileno = -1; + } + else if( 0 == strncasecmp(opal_stacktrace_output_filename, "file:", 5) ) { + char *filename_cpy = NULL; + next = strchr(opal_stacktrace_output_filename, ':'); + next++; // move past the ':' to the filename specified + + opal_stacktrace_output_filename_base = strdup(next); + + free(opal_stacktrace_output_filename); + // Magic number: 8 = space for .PID and .RANK (allow 7 digits each), +1 for the terminating NUL + opal_stacktrace_output_filename_max_len = strlen(opal_stacktrace_output_filename_base) + 8 + 8 + 1; + opal_stacktrace_output_filename 
= (char*)malloc(sizeof(char) * opal_stacktrace_output_filename_max_len); + set_stacktrace_filename(); + opal_stacktrace_output_fileno = -1; + + free(filename_cpy); + } + else { + opal_stacktrace_output_fileno = fileno(stderr); + } + + + /* Setup the signals to catch */ memset(&act, 0, sizeof(act)); act.sa_sigaction = show_stackframe; act.sa_flags = SA_SIGINFO; diff --git a/opal/util/stacktrace.h b/opal/util/stacktrace.h index 3703564d36..c4484ae91a 100644 --- a/opal/util/stacktrace.h +++ b/opal/util/stacktrace.h @@ -10,6 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -24,6 +25,12 @@ #include "opal_config.h" +/* + * File descriptor to be used by the backtrace framework if opal_backtrace_print + * is passed NULL for it's FILE file pointer. + */ +extern int opal_stacktrace_output_fileno; + /** * Output the current stack trace (not including the call to this * function) to the stream indicated.