Apply opal_abort_delay
to the signal handler
This commit extends the effect of the MCA parameter `opal_abort_delay` to the OPAL signal handler. This makes it possible to attach a debugger after a segmentation fault (or a similar fatal signal) before the job quits. For code cleanup, the sleep code is moved out of the `ompi_mpi_abort` and `oshmem_shmem_abort` functions into the new `opal_delay_abort` function. Signed-off-by: KAWASHIMA Takahiro <t-kawashima@jp.fujitsu.com>
Этот коммит содержится в:
родитель
7002535059
Коммит
6b91eddc8b
@ -18,6 +18,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -42,6 +43,7 @@
|
||||
#include <errno.h>
|
||||
|
||||
#include "opal/mca/backtrace/backtrace.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
|
||||
#include "ompi/communicator/communicator.h"
|
||||
@ -159,24 +161,8 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
}
|
||||
}
|
||||
|
||||
/* Should we wait for a while before aborting? */
|
||||
|
||||
if (0 != opal_abort_delay) {
|
||||
if (opal_abort_delay < 0) {
|
||||
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
|
||||
host, (int) pid);
|
||||
fflush(stderr);
|
||||
while (1) {
|
||||
sleep(5);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
|
||||
host, (int) pid, opal_abort_delay);
|
||||
do {
|
||||
sleep(1);
|
||||
} while (--opal_abort_delay > 0);
|
||||
}
|
||||
}
|
||||
/* Wait for a while before aborting */
|
||||
opal_delay_abort();
|
||||
|
||||
/* If the RTE isn't setup yet/any more, then don't even try
|
||||
killing everyone. Sorry, Charlie... */
|
||||
|
@ -14,6 +14,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -27,9 +28,12 @@
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/constants.h"
|
||||
#include "opal/util/proc.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
|
||||
#define MAX_CONVERTERS 5
|
||||
#define MAX_CONVERTER_PROJECT_LEN 10
|
||||
@ -208,3 +212,36 @@ opal_error_register(const char *project, int err_base, int err_max,
|
||||
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
opal_delay_abort(void)
|
||||
{
|
||||
// Though snprintf and strlen are not guaranteed to be async-signal-safe
|
||||
// in POSIX, it is async-signal-safe on many implementations probably.
|
||||
|
||||
if (0 != opal_abort_delay) {
|
||||
int delay = opal_abort_delay;
|
||||
pid_t pid = getpid();
|
||||
char msg[100 + OPAL_MAXHOSTNAMELEN];
|
||||
|
||||
if (delay < 0) {
|
||||
snprintf(msg, sizeof(msg),
|
||||
"[%s:%05d] Looping forever "
|
||||
"(MCA parameter opal_abort_delay is < 0)\n",
|
||||
opal_process_info.nodename, (int) pid);
|
||||
write(STDERR_FILENO, msg, strlen(msg) + 1);
|
||||
while (1) {
|
||||
sleep(5);
|
||||
}
|
||||
} else {
|
||||
snprintf(msg, sizeof(msg),
|
||||
"[%s:%05d] Delaying for %d seconds before aborting\n",
|
||||
opal_process_info.nodename, (int) pid, delay);
|
||||
write(STDERR_FILENO, msg, strlen(msg) + 1);
|
||||
do {
|
||||
sleep(1);
|
||||
} while (--delay > 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -9,6 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -89,6 +90,14 @@ OPAL_DECLSPEC int opal_error_register(const char *project,
|
||||
int err_base, int err_max,
|
||||
opal_err2str_fn_t converter);
|
||||
|
||||
/**
|
||||
* Print a message and sleep in accordance with the opal_abort_delay value
|
||||
*
|
||||
* This function is (almost) async-signal-safe so it can be called from
|
||||
* a signal handler.
|
||||
*/
|
||||
OPAL_DECLSPEC void opal_delay_abort(void);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* OPAL_UTIL_ERROR_H */
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -45,6 +46,7 @@
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/proc.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
|
||||
#ifndef _NSIG
|
||||
@ -412,6 +414,9 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
|
||||
opal_stacktrace_output_fileno = -1;
|
||||
}
|
||||
|
||||
/* wait for a while before aborting for debugging */
|
||||
opal_delay_abort();
|
||||
|
||||
/* Raise the signal again, so we don't accidentally mask critical signals.
|
||||
* For critical signals, it is preferred that we call 'raise' instead of
|
||||
* 'exit' or 'abort' so that the return status is set properly for this
|
||||
|
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2013 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -24,6 +25,7 @@
|
||||
#endif
|
||||
|
||||
#include "opal/mca/backtrace/backtrace.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
@ -95,24 +97,8 @@ int oshmem_shmem_abort(int errcode)
|
||||
}
|
||||
}
|
||||
|
||||
/* Should we wait for a while before aborting? */
|
||||
|
||||
if (0 != opal_abort_delay) {
|
||||
if (opal_abort_delay < 0) {
|
||||
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
|
||||
host, (int) pid);
|
||||
fflush(stderr);
|
||||
while (1) {
|
||||
sleep(5);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
|
||||
host, (int) pid, opal_abort_delay);
|
||||
do {
|
||||
sleep(1);
|
||||
} while (--opal_abort_delay > 0);
|
||||
}
|
||||
}
|
||||
/* Wait for a while before aborting */
|
||||
opal_delay_abort();
|
||||
|
||||
if (!orte_initialized || !oshmem_shmem_initialized) {
|
||||
if (orte_show_help_is_available()) {
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user