Merge pull request #3678 from kawashima-fj/pr/signal-abort-delay
Apply `opal_abort_delay` to the OPAL signal handler
Этот коммит содержится в:
Коммит
b5b6b22848
@ -193,7 +193,7 @@ static void backend_fatal_aggregate(char *type,
|
|||||||
arg = va_arg(arglist, char*);
|
arg = va_arg(arglist, char*);
|
||||||
va_end(arglist);
|
va_end(arglist);
|
||||||
|
|
||||||
if (asprintf(&prefix, "[%s:%d]",
|
if (asprintf(&prefix, "[%s:%05d]",
|
||||||
ompi_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
(int) ompi_process_info.pid) == -1) {
|
(int) ompi_process_info.pid) == -1) {
|
||||||
prefix = NULL;
|
prefix = NULL;
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2015 Mellanox Technologies, Inc.
|
* Copyright (c) 2015 Mellanox Technologies, Inc.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -42,6 +43,7 @@
|
|||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
|
||||||
#include "opal/mca/backtrace/backtrace.h"
|
#include "opal/mca/backtrace/backtrace.h"
|
||||||
|
#include "opal/util/error.h"
|
||||||
#include "opal/runtime/opal_params.h"
|
#include "opal/runtime/opal_params.h"
|
||||||
|
|
||||||
#include "ompi/communicator/communicator.h"
|
#include "ompi/communicator/communicator.h"
|
||||||
@ -146,7 +148,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
|||||||
|
|
||||||
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
||||||
for (i = 0; i < len; ++i) {
|
for (i = 0; i < len; ++i) {
|
||||||
fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
|
fprintf(stderr, "[%s:%05d] [%d] func:%s\n", host, (int) pid,
|
||||||
i, messages[i]);
|
i, messages[i]);
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
}
|
}
|
||||||
@ -159,29 +161,13 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Should we wait for a while before aborting? */
|
/* Wait for a while before aborting */
|
||||||
|
opal_delay_abort();
|
||||||
if (0 != opal_abort_delay) {
|
|
||||||
if (opal_abort_delay < 0) {
|
|
||||||
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
|
|
||||||
host, (int) pid);
|
|
||||||
fflush(stderr);
|
|
||||||
while (1) {
|
|
||||||
sleep(5);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
|
|
||||||
host, (int) pid, opal_abort_delay);
|
|
||||||
do {
|
|
||||||
sleep(1);
|
|
||||||
} while (--opal_abort_delay > 0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If the RTE isn't setup yet/any more, then don't even try
|
/* If the RTE isn't setup yet/any more, then don't even try
|
||||||
killing everyone. Sorry, Charlie... */
|
killing everyone. Sorry, Charlie... */
|
||||||
if (!ompi_rte_initialized) {
|
if (!ompi_rte_initialized) {
|
||||||
fprintf(stderr, "[%s:%d] Local abort %s completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
|
fprintf(stderr, "[%s:%05d] Local abort %s completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
|
||||||
host, (int) pid, ompi_mpi_finalized ?
|
host, (int) pid, ompi_mpi_finalized ?
|
||||||
"after MPI_FINALIZE started" : "before MPI_INIT completed");
|
"after MPI_FINALIZE started" : "before MPI_INIT completed");
|
||||||
_exit(errcode == 0 ? 1 : errcode);
|
_exit(errcode == 0 ? 1 : errcode);
|
||||||
|
@ -65,7 +65,7 @@ void mca_rcache_base_mem_cb (void* base, size_t size, void* cbdata, bool from_al
|
|||||||
if (rc != OPAL_SUCCESS) {
|
if (rc != OPAL_SUCCESS) {
|
||||||
if (from_alloc) {
|
if (from_alloc) {
|
||||||
int len;
|
int len;
|
||||||
len = snprintf(msg, sizeof(msg), "[%s:%d] Attempt to free memory that is still in "
|
len = snprintf(msg, sizeof(msg), "[%s:%05d] Attempt to free memory that is still in "
|
||||||
"use by an ongoing MPI communication (buffer %p, size %lu). MPI job "
|
"use by an ongoing MPI communication (buffer %p, size %lu). MPI job "
|
||||||
"will now abort.\n", opal_proc_local_get()->proc_hostname,
|
"will now abort.\n", opal_proc_local_get()->proc_hostname,
|
||||||
getpid(), base, (unsigned long) size);
|
getpid(), base, (unsigned long) size);
|
||||||
|
@ -14,6 +14,7 @@
|
|||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2015 Research Organization for Information Science
|
* Copyright (c) 2015 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -27,9 +28,12 @@
|
|||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
#include "opal/util/error.h"
|
#include "opal/util/error.h"
|
||||||
#include "opal/constants.h"
|
#include "opal/constants.h"
|
||||||
|
#include "opal/util/proc.h"
|
||||||
|
#include "opal/runtime/opal_params.h"
|
||||||
|
|
||||||
#define MAX_CONVERTERS 5
|
#define MAX_CONVERTERS 5
|
||||||
#define MAX_CONVERTER_PROJECT_LEN 10
|
#define MAX_CONVERTER_PROJECT_LEN 10
|
||||||
@ -208,3 +212,36 @@ opal_error_register(const char *project, int err_base, int err_max,
|
|||||||
|
|
||||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
opal_delay_abort(void)
|
||||||
|
{
|
||||||
|
// Though snprintf and strlen are not guaranteed to be async-signal-safe
|
||||||
|
// in POSIX, it is async-signal-safe on many implementations probably.
|
||||||
|
|
||||||
|
if (0 != opal_abort_delay) {
|
||||||
|
int delay = opal_abort_delay;
|
||||||
|
pid_t pid = getpid();
|
||||||
|
char msg[100 + OPAL_MAXHOSTNAMELEN];
|
||||||
|
|
||||||
|
if (delay < 0) {
|
||||||
|
snprintf(msg, sizeof(msg),
|
||||||
|
"[%s:%05d] Looping forever "
|
||||||
|
"(MCA parameter opal_abort_delay is < 0)\n",
|
||||||
|
opal_process_info.nodename, (int) pid);
|
||||||
|
write(STDERR_FILENO, msg, strlen(msg) + 1);
|
||||||
|
while (1) {
|
||||||
|
sleep(5);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
snprintf(msg, sizeof(msg),
|
||||||
|
"[%s:%05d] Delaying for %d seconds before aborting\n",
|
||||||
|
opal_process_info.nodename, (int) pid, delay);
|
||||||
|
write(STDERR_FILENO, msg, strlen(msg) + 1);
|
||||||
|
do {
|
||||||
|
sleep(1);
|
||||||
|
} while (--delay > 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -89,6 +90,14 @@ OPAL_DECLSPEC int opal_error_register(const char *project,
|
|||||||
int err_base, int err_max,
|
int err_base, int err_max,
|
||||||
opal_err2str_fn_t converter);
|
opal_err2str_fn_t converter);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Print a message and sleep in accordance with the opal_abort_delay value
|
||||||
|
*
|
||||||
|
* This function is (almost) async-signal-safe so it can be called from
|
||||||
|
* a signal handler.
|
||||||
|
*/
|
||||||
|
OPAL_DECLSPEC void opal_delay_abort(void);
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
|
||||||
#endif /* OPAL_UTIL_ERROR_H */
|
#endif /* OPAL_UTIL_ERROR_H */
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -45,6 +46,7 @@
|
|||||||
#include "opal/util/show_help.h"
|
#include "opal/util/show_help.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/util/proc.h"
|
#include "opal/util/proc.h"
|
||||||
|
#include "opal/util/error.h"
|
||||||
#include "opal/runtime/opal_params.h"
|
#include "opal/runtime/opal_params.h"
|
||||||
|
|
||||||
#ifndef _NSIG
|
#ifndef _NSIG
|
||||||
@ -412,6 +414,9 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
|
|||||||
opal_stacktrace_output_fileno = -1;
|
opal_stacktrace_output_fileno = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* wait for a while before aborting for debugging */
|
||||||
|
opal_delay_abort();
|
||||||
|
|
||||||
/* Raise the signal again, so we don't accidentally mask critical signals.
|
/* Raise the signal again, so we don't accidentally mask critical signals.
|
||||||
* For critical signals, it is preferred that we call 'raise' instead of
|
* For critical signals, it is preferred that we call 'raise' instead of
|
||||||
* 'exit' or 'abort' so that the return status is set properly for this
|
* 'exit' or 'abort' so that the return status is set properly for this
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2013 Mellanox Technologies, Inc.
|
* Copyright (c) 2013 Mellanox Technologies, Inc.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -24,6 +25,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "opal/mca/backtrace/backtrace.h"
|
#include "opal/mca/backtrace/backtrace.h"
|
||||||
|
#include "opal/util/error.h"
|
||||||
#include "opal/runtime/opal_params.h"
|
#include "opal/runtime/opal_params.h"
|
||||||
|
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
@ -79,7 +81,7 @@ int oshmem_shmem_abort(int errcode)
|
|||||||
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
||||||
for (i = 0; i < len; ++i) {
|
for (i = 0; i < len; ++i) {
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
"[%s:%d] [%d] func:%s\n",
|
"[%s:%05d] [%d] func:%s\n",
|
||||||
host,
|
host,
|
||||||
(int) pid,
|
(int) pid,
|
||||||
i,
|
i,
|
||||||
@ -95,24 +97,8 @@ int oshmem_shmem_abort(int errcode)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Should we wait for a while before aborting? */
|
/* Wait for a while before aborting */
|
||||||
|
opal_delay_abort();
|
||||||
if (0 != opal_abort_delay) {
|
|
||||||
if (opal_abort_delay < 0) {
|
|
||||||
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
|
|
||||||
host, (int) pid);
|
|
||||||
fflush(stderr);
|
|
||||||
while (1) {
|
|
||||||
sleep(5);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
|
|
||||||
host, (int) pid, opal_abort_delay);
|
|
||||||
do {
|
|
||||||
sleep(1);
|
|
||||||
} while (--opal_abort_delay > 0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!orte_initialized || !oshmem_shmem_initialized) {
|
if (!orte_initialized || !oshmem_shmem_initialized) {
|
||||||
if (orte_show_help_is_available()) {
|
if (orte_show_help_is_available()) {
|
||||||
@ -124,7 +110,7 @@ int oshmem_shmem_abort(int errcode)
|
|||||||
(int) pid);
|
(int) pid);
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
"[%s:%d] Local abort completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
|
"[%s:%05d] Local abort completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
|
||||||
host,
|
host,
|
||||||
(int) pid);
|
(int) pid);
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user