Merge pull request #3678 from kawashima-fj/pr/signal-abort-delay
Apply `opal_abort_delay` to the OPAL signal handler
Этот коммит содержится в:
Коммит
b5b6b22848
@ -193,7 +193,7 @@ static void backend_fatal_aggregate(char *type,
|
||||
arg = va_arg(arglist, char*);
|
||||
va_end(arglist);
|
||||
|
||||
if (asprintf(&prefix, "[%s:%d]",
|
||||
if (asprintf(&prefix, "[%s:%05d]",
|
||||
ompi_process_info.nodename,
|
||||
(int) ompi_process_info.pid) == -1) {
|
||||
prefix = NULL;
|
||||
|
@ -18,6 +18,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -42,6 +43,7 @@
|
||||
#include <errno.h>
|
||||
|
||||
#include "opal/mca/backtrace/backtrace.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
|
||||
#include "ompi/communicator/communicator.h"
|
||||
@ -146,7 +148,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
|
||||
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
||||
for (i = 0; i < len; ++i) {
|
||||
fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
|
||||
fprintf(stderr, "[%s:%05d] [%d] func:%s\n", host, (int) pid,
|
||||
i, messages[i]);
|
||||
fflush(stderr);
|
||||
}
|
||||
@ -159,29 +161,13 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
}
|
||||
}
|
||||
|
||||
/* Should we wait for a while before aborting? */
|
||||
|
||||
if (0 != opal_abort_delay) {
|
||||
if (opal_abort_delay < 0) {
|
||||
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
|
||||
host, (int) pid);
|
||||
fflush(stderr);
|
||||
while (1) {
|
||||
sleep(5);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
|
||||
host, (int) pid, opal_abort_delay);
|
||||
do {
|
||||
sleep(1);
|
||||
} while (--opal_abort_delay > 0);
|
||||
}
|
||||
}
|
||||
/* Wait for a while before aborting */
|
||||
opal_delay_abort();
|
||||
|
||||
/* If the RTE isn't setup yet/any more, then don't even try
|
||||
killing everyone. Sorry, Charlie... */
|
||||
if (!ompi_rte_initialized) {
|
||||
fprintf(stderr, "[%s:%d] Local abort %s completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
|
||||
fprintf(stderr, "[%s:%05d] Local abort %s completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
|
||||
host, (int) pid, ompi_mpi_finalized ?
|
||||
"after MPI_FINALIZE started" : "before MPI_INIT completed");
|
||||
_exit(errcode == 0 ? 1 : errcode);
|
||||
|
@ -65,7 +65,7 @@ void mca_rcache_base_mem_cb (void* base, size_t size, void* cbdata, bool from_al
|
||||
if (rc != OPAL_SUCCESS) {
|
||||
if (from_alloc) {
|
||||
int len;
|
||||
len = snprintf(msg, sizeof(msg), "[%s:%d] Attempt to free memory that is still in "
|
||||
len = snprintf(msg, sizeof(msg), "[%s:%05d] Attempt to free memory that is still in "
|
||||
"use by an ongoing MPI communication (buffer %p, size %lu). MPI job "
|
||||
"will now abort.\n", opal_proc_local_get()->proc_hostname,
|
||||
getpid(), base, (unsigned long) size);
|
||||
|
@ -14,6 +14,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -27,9 +28,12 @@
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/constants.h"
|
||||
#include "opal/util/proc.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
|
||||
#define MAX_CONVERTERS 5
|
||||
#define MAX_CONVERTER_PROJECT_LEN 10
|
||||
@ -208,3 +212,36 @@ opal_error_register(const char *project, int err_base, int err_max,
|
||||
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
opal_delay_abort(void)
|
||||
{
|
||||
// Though snprintf and strlen are not guaranteed to be async-signal-safe
|
||||
// in POSIX, it is async-signal-safe on many implementations probably.
|
||||
|
||||
if (0 != opal_abort_delay) {
|
||||
int delay = opal_abort_delay;
|
||||
pid_t pid = getpid();
|
||||
char msg[100 + OPAL_MAXHOSTNAMELEN];
|
||||
|
||||
if (delay < 0) {
|
||||
snprintf(msg, sizeof(msg),
|
||||
"[%s:%05d] Looping forever "
|
||||
"(MCA parameter opal_abort_delay is < 0)\n",
|
||||
opal_process_info.nodename, (int) pid);
|
||||
write(STDERR_FILENO, msg, strlen(msg) + 1);
|
||||
while (1) {
|
||||
sleep(5);
|
||||
}
|
||||
} else {
|
||||
snprintf(msg, sizeof(msg),
|
||||
"[%s:%05d] Delaying for %d seconds before aborting\n",
|
||||
opal_process_info.nodename, (int) pid, delay);
|
||||
write(STDERR_FILENO, msg, strlen(msg) + 1);
|
||||
do {
|
||||
sleep(1);
|
||||
} while (--delay > 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -9,6 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -89,6 +90,14 @@ OPAL_DECLSPEC int opal_error_register(const char *project,
|
||||
int err_base, int err_max,
|
||||
opal_err2str_fn_t converter);
|
||||
|
||||
/**
|
||||
* Print a message and sleep in accordance with the opal_abort_delay value
|
||||
*
|
||||
* This function is (almost) async-signal-safe so it can be called from
|
||||
* a signal handler.
|
||||
*/
|
||||
OPAL_DECLSPEC void opal_delay_abort(void);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* OPAL_UTIL_ERROR_H */
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -45,6 +46,7 @@
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/proc.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
|
||||
#ifndef _NSIG
|
||||
@ -412,6 +414,9 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
|
||||
opal_stacktrace_output_fileno = -1;
|
||||
}
|
||||
|
||||
/* wait for a while before aborting for debugging */
|
||||
opal_delay_abort();
|
||||
|
||||
/* Raise the signal again, so we don't accidentally mask critical signals.
|
||||
* For critical signals, it is preferred that we call 'raise' instead of
|
||||
* 'exit' or 'abort' so that the return status is set properly for this
|
||||
|
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2013 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -24,6 +25,7 @@
|
||||
#endif
|
||||
|
||||
#include "opal/mca/backtrace/backtrace.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
@ -79,7 +81,7 @@ int oshmem_shmem_abort(int errcode)
|
||||
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
||||
for (i = 0; i < len; ++i) {
|
||||
fprintf(stderr,
|
||||
"[%s:%d] [%d] func:%s\n",
|
||||
"[%s:%05d] [%d] func:%s\n",
|
||||
host,
|
||||
(int) pid,
|
||||
i,
|
||||
@ -95,24 +97,8 @@ int oshmem_shmem_abort(int errcode)
|
||||
}
|
||||
}
|
||||
|
||||
/* Should we wait for a while before aborting? */
|
||||
|
||||
if (0 != opal_abort_delay) {
|
||||
if (opal_abort_delay < 0) {
|
||||
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
|
||||
host, (int) pid);
|
||||
fflush(stderr);
|
||||
while (1) {
|
||||
sleep(5);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
|
||||
host, (int) pid, opal_abort_delay);
|
||||
do {
|
||||
sleep(1);
|
||||
} while (--opal_abort_delay > 0);
|
||||
}
|
||||
}
|
||||
/* Wait for a while before aborting */
|
||||
opal_delay_abort();
|
||||
|
||||
if (!orte_initialized || !oshmem_shmem_initialized) {
|
||||
if (orte_show_help_is_available()) {
|
||||
@ -124,7 +110,7 @@ int oshmem_shmem_abort(int errcode)
|
||||
(int) pid);
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"[%s:%d] Local abort completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
|
||||
"[%s:%05d] Local abort completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
|
||||
host,
|
||||
(int) pid);
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user