1
1

Merge pull request #3678 from kawashima-fj/pr/signal-abort-delay

Apply `opal_abort_delay` to the OPAL signal handler
Этот коммит содержится в:
KAWASHIMA Takahiro 2017-06-12 10:35:11 +09:00 коммит произвёл GitHub
родитель c4971cf267 362445d486
Коммит b5b6b22848
7 изменённых файлов: 65 добавлений и 42 удалений

Просмотреть файл

@ -193,7 +193,7 @@ static void backend_fatal_aggregate(char *type,
arg = va_arg(arglist, char*);
va_end(arglist);
if (asprintf(&prefix, "[%s:%d]",
if (asprintf(&prefix, "[%s:%05d]",
ompi_process_info.nodename,
(int) ompi_process_info.pid) == -1) {
prefix = NULL;

Просмотреть файл

@ -18,6 +18,7 @@
* reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -42,6 +43,7 @@
#include <errno.h>
#include "opal/mca/backtrace/backtrace.h"
#include "opal/util/error.h"
#include "opal/runtime/opal_params.h"
#include "ompi/communicator/communicator.h"
@ -146,7 +148,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
for (i = 0; i < len; ++i) {
fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
fprintf(stderr, "[%s:%05d] [%d] func:%s\n", host, (int) pid,
i, messages[i]);
fflush(stderr);
}
@ -159,29 +161,13 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
}
}
/* Should we wait for a while before aborting? */
if (0 != opal_abort_delay) {
if (opal_abort_delay < 0) {
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
host, (int) pid);
fflush(stderr);
while (1) {
sleep(5);
}
} else {
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
host, (int) pid, opal_abort_delay);
do {
sleep(1);
} while (--opal_abort_delay > 0);
}
}
/* Wait for a while before aborting */
opal_delay_abort();
/* If the RTE isn't setup yet/any more, then don't even try
killing everyone. Sorry, Charlie... */
if (!ompi_rte_initialized) {
fprintf(stderr, "[%s:%d] Local abort %s completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
fprintf(stderr, "[%s:%05d] Local abort %s completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
host, (int) pid, ompi_mpi_finalized ?
"after MPI_FINALIZE started" : "before MPI_INIT completed");
_exit(errcode == 0 ? 1 : errcode);

Просмотреть файл

@ -65,7 +65,7 @@ void mca_rcache_base_mem_cb (void* base, size_t size, void* cbdata, bool from_al
if (rc != OPAL_SUCCESS) {
if (from_alloc) {
int len;
len = snprintf(msg, sizeof(msg), "[%s:%d] Attempt to free memory that is still in "
len = snprintf(msg, sizeof(msg), "[%s:%05d] Attempt to free memory that is still in "
"use by an ongoing MPI communication (buffer %p, size %lu). MPI job "
"will now abort.\n", opal_proc_local_get()->proc_hostname,
getpid(), base, (unsigned long) size);

Просмотреть файл

@ -14,6 +14,7 @@
* reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -27,9 +28,12 @@
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "opal/util/error.h"
#include "opal/constants.h"
#include "opal/util/proc.h"
#include "opal/runtime/opal_params.h"
#define MAX_CONVERTERS 5
#define MAX_CONVERTER_PROJECT_LEN 10
@ -208,3 +212,36 @@ opal_error_register(const char *project, int err_base, int err_max,
return OPAL_ERR_OUT_OF_RESOURCE;
}
/**
 * Announce and perform the pre-abort delay selected by opal_abort_delay.
 *
 *  - opal_abort_delay == 0: return immediately, printing nothing.
 *  - opal_abort_delay  < 0: print a notice to stderr and sleep forever
 *                           (this function never returns).
 *  - opal_abort_delay  > 0: print a notice to stderr, sleep for that many
 *                           seconds, then return.
 *
 * The global opal_abort_delay is only read, never modified; the countdown
 * runs on a local copy.
 *
 * The notice is formatted into a stack buffer and emitted with write(2)
 * instead of stdio so that the function can be called from a signal
 * handler.  snprintf is not guaranteed to be async-signal-safe by POSIX,
 * but it is in practice on many implementations.
 */
void
opal_delay_abort(void)
{
    char msg[100 + OPAL_MAXHOSTNAMELEN];
    int delay = opal_abort_delay;
    int len;

    if (0 == delay) {
        return;
    }

    if (delay < 0) {
        len = snprintf(msg, sizeof(msg),
                       "[%s:%05d] Looping forever "
                       "(MCA parameter opal_abort_delay is < 0)\n",
                       opal_process_info.nodename, (int) getpid());
    } else {
        len = snprintf(msg, sizeof(msg),
                       "[%s:%05d] Delaying for %d seconds before aborting\n",
                       opal_process_info.nodename, (int) getpid(), delay);
    }

    /* snprintf returns the length the full message would have had, so
     * clamp it to what actually fits in the buffer.  Write exactly the
     * message bytes: the terminating NUL must not be sent to stderr.
     * A negative return (formatting error) means there is nothing valid
     * to print. */
    if (len > 0) {
        size_t nbytes = ((size_t) len < sizeof(msg)) ? (size_t) len
                                                     : sizeof(msg) - 1;
        /* Best effort only: there is no useful recovery if stderr cannot
         * be written while the process is already aborting. */
        ssize_t rc = write(STDERR_FILENO, msg, nbytes);
        (void) rc;
    }

    if (delay < 0) {
        /* Park the process so a debugger can be attached. */
        while (1) {
            sleep(5);
        }
    }

    do {
        sleep(1);
    } while (--delay > 0);
}

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -89,6 +90,14 @@ OPAL_DECLSPEC int opal_error_register(const char *project,
int err_base, int err_max,
opal_err2str_fn_t converter);
/**
* Print a message and sleep in accordance with the opal_abort_delay value
*
* This function is (almost) async-signal-safe so it can be called from
* a signal handler.
*/
OPAL_DECLSPEC void opal_delay_abort(void);
END_C_DECLS
#endif /* OPAL_UTIL_ERROR_H */

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -45,6 +46,7 @@
#include "opal/util/show_help.h"
#include "opal/util/argv.h"
#include "opal/util/proc.h"
#include "opal/util/error.h"
#include "opal/runtime/opal_params.h"
#ifndef _NSIG
@ -412,6 +414,9 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
opal_stacktrace_output_fileno = -1;
}
/* wait for a while before aborting for debugging */
opal_delay_abort();
/* Raise the signal again, so we don't accidentally mask critical signals.
* For critical signals, it is preferred that we call 'raise' instead of
* 'exit' or 'abort' so that the return status is set properly for this

Просмотреть файл

@ -1,6 +1,7 @@
/*
* Copyright (c) 2013 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -24,6 +25,7 @@
#endif
#include "opal/mca/backtrace/backtrace.h"
#include "opal/util/error.h"
#include "opal/runtime/opal_params.h"
#include "orte/util/proc_info.h"
@ -79,7 +81,7 @@ int oshmem_shmem_abort(int errcode)
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
for (i = 0; i < len; ++i) {
fprintf(stderr,
"[%s:%d] [%d] func:%s\n",
"[%s:%05d] [%d] func:%s\n",
host,
(int) pid,
i,
@ -95,24 +97,8 @@ int oshmem_shmem_abort(int errcode)
}
}
/* Should we wait for a while before aborting? */
if (0 != opal_abort_delay) {
if (opal_abort_delay < 0) {
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
host, (int) pid);
fflush(stderr);
while (1) {
sleep(5);
}
} else {
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
host, (int) pid, opal_abort_delay);
do {
sleep(1);
} while (--opal_abort_delay > 0);
}
}
/* Wait for a while before aborting */
opal_delay_abort();
if (!orte_initialized || !oshmem_shmem_initialized) {
if (orte_show_help_is_available()) {
@ -124,7 +110,7 @@ int oshmem_shmem_abort(int errcode)
(int) pid);
} else {
fprintf(stderr,
"[%s:%d] Local abort completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
"[%s:%05d] Local abort completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
host,
(int) pid);
}