1
1

Apply opal_abort_delay to the signal handler

This commit extends the effect of the MCA parameter `opal_abort_delay`
to the OPAL signal handler. This makes it possible to attach a debugger
after a segmentation fault (or a similar fatal signal) before the job quits.

The sleep code is moved to the `opal_delay_abort` function from the
`ompi_mpi_abort` and `oshmem_shmem_abort` functions for code cleanup.

Signed-off-by: KAWASHIMA Takahiro <t-kawashima@jp.fujitsu.com>
Этот коммит содержится в:
KAWASHIMA Takahiro 2017-06-07 17:16:45 +09:00
родитель 7002535059
Коммит 6b91eddc8b
5 изменённых файлов: 59 добавлений и 36 удалений

Просмотреть файл

@ -18,6 +18,7 @@
* reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -42,6 +43,7 @@
#include <errno.h>
#include "opal/mca/backtrace/backtrace.h"
#include "opal/util/error.h"
#include "opal/runtime/opal_params.h"
#include "ompi/communicator/communicator.h"
@ -159,24 +161,8 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
}
}
/* Should we wait for a while before aborting? */
if (0 != opal_abort_delay) {
if (opal_abort_delay < 0) {
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
host, (int) pid);
fflush(stderr);
while (1) {
sleep(5);
}
} else {
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
host, (int) pid, opal_abort_delay);
do {
sleep(1);
} while (--opal_abort_delay > 0);
}
}
/* Wait for a while before aborting */
opal_delay_abort();
/* If the RTE isn't setup yet/any more, then don't even try
killing everyone. Sorry, Charlie... */

Просмотреть файл

@ -14,6 +14,7 @@
* reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -27,9 +28,12 @@
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "opal/util/error.h"
#include "opal/constants.h"
#include "opal/util/proc.h"
#include "opal/runtime/opal_params.h"
#define MAX_CONVERTERS 5
#define MAX_CONVERTER_PROJECT_LEN 10
@ -208,3 +212,36 @@ opal_error_register(const char *project, int err_base, int err_max,
return OPAL_ERR_OUT_OF_RESOURCE;
}
/*
 * Print a notice on stderr and stall the caller according to the
 * opal_abort_delay MCA parameter:
 *
 *   opal_abort_delay == 0 : return immediately; nothing is printed.
 *   opal_abort_delay <  0 : print a notice, then loop on sleep(5) forever
 *                           (this function never returns).
 *   opal_abort_delay >  0 : print a notice, call sleep(1) that many times,
 *                           then return.
 *
 * The global opal_abort_delay is only read; the countdown runs on a local
 * copy.  The notice is emitted with write(2) instead of stdio because this
 * function is also invoked from a signal handler.
 */
void
opal_delay_abort(void)
{
    // Though snprintf and strlen are not guaranteed to be async-signal-safe
    // by POSIX, they are probably async-signal-safe on many implementations.

    if (0 != opal_abort_delay) {
        int delay = opal_abort_delay;
        pid_t pid = getpid();
        char msg[100 + OPAL_MAXHOSTNAMELEN];

        if (delay < 0) {
            snprintf(msg, sizeof(msg),
                     "[%s:%05d] Looping forever "
                     "(MCA parameter opal_abort_delay is < 0)\n",
                     opal_process_info.nodename, (int) pid);
            // Write only the visible characters: the terminating NUL must
            // not be sent to stderr.  Best effort; the result is ignored.
            write(STDERR_FILENO, msg, strlen(msg));
            while (1) {
                sleep(5);
            }
        } else {
            snprintf(msg, sizeof(msg),
                     "[%s:%05d] Delaying for %d seconds before aborting\n",
                     opal_process_info.nodename, (int) pid, delay);
            // Write only the visible characters: the terminating NUL must
            // not be sent to stderr.  Best effort; the result is ignored.
            write(STDERR_FILENO, msg, strlen(msg));
            do {
                sleep(1);
            } while (--delay > 0);
        }
    }
}

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -89,6 +90,14 @@ OPAL_DECLSPEC int opal_error_register(const char *project,
int err_base, int err_max,
opal_err2str_fn_t converter);
/**
 * Print a message and sleep in accordance with the opal_abort_delay value
 *
 * If opal_abort_delay is 0, this function returns immediately without
 * printing anything.  If it is negative, a message is written to stderr
 * and the function sleeps forever (it never returns).  If it is positive,
 * a message is written to stderr and the function calls sleep(1) that
 * many times before returning.
 *
 * This function is (almost) async-signal-safe so it can be called from
 * a signal handler.
 */
OPAL_DECLSPEC void opal_delay_abort(void);
END_C_DECLS
#endif /* OPAL_UTIL_ERROR_H */

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -45,6 +46,7 @@
#include "opal/util/show_help.h"
#include "opal/util/argv.h"
#include "opal/util/proc.h"
#include "opal/util/error.h"
#include "opal/runtime/opal_params.h"
#ifndef _NSIG
@ -412,6 +414,9 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
opal_stacktrace_output_fileno = -1;
}
/* wait for a while before aborting for debugging */
opal_delay_abort();
/* Raise the signal again, so we don't accidentally mask critical signals.
* For critical signals, it is preferred that we call 'raise' instead of
* 'exit' or 'abort' so that the return status is set properly for this

Просмотреть файл

@ -1,6 +1,7 @@
/*
* Copyright (c) 2013 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -24,6 +25,7 @@
#endif
#include "opal/mca/backtrace/backtrace.h"
#include "opal/util/error.h"
#include "opal/runtime/opal_params.h"
#include "orte/util/proc_info.h"
@ -95,24 +97,8 @@ int oshmem_shmem_abort(int errcode)
}
}
/* Should we wait for a while before aborting? */
if (0 != opal_abort_delay) {
if (opal_abort_delay < 0) {
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
host, (int) pid);
fflush(stderr);
while (1) {
sleep(5);
}
} else {
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
host, (int) pid, opal_abort_delay);
do {
sleep(1);
} while (--opal_abort_delay > 0);
}
}
/* Wait for a while before aborting */
opal_delay_abort();
if (!orte_initialized || !oshmem_shmem_initialized) {
if (orte_show_help_is_available()) {