opal: Add opal_abort_print_stack mca variable with aliases for ompi/oshmem
This commit makes it possible to control the output produced during abnormal oshmem/ompi application termination. It also fixes an issue in the backtrace output: HAVE_BACKTRACE was never set, so the user was limited in controlling this variable. The two related MCA variables are moved to the opal layer, and corresponding aliases are added for ompi and oshmem.
Этот коммит содержится в:
родитель
ab70ca6d16
Коммит
c15bf147bf
@ -16,6 +16,8 @@
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -40,6 +42,7 @@
|
||||
#include <errno.h>
|
||||
|
||||
#include "opal/mca/backtrace/backtrace.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/runtime/mpiruntime.h"
|
||||
@ -137,11 +140,11 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
|
||||
/* Should we print a stack trace? Not aggregated because they
|
||||
might be different on all processes. */
|
||||
if (ompi_mpi_abort_print_stack) {
|
||||
if (opal_abort_print_stack) {
|
||||
char **messages;
|
||||
int len, i;
|
||||
|
||||
if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
||||
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
||||
for (i = 0; i < len; ++i) {
|
||||
fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
|
||||
i, messages[i]);
|
||||
@ -161,7 +164,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
if (errcode < 0 ||
|
||||
asprintf(&msg, "[%s:%d] aborting with MPI error %s%s",
|
||||
host, (int) pid, ompi_mpi_errnum_get_string(errcode),
|
||||
ompi_mpi_abort_print_stack ?
|
||||
opal_abort_print_stack ?
|
||||
" (stack trace available on stderr)" : "") < 0) {
|
||||
msg = NULL;
|
||||
}
|
||||
@ -172,9 +175,9 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
|
||||
/* Should we wait for a while before aborting? */
|
||||
|
||||
if (0 != ompi_mpi_abort_delay) {
|
||||
if (ompi_mpi_abort_delay < 0) {
|
||||
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
|
||||
if (0 != opal_abort_delay) {
|
||||
if (opal_abort_delay < 0) {
|
||||
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
|
||||
host, (int) pid);
|
||||
fflush(stderr);
|
||||
while (1) {
|
||||
@ -182,10 +185,10 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
|
||||
host, (int) pid, ompi_mpi_abort_delay);
|
||||
host, (int) pid, opal_abort_delay);
|
||||
do {
|
||||
sleep(1);
|
||||
} while (--ompi_mpi_abort_delay > 0);
|
||||
} while (--opal_abort_delay > 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -15,6 +15,8 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2015 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -53,8 +55,6 @@ int ompi_debug_show_mpi_alloc_mem_leaks = 0;
|
||||
bool ompi_debug_no_free_handles = false;
|
||||
bool ompi_mpi_show_mca_params = false;
|
||||
char *ompi_mpi_show_mca_params_file = NULL;
|
||||
bool ompi_mpi_abort_print_stack = false;
|
||||
int ompi_mpi_abort_delay = 0;
|
||||
bool ompi_mpi_keep_fqdn_hostnames = false;
|
||||
bool ompi_have_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
|
||||
bool ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
|
||||
@ -206,33 +206,6 @@ int ompi_mpi_register_params(void)
|
||||
|
||||
/* User-level process pinning controls */
|
||||
|
||||
/* MPI_ABORT controls */
|
||||
ompi_mpi_abort_delay = 0;
|
||||
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_delay",
|
||||
"If nonzero, print out an identifying message when MPI_ABORT is invoked (hostname, PID of the process that called MPI_ABORT) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&ompi_mpi_abort_delay);
|
||||
|
||||
ompi_mpi_abort_print_stack = false;
|
||||
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_print_stack",
|
||||
"If nonzero, print out a stack trace when MPI_ABORT is invoked",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
|
||||
/* If we do not have stack trace
|
||||
capability, make this a constant
|
||||
MCA variable */
|
||||
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE
|
||||
0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
#else
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
#endif
|
||||
&ompi_mpi_abort_print_stack);
|
||||
|
||||
ompi_mpi_preconnect_mpi = false;
|
||||
value = mca_base_var_register("ompi", "mpi", NULL, "preconnect_mpi",
|
||||
"Whether to force MPI processes to fully "
|
||||
@ -307,6 +280,18 @@ int ompi_mpi_register_params(void)
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&ompi_mpi_dynamics_enabled);
|
||||
|
||||
value = mca_base_var_find ("opal", "opal", NULL, "abort_delay");
|
||||
if (0 <= value) {
|
||||
(void) mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "abort_delay",
|
||||
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
|
||||
}
|
||||
|
||||
value = mca_base_var_find ("opal", "opal", NULL, "abort_print_stack");
|
||||
if (0 <= value) {
|
||||
(void) mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "abort_print_stack",
|
||||
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -19,6 +19,8 @@
|
||||
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -65,6 +67,8 @@ bool opal_base_distill_checkpoint_ready = false;
|
||||
*/
|
||||
int opal_leave_pinned = -1;
|
||||
bool opal_leave_pinned_pipeline = false;
|
||||
bool opal_abort_print_stack = false;
|
||||
int opal_abort_delay = 0;
|
||||
|
||||
static bool opal_register_done = false;
|
||||
|
||||
@ -280,6 +284,38 @@ int opal_register_params(void)
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&opal_warn_on_fork);
|
||||
|
||||
opal_abort_delay = 0;
|
||||
ret = mca_base_var_register("opal", "opal", NULL, "abort_delay",
|
||||
"If nonzero, print out an identifying message when abort operation is invoked (hostname, PID of the process that called abort) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&opal_abort_delay);
|
||||
if (0 > ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
opal_abort_print_stack = false;
|
||||
ret = mca_base_var_register("opal", "opal", NULL, "abort_print_stack",
|
||||
"If nonzero, print out a stack trace when abort is invoked",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
|
||||
/* If we do not have stack trace
|
||||
capability, make this a constant
|
||||
MCA variable */
|
||||
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE
|
||||
0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
#else
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
#endif
|
||||
&opal_abort_print_stack);
|
||||
if (0 > ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* The ddt engine has a few parameters */
|
||||
ret = opal_datatype_register_params();
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
|
@ -16,6 +16,8 @@
|
||||
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
|
||||
* Copyright (c) 2015 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -55,6 +57,22 @@ OPAL_DECLSPEC extern int opal_leave_pinned;
|
||||
*/
|
||||
OPAL_DECLSPEC extern bool opal_leave_pinned_pipeline;
|
||||
|
||||
/**
|
||||
* Whether an abort operation should print out a stack trace or not.
|
||||
*/
|
||||
OPAL_DECLSPEC extern bool opal_abort_print_stack;
|
||||
|
||||
/**
|
||||
* Whether an abort operation should print out an identifying message
|
||||
* (e.g., hostname and PID) and loop waiting for a debugger to
|
||||
* attach. The value of the integer is how many seconds to wait:
|
||||
*
|
||||
* 0 = do not print the message and do not loop
|
||||
* negative value = print the message and loop forever
|
||||
* positive value = print the message and delay for that many seconds
|
||||
*/
|
||||
OPAL_DECLSPEC extern int opal_abort_delay;
|
||||
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
extern bool opal_progress_debug;
|
||||
#endif
|
||||
|
@ -24,6 +24,7 @@
|
||||
#endif
|
||||
|
||||
#include "opal/mca/backtrace/backtrace.h"
|
||||
#include "opal/runtime/opal_params.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
@ -71,11 +72,11 @@ int oshmem_shmem_abort(int errcode)
|
||||
|
||||
/* Should we print a stack trace? Not aggregated because they
|
||||
might be different on all processes. */
|
||||
if (ompi_mpi_abort_print_stack) {
|
||||
if (opal_abort_print_stack) {
|
||||
char **messages;
|
||||
int len, i;
|
||||
|
||||
if (OSHMEM_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
||||
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
||||
for (i = 0; i < len; ++i) {
|
||||
fprintf(stderr,
|
||||
"[%s:%d] [%d] func:%s\n",
|
||||
@ -94,6 +95,25 @@ int oshmem_shmem_abort(int errcode)
|
||||
}
|
||||
}
|
||||
|
||||
/* Should we wait for a while before aborting? */
|
||||
|
||||
if (0 != opal_abort_delay) {
|
||||
if (opal_abort_delay < 0) {
|
||||
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
|
||||
host, (int) pid);
|
||||
fflush(stderr);
|
||||
while (1) {
|
||||
sleep(5);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
|
||||
host, (int) pid, opal_abort_delay);
|
||||
do {
|
||||
sleep(1);
|
||||
} while (--opal_abort_delay > 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (!orte_initialized || !oshmem_shmem_initialized) {
|
||||
if (orte_show_help_is_available()) {
|
||||
/* TODO help message from SHMEM not from MPI is needed*/
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013 Mellanox Technologies, Inc.
|
||||
* Copyright (c) 2013-2015 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -8,8 +8,12 @@
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "params.h"
|
||||
#include "runtime.h"
|
||||
#include "oshmem_config.h"
|
||||
|
||||
#include "opal/runtime/opal_params.h"
|
||||
|
||||
#include "oshmem/runtime/params.h"
|
||||
#include "oshmem/runtime/runtime.h"
|
||||
#include "oshmem/constants.h"
|
||||
|
||||
|
||||
@ -19,6 +23,8 @@ int oshmem_preconnect_all = 0;
|
||||
|
||||
int oshmem_shmem_register_params(void)
|
||||
{
|
||||
int value;
|
||||
|
||||
(void) mca_base_var_register("oshmem",
|
||||
"oshmem",
|
||||
NULL,
|
||||
@ -63,5 +69,17 @@ int oshmem_shmem_register_params(void)
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&oshmem_preconnect_all);
|
||||
|
||||
value = mca_base_var_find ("opal", "opal", NULL, "abort_delay");
|
||||
if (0 <= value) {
|
||||
(void) mca_base_var_register_synonym(value, "oshmem", "oshmem", NULL, "abort_delay",
|
||||
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
|
||||
}
|
||||
|
||||
value = mca_base_var_find ("opal", "opal", NULL, "abort_print_stack");
|
||||
if (0 <= value) {
|
||||
(void) mca_base_var_register_synonym(value, "oshmem", "oshmem", NULL, "abort_print_stack",
|
||||
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
|
||||
}
|
||||
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013 Mellanox Technologies, Inc.
|
||||
* Copyright (c) 2013-2015 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -19,10 +19,6 @@ BEGIN_C_DECLS
|
||||
* Global variables
|
||||
*/
|
||||
|
||||
/**
|
||||
* Whether an MPI_ABORT should print out a stack trace or not.
|
||||
*/
|
||||
OSHMEM_DECLSPEC extern bool ompi_mpi_abort_print_stack;
|
||||
|
||||
/**
|
||||
* Whether or not the lock routines are recursive
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user