1
1

opal: Add opal_abort_print_stack mca variable with aliases for ompi/oshmem

This commit makes it possible to control the output produced during abnormal
oshmem/ompi application termination.
Fixed an issue in backtrace output: HAVE_BACKTRACE was never set, so the user
had only limited control over this variable.
Two related MCA variables are moved to the opal layer. Corresponding aliases
are added for ompi and oshmem.
Этот коммит содержится в:
igor.ivanov@itseez.com 2015-11-25 15:22:52 +03:00 коммит произвёл Igor Ivanov
родитель ab70ca6d16
Коммит c15bf147bf
7 изменённых файлов: 123 добавлений и 47 удалений

Просмотреть файл

@ -16,6 +16,8 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -40,6 +42,7 @@
#include <errno.h>
#include "opal/mca/backtrace/backtrace.h"
#include "opal/runtime/opal_params.h"
#include "ompi/communicator/communicator.h"
#include "ompi/runtime/mpiruntime.h"
@ -137,11 +140,11 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
/* Should we print a stack trace? Not aggregated because they
might be different on all processes. */
if (ompi_mpi_abort_print_stack) {
if (opal_abort_print_stack) {
char **messages;
int len, i;
if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
for (i = 0; i < len; ++i) {
fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
i, messages[i]);
@ -161,7 +164,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
if (errcode < 0 ||
asprintf(&msg, "[%s:%d] aborting with MPI error %s%s",
host, (int) pid, ompi_mpi_errnum_get_string(errcode),
ompi_mpi_abort_print_stack ?
opal_abort_print_stack ?
" (stack trace available on stderr)" : "") < 0) {
msg = NULL;
}
@ -172,9 +175,9 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
/* Should we wait for a while before aborting? */
if (0 != ompi_mpi_abort_delay) {
if (ompi_mpi_abort_delay < 0) {
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
if (0 != opal_abort_delay) {
if (opal_abort_delay < 0) {
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
host, (int) pid);
fflush(stderr);
while (1) {
@ -182,10 +185,10 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
}
} else {
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
host, (int) pid, ompi_mpi_abort_delay);
host, (int) pid, opal_abort_delay);
do {
sleep(1);
} while (--ompi_mpi_abort_delay > 0);
} while (--opal_abort_delay > 0);
}
}

Просмотреть файл

@ -15,6 +15,8 @@
* reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
* Copyright (c) 2015 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -53,8 +55,6 @@ int ompi_debug_show_mpi_alloc_mem_leaks = 0;
bool ompi_debug_no_free_handles = false;
bool ompi_mpi_show_mca_params = false;
char *ompi_mpi_show_mca_params_file = NULL;
bool ompi_mpi_abort_print_stack = false;
int ompi_mpi_abort_delay = 0;
bool ompi_mpi_keep_fqdn_hostnames = false;
bool ompi_have_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
bool ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
@ -206,33 +206,6 @@ int ompi_mpi_register_params(void)
/* User-level process pinning controls */
/* MPI_ABORT controls */
ompi_mpi_abort_delay = 0;
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_delay",
"If nonzero, print out an identifying message when MPI_ABORT is invoked (hostname, PID of the process that called MPI_ABORT) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mpi_abort_delay);
ompi_mpi_abort_print_stack = false;
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_print_stack",
"If nonzero, print out a stack trace when MPI_ABORT is invoked",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
/* If we do not have stack trace
capability, make this a constant
MCA variable */
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE
0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
#else
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_CONSTANT,
#endif
&ompi_mpi_abort_print_stack);
ompi_mpi_preconnect_mpi = false;
value = mca_base_var_register("ompi", "mpi", NULL, "preconnect_mpi",
"Whether to force MPI processes to fully "
@ -307,6 +280,18 @@ int ompi_mpi_register_params(void)
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mpi_dynamics_enabled);
value = mca_base_var_find ("opal", "opal", NULL, "abort_delay");
if (0 <= value) {
(void) mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "abort_delay",
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
value = mca_base_var_find ("opal", "opal", NULL, "abort_print_stack");
if (0 <= value) {
(void) mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "abort_print_stack",
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
return OMPI_SUCCESS;
}

Просмотреть файл

@ -19,6 +19,8 @@
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -65,6 +67,8 @@ bool opal_base_distill_checkpoint_ready = false;
*/
int opal_leave_pinned = -1;
bool opal_leave_pinned_pipeline = false;
bool opal_abort_print_stack = false;
int opal_abort_delay = 0;
static bool opal_register_done = false;
@ -280,6 +284,38 @@ int opal_register_params(void)
MCA_BASE_VAR_SCOPE_READONLY,
&opal_warn_on_fork);
opal_abort_delay = 0;
ret = mca_base_var_register("opal", "opal", NULL, "abort_delay",
"If nonzero, print out an identifying message when abort operation is invoked (hostname, PID of the process that called abort) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&opal_abort_delay);
if (0 > ret) {
return ret;
}
opal_abort_print_stack = false;
ret = mca_base_var_register("opal", "opal", NULL, "abort_print_stack",
"If nonzero, print out a stack trace when abort is invoked",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
/* If we do not have stack trace
capability, make this a constant
MCA variable */
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE
0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
#else
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
#endif
&opal_abort_print_stack);
if (0 > ret) {
return ret;
}
/* The ddt engine has a few parameters */
ret = opal_datatype_register_params();
if (OPAL_SUCCESS != ret) {

Просмотреть файл

@ -16,6 +16,8 @@
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -55,6 +57,22 @@ OPAL_DECLSPEC extern int opal_leave_pinned;
*/
OPAL_DECLSPEC extern bool opal_leave_pinned_pipeline;
/**
* Whether an abort operation should print out a stack trace or not.
*/
OPAL_DECLSPEC extern bool opal_abort_print_stack;
/**
* Whether an abort operation should print out an identifying message
* (e.g., hostname and PID) and loop waiting for a debugger to
* attach. The value of the integer is how many seconds to wait:
*
* 0 = do not print the message and do not loop
* negative value = print the message and loop forever
* positive value = print the message and delay for that many seconds
*/
OPAL_DECLSPEC extern int opal_abort_delay;
#if OPAL_ENABLE_DEBUG
extern bool opal_progress_debug;
#endif

Просмотреть файл

@ -24,6 +24,7 @@
#endif
#include "opal/mca/backtrace/backtrace.h"
#include "opal/runtime/opal_params.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/runtime.h"
@ -71,11 +72,11 @@ int oshmem_shmem_abort(int errcode)
/* Should we print a stack trace? Not aggregated because they
might be different on all processes. */
if (ompi_mpi_abort_print_stack) {
if (opal_abort_print_stack) {
char **messages;
int len, i;
if (OSHMEM_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
for (i = 0; i < len; ++i) {
fprintf(stderr,
"[%s:%d] [%d] func:%s\n",
@ -94,6 +95,25 @@ int oshmem_shmem_abort(int errcode)
}
}
/* Should we wait for a while before aborting? */
if (0 != opal_abort_delay) {
if (opal_abort_delay < 0) {
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
host, (int) pid);
fflush(stderr);
while (1) {
sleep(5);
}
} else {
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
host, (int) pid, opal_abort_delay);
do {
sleep(1);
} while (--opal_abort_delay > 0);
}
}
if (!orte_initialized || !oshmem_shmem_initialized) {
if (orte_show_help_is_available()) {
/* TODO: a help message from SHMEM, not from MPI, is needed */

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013 Mellanox Technologies, Inc.
* Copyright (c) 2013-2015 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
@ -8,8 +8,12 @@
* $HEADER$
*/
#include "params.h"
#include "runtime.h"
#include "oshmem_config.h"
#include "opal/runtime/opal_params.h"
#include "oshmem/runtime/params.h"
#include "oshmem/runtime/runtime.h"
#include "oshmem/constants.h"
@ -19,6 +23,8 @@ int oshmem_preconnect_all = 0;
int oshmem_shmem_register_params(void)
{
int value;
(void) mca_base_var_register("oshmem",
"oshmem",
NULL,
@ -63,5 +69,17 @@ int oshmem_shmem_register_params(void)
MCA_BASE_VAR_SCOPE_READONLY,
&oshmem_preconnect_all);
value = mca_base_var_find ("opal", "opal", NULL, "abort_delay");
if (0 <= value) {
(void) mca_base_var_register_synonym(value, "oshmem", "oshmem", NULL, "abort_delay",
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
value = mca_base_var_find ("opal", "opal", NULL, "abort_print_stack");
if (0 <= value) {
(void) mca_base_var_register_synonym(value, "oshmem", "oshmem", NULL, "abort_print_stack",
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
return OSHMEM_SUCCESS;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013 Mellanox Technologies, Inc.
* Copyright (c) 2013-2015 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
@ -19,10 +19,6 @@ BEGIN_C_DECLS
* Global variables
*/
/**
* Whether an MPI_ABORT should print out a stack trace or not.
*/
OSHMEM_DECLSPEC extern bool ompi_mpi_abort_print_stack;
/**
* Whether or not the lock routines are recursive