1
1

Merge pull request #1151 from igor-ivanov/pr/opal-abort-vars

Add new MCA variables opal_abort_delay and opal_abort_print_stack
Этот коммит содержится в:
igor-ivanov 2015-12-01 16:27:11 +04:00
родитель 324534b191 c15bf147bf
Коммит d8c85738ab
7 изменённых файлов: 108 добавлений и 86 удалений

Просмотреть файл

@ -16,6 +16,8 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -40,6 +42,7 @@
#include <errno.h>
#include "opal/mca/backtrace/backtrace.h"
#include "opal/runtime/opal_params.h"
#include "ompi/communicator/communicator.h"
#include "ompi/runtime/mpiruntime.h"
@ -137,11 +140,11 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
/* Should we print a stack trace? Not aggregated because they
might be different on all processes. */
if (ompi_mpi_abort_print_stack) {
if (opal_abort_print_stack) {
char **messages;
int len, i;
if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
for (i = 0; i < len; ++i) {
fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
i, messages[i]);
@ -161,7 +164,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
if (errcode < 0 ||
asprintf(&msg, "[%s:%d] aborting with MPI error %s%s",
host, (int) pid, ompi_mpi_errnum_get_string(errcode),
ompi_mpi_abort_print_stack ?
opal_abort_print_stack ?
" (stack trace available on stderr)" : "") < 0) {
msg = NULL;
}
@ -172,9 +175,9 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
/* Should we wait for a while before aborting? */
if (0 != ompi_mpi_abort_delay) {
if (ompi_mpi_abort_delay < 0) {
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
if (0 != opal_abort_delay) {
if (opal_abort_delay < 0) {
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
host, (int) pid);
fflush(stderr);
while (1) {
@ -182,10 +185,10 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
}
} else {
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
host, (int) pid, ompi_mpi_abort_delay);
host, (int) pid, opal_abort_delay);
do {
sleep(1);
} while (--ompi_mpi_abort_delay > 0);
} while (--opal_abort_delay > 0);
}
}

Просмотреть файл

@ -15,6 +15,8 @@
* reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
* Copyright (c) 2015 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -53,8 +55,6 @@ int ompi_debug_show_mpi_alloc_mem_leaks = 0;
bool ompi_debug_no_free_handles = false;
bool ompi_mpi_show_mca_params = false;
char *ompi_mpi_show_mca_params_file = NULL;
bool ompi_mpi_abort_print_stack = false;
int ompi_mpi_abort_delay = 0;
bool ompi_mpi_keep_fqdn_hostnames = false;
bool ompi_have_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
bool ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
@ -206,33 +206,6 @@ int ompi_mpi_register_params(void)
/* User-level process pinning controls */
/* MPI_ABORT controls */
ompi_mpi_abort_delay = 0;
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_delay",
"If nonzero, print out an identifying message when MPI_ABORT is invoked (hostname, PID of the process that called MPI_ABORT) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mpi_abort_delay);
ompi_mpi_abort_print_stack = false;
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_print_stack",
"If nonzero, print out a stack trace when MPI_ABORT is invoked",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
/* If we do not have stack trace
capability, make this a constant
MCA variable */
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE
0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
#else
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_CONSTANT,
#endif
&ompi_mpi_abort_print_stack);
ompi_mpi_preconnect_mpi = false;
value = mca_base_var_register("ompi", "mpi", NULL, "preconnect_mpi",
"Whether to force MPI processes to fully "
@ -307,6 +280,18 @@ int ompi_mpi_register_params(void)
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mpi_dynamics_enabled);
value = mca_base_var_find ("opal", "opal", NULL, "abort_delay");
if (0 <= value) {
(void) mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "abort_delay",
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
value = mca_base_var_find ("opal", "opal", NULL, "abort_print_stack");
if (0 <= value) {
(void) mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "abort_print_stack",
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
return OMPI_SUCCESS;
}

Просмотреть файл

@ -19,6 +19,8 @@
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -65,6 +67,8 @@ bool opal_base_distill_checkpoint_ready = false;
*/
int opal_leave_pinned = -1;
bool opal_leave_pinned_pipeline = false;
/* Abort-handling controls. Both are reset to the defaults below and
   registered as MCA variables ("opal", "opal", NULL, "abort_print_stack"
   and "abort_delay") in opal_register_params(). */
/* When true, abort paths print a stack trace to stderr before aborting. */
bool opal_abort_print_stack = false;
/* Seconds to delay before aborting: 0 = no delay, < 0 = loop forever,
   > 0 = sleep that many seconds. */
int opal_abort_delay = 0;
static bool opal_register_done = false;
@ -280,6 +284,38 @@ int opal_register_params(void)
MCA_BASE_VAR_SCOPE_READONLY,
&opal_warn_on_fork);
opal_abort_delay = 0;
ret = mca_base_var_register("opal", "opal", NULL, "abort_delay",
"If nonzero, print out an identifying message when abort operation is invoked (hostname, PID of the process that called abort) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&opal_abort_delay);
if (0 > ret) {
return ret;
}
opal_abort_print_stack = false;
ret = mca_base_var_register("opal", "opal", NULL, "abort_print_stack",
"If nonzero, print out a stack trace when abort is invoked",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
/* If we do not have stack trace
capability, make this a constant
MCA variable */
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE
0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
#else
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
#endif
&opal_abort_print_stack);
if (0 > ret) {
return ret;
}
/* The ddt engine has a few parameters */
ret = opal_datatype_register_params();
if (OPAL_SUCCESS != ret) {

Просмотреть файл

@ -16,6 +16,8 @@
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -55,6 +57,22 @@ OPAL_DECLSPEC extern int opal_leave_pinned;
*/
OPAL_DECLSPEC extern bool opal_leave_pinned_pipeline;
/**
* Whether an abort operation should print out a stack trace or not.
*
* Defaults to false. Backs the MCA variable registered in
* opal_register_params() as ("opal", "opal", NULL, "abort_print_stack").
* When OPAL_WANT_PRETTY_PRINT_STACKTRACE is not set, that registration
* uses MCA_BASE_VAR_FLAG_DEFAULT_ONLY with MCA_BASE_VAR_SCOPE_CONSTANT
* instead of MCA_BASE_VAR_SCOPE_READONLY.
*/
OPAL_DECLSPEC extern bool opal_abort_print_stack;
/**
* Whether an abort operation should print out an identifying message
* (e.g., hostname and PID) and loop waiting for a debugger to
* attach. The value of the integer is how many seconds to wait:
*
* 0 = do not print the message and do not loop (the default)
* negative value = print the message and loop forever
* positive value = print the message and delay for that many seconds
*
* Backs the MCA variable registered in opal_register_params() as
* ("opal", "opal", NULL, "abort_delay").
*
* NOTE: for a positive value, the abort paths that honor this variable
* (ompi_mpi_abort(), oshmem_shmem_abort()) decrement it in place once
* per second while sleeping, so its value is not preserved after the
* delay has elapsed.
*/
OPAL_DECLSPEC extern int opal_abort_delay;
#if OPAL_ENABLE_DEBUG
extern bool opal_progress_debug;
#endif

Просмотреть файл

@ -24,6 +24,7 @@
#endif
#include "opal/mca/backtrace/backtrace.h"
#include "opal/runtime/opal_params.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/runtime.h"
@ -71,7 +72,7 @@ int oshmem_shmem_abort(int errcode)
/* Should we print a stack trace? Not aggregated because they
might be different on all processes. */
if (oshmem_shmem_abort_print_stack) {
if (opal_abort_print_stack) {
char **messages;
int len, i;
@ -95,9 +96,10 @@ int oshmem_shmem_abort(int errcode)
}
/* Should we wait for a while before aborting? */
if (0 != oshmem_shmem_abort_delay) {
if (oshmem_shmem_abort_delay < 0) {
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
if (0 != opal_abort_delay) {
if (opal_abort_delay < 0) {
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
host, (int) pid);
fflush(stderr);
while (1) {
@ -105,10 +107,10 @@ int oshmem_shmem_abort(int errcode)
}
} else {
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
host, (int) pid, oshmem_shmem_abort_delay);
host, (int) pid, opal_abort_delay);
do {
sleep(1);
} while (--oshmem_shmem_abort_delay > 0);
} while (--opal_abort_delay > 0);
}
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013 Mellanox Technologies, Inc.
* Copyright (c) 2013-2015 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
@ -8,41 +8,22 @@
* $HEADER$
*/
#include "params.h"
#include "runtime.h"
#include "oshmem_config.h"
#include "opal/runtime/opal_params.h"
#include "oshmem/runtime/params.h"
#include "oshmem/runtime/runtime.h"
#include "oshmem/constants.h"
bool oshmem_shmem_abort_print_stack = false;
int oshmem_shmem_abort_delay = 0;
int oshmem_shmem_lock_recursive = 0;
int oshmem_shmem_api_verbose = 0;
int oshmem_preconnect_all = 0;
int oshmem_shmem_register_params(void)
{
oshmem_shmem_abort_delay = 0;
(void) mca_base_var_register("oshmem",
"oshmem",
NULL,
"abort_delay",
"If nonzero, print out an identifying message when abort is invoked (hostname, PID of the process that called abort operation) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&oshmem_shmem_abort_delay);
oshmem_shmem_abort_print_stack = false;
(void) mca_base_var_register("oshmem",
"oshmem",
NULL,
"abort_print_stack",
"If nonzero, print out a stack trace when abort is invoked",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&oshmem_shmem_abort_print_stack);
int value;
(void) mca_base_var_register("oshmem",
"oshmem",
@ -88,5 +69,17 @@ int oshmem_shmem_register_params(void)
MCA_BASE_VAR_SCOPE_READONLY,
&oshmem_preconnect_all);
value = mca_base_var_find ("opal", "opal", NULL, "abort_delay");
if (0 <= value) {
(void) mca_base_var_register_synonym(value, "oshmem", "oshmem", NULL, "abort_delay",
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
value = mca_base_var_find ("opal", "opal", NULL, "abort_print_stack");
if (0 <= value) {
(void) mca_base_var_register_synonym(value, "oshmem", "oshmem", NULL, "abort_print_stack",
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
return OSHMEM_SUCCESS;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013 Mellanox Technologies, Inc.
* Copyright (c) 2013-2015 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
@ -19,21 +19,6 @@ BEGIN_C_DECLS
* Global variables
*/
/**
* Whether an abort should print out a stack trace or not.
*/
OSHMEM_DECLSPEC extern bool oshmem_shmem_abort_print_stack;
/**
* Whether abort should print out an identifying message
* (e.g., hostname and PID) and loop waiting for a debugger to
* attach. The value of the integer is how many seconds to wait:
*
* 0 = do not print the message and do not loop
* negative value = print the message and loop forever
* positive value = print the message and delay for that many seconds
*/
OSHMEM_DECLSPEC extern int oshmem_shmem_abort_delay;
/**
* Whether or not the lock routines are recursive