ess/hnp: add support for forwarding additional signals (#2712)
* ess/hnp: add support for forwarding additional signals This commit adds support to the hnp ess module to forward additional signals beyond the default SIGUSR1, SIGUSR2, SIGTSTP, and SIGCONT. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov> * Generalize this a bit to allow a broader range of signals to be forwarded. Turns out that SIGURG is now a "standard" signal, though the value differs across systems. So setup to forward it (and some friends) if they are defined. Allow users to provide the signal name (instead of the integer value) as the value of even the more common signals does vary across systems. Don't limit the number that can be supported. Signed-off-by: Ralph Castain <rhc@open-mpi.org> * ess/hnp: fix some bugs in the signal forwarding code This commit fixes two bugs: - signals_set needs to be set even if no signals are being forwarded. If it is not set we will SEGV in libevent if ess_hnp_forward_signals == none. - SIGTERM and SIGHUP are handled with a different type of handler. Do not allow the user to specify these to be forwarded. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov> * We are sure to get "dinged" if error messages aren't nicely output via show_help, so do so here Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
91c34c8df6
Коммит
110840fc87
@ -10,6 +10,9 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -17,6 +20,8 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_ortedata_DATA = help-ess-hnp.txt
|
||||
|
||||
sources = \
|
||||
ess_hnp.h \
|
||||
ess_hnp_component.c \
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -9,6 +10,9 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -24,12 +28,19 @@ BEGIN_C_DECLS
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_ess_hnp_component_open(void);
|
||||
int orte_ess_hnp_component_close(void);
|
||||
int orte_ess_hnp_component_query(mca_base_module_t **module, int *priority);
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
char *signame;
|
||||
int signal;
|
||||
} ess_hnp_signal_t;
|
||||
OBJ_CLASS_DECLARATION(ess_hnp_signal_t);
|
||||
|
||||
typedef struct {
|
||||
orte_ess_base_component_t base;
|
||||
opal_list_t signals;
|
||||
} orte_ess_hnp_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_ess_base_component_t mca_ess_hnp_component;
|
||||
ORTE_MODULE_DECLSPEC extern orte_ess_hnp_component_t mca_ess_hnp_component;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
|
@ -10,8 +10,9 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -29,45 +30,200 @@
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/ess/hnp/ess_hnp.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
extern orte_ess_base_module_t orte_ess_hnp_module;
|
||||
static int hnp_component_register (void);
|
||||
static int hnp_component_open(void);
|
||||
static int hnp_component_close(void);
|
||||
static int hnp_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
 * Description of a signal that may be named in the forward_signals
 * parameter.
 */
struct known_signal {
    /** signal number */
    int signal;
    /** signal name (string literal, never modified) */
    const char *signame;
    /** can this signal be forwarded */
    bool can_forward;
};

/*
 * Signals that are recognized by name.  Entries flagged as not
 * forwardable are rejected when the user asks to forward them.  Every
 * signal that is not guaranteed to exist on all platforms is wrapped in
 * an #ifdef.  The table ends with an entry whose signame is NULL.
 */
static const struct known_signal known_signals[] = {
    {SIGTERM, "SIGTERM", false},
    {SIGHUP, "SIGHUP", false},
    {SIGINT, "SIGINT", false},
    {SIGKILL, "SIGKILL", false},
#ifdef SIGSYS
    {SIGSYS, "SIGSYS", true},
#endif
#ifdef SIGXCPU
    {SIGXCPU, "SIGXCPU", true},
#endif
#ifdef SIGXFSZ
    {SIGXFSZ, "SIGXFSZ", true},
#endif
#ifdef SIGVTALRM
    {SIGVTALRM, "SIGVTALRM", true},
#endif
#ifdef SIGPROF
    {SIGPROF, "SIGPROF", true},
#endif
#ifdef SIGINFO
    {SIGINFO, "SIGINFO", true},
#endif
#ifdef SIGPWR
    {SIGPWR, "SIGPWR", true},
#endif
    /* terminator */
    {0, NULL, false},
};
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointers to our public functions in it
|
||||
*/
|
||||
orte_ess_base_component_t mca_ess_hnp_component = {
|
||||
.base_version = {
|
||||
ORTE_ESS_BASE_VERSION_3_0_0,
|
||||
orte_ess_hnp_component_t mca_ess_hnp_component = {
|
||||
.base = {
|
||||
.base_version = {
|
||||
ORTE_ESS_BASE_VERSION_3_0_0,
|
||||
|
||||
/* Component name and version */
|
||||
.mca_component_name = "hnp",
|
||||
MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION),
|
||||
/* Component name and version */
|
||||
.mca_component_name = "hnp",
|
||||
MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION),
|
||||
|
||||
/* Component open and close functions */
|
||||
.mca_open_component = orte_ess_hnp_component_open,
|
||||
.mca_close_component = orte_ess_hnp_component_close,
|
||||
.mca_query_component = orte_ess_hnp_component_query,
|
||||
},
|
||||
.base_data = {
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
/* Component open and close functions */
|
||||
.mca_open_component = hnp_component_open,
|
||||
.mca_close_component = hnp_component_close,
|
||||
.mca_query_component = hnp_component_query,
|
||||
.mca_register_component_params = hnp_component_register,
|
||||
},
|
||||
.base_data = {
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static char *additional_signals;
|
||||
|
||||
int
|
||||
orte_ess_hnp_component_open(void)
|
||||
static int hnp_component_register (void)
|
||||
{
|
||||
additional_signals = NULL;
|
||||
(void) mca_base_component_var_register (&mca_ess_hnp_component.base.base_version,
|
||||
"forward_signals", "Comma-delimited list "
|
||||
"of additional signals (names or integers) to forward to "
|
||||
"application processes [\"none\" => forward nothing]", MCA_BASE_VAR_TYPE_STRING,
|
||||
NULL, 0, 0, OPAL_INFO_LVL_4, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&additional_signals);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Append a new ess_hnp_signal_t to the component's signal list.
 *   x - signal number
 *   s - signal name; a strdup'd copy is stored in the new entry
 */
#define ESS_ADDSIGNAL(x, s)                                                  \
    do {                                                                     \
        ess_hnp_signal_t *_entry = OBJ_NEW(ess_hnp_signal_t);                \
        _entry->signal = (x);                                                \
        _entry->signame = strdup((s));                                       \
        opal_list_append(&mca_ess_hnp_component.signals, &_entry->super);    \
    } while(0)
|
||||
|
||||
static int hnp_component_open(void)
|
||||
{
|
||||
int i, sval;
|
||||
char **signals, *tmp;
|
||||
ess_hnp_signal_t *sig;
|
||||
bool ignore, found;
|
||||
|
||||
OBJ_CONSTRUCT(&mca_ess_hnp_component.signals, opal_list_t);
|
||||
|
||||
/* we know that some signals are (nearly) always defined, regardless
|
||||
* of environment, so add them here */
|
||||
ESS_ADDSIGNAL(SIGTSTP, "SIGTSTP");
|
||||
ESS_ADDSIGNAL(SIGUSR1, "SIGUSR1");
|
||||
ESS_ADDSIGNAL(SIGUSR2, "SIGUSR2");
|
||||
ESS_ADDSIGNAL(SIGABRT, "SIGABRT");
|
||||
ESS_ADDSIGNAL(SIGALRM, "SIGALRM");
|
||||
ESS_ADDSIGNAL(SIGCONT, "SIGCONT");
|
||||
#ifdef SIGURG
|
||||
ESS_ADDSIGNAL(SIGURG, "SIGURG");
|
||||
#endif
|
||||
|
||||
/* see if they asked for anything beyond those - note that they may
|
||||
* have asked for some we already cover, and so we ignore any duplicates */
|
||||
if (NULL != additional_signals) {
|
||||
/* if they told us "none", then dump the list */
|
||||
if (0 == strcmp(additional_signals, "none")) {
|
||||
OPAL_LIST_DESTRUCT(&mca_ess_hnp_component.signals);
|
||||
/* need to reconstruct it for when we close */
|
||||
OBJ_CONSTRUCT(&mca_ess_hnp_component.signals, opal_list_t);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
signals = opal_argv_split(additional_signals, ',');
|
||||
for (i=0; NULL != signals[i]; i++) {
|
||||
sval = 0;
|
||||
if (0 != strncmp(signals[i], "SIG", 3)) {
|
||||
/* treat it like a number */
|
||||
errno = 0;
|
||||
sval = strtoul(signals[i], &tmp, 10);
|
||||
if (0 != errno || '\0' != *tmp) {
|
||||
orte_show_help("help-ess-hnp.txt", "ess-hnp:unknown-signal",
|
||||
true, signals[i], additional_signals);
|
||||
opal_argv_free(signals);
|
||||
return OPAL_ERR_SILENT;
|
||||
}
|
||||
}
|
||||
|
||||
/* see if it is one we already covered */
|
||||
ignore = false;
|
||||
OPAL_LIST_FOREACH(sig, &mca_ess_hnp_component.signals, ess_hnp_signal_t) {
|
||||
if (0 == strcasecmp(signals[i], sig->signame) || sval == sig->signal) {
|
||||
/* got it - we will ignore */
|
||||
ignore = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ignore) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* see if they gave us a signal name */
|
||||
found = false;
|
||||
for (int j = 0 ; known_signals[j].signame ; ++j) {
|
||||
if (0 == strcasecmp (signals[i], known_signals[j].signame) || sval == known_signals[j].signal) {
|
||||
if (!known_signals[j].can_forward) {
|
||||
orte_show_help("help-ess-hnp.txt", "ess-hnp:cannot-forward",
|
||||
true, known_signals[j].signame, additional_signals);
|
||||
opal_argv_free(signals);
|
||||
return OPAL_ERR_SILENT;
|
||||
}
|
||||
found = true;
|
||||
ESS_ADDSIGNAL(known_signals[j].signal, known_signals[j].signame);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
if (0 == strncmp(signals[i], "SIG", 3)) {
|
||||
orte_show_help("help-ess-hnp.txt", "ess-hnp:unknown-signal",
|
||||
true, signals[i], additional_signals);
|
||||
opal_argv_free(signals);
|
||||
return OPAL_ERR_SILENT;
|
||||
}
|
||||
|
||||
ESS_ADDSIGNAL(sval, signals[i]);
|
||||
}
|
||||
}
|
||||
opal_argv_free (signals);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int orte_ess_hnp_component_query(mca_base_module_t **module, int *priority)
|
||||
static int hnp_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
|
||||
/* we are the hnp module - we need to be selected
|
||||
@ -86,9 +242,22 @@ int orte_ess_hnp_component_query(mca_base_module_t **module, int *priority)
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
orte_ess_hnp_component_close(void)
|
||||
static int hnp_component_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* instantiate the class */
|
||||
/* constructor: a new entry starts out without a name */
static void scon(ess_hnp_signal_t *t)
{
    t->signame = NULL;
}

/* destructor: release the name string owned by the entry */
static void sdes(ess_hnp_signal_t *t)
{
    /* free() accepts NULL, so no guard is required */
    free(t->signame);
}

OBJ_CLASS_INSTANCE(ess_hnp_signal_t, opal_list_item_t, scon, sdes);
|
||||
|
@ -1,3 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -11,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Research Organization for Information Science
|
||||
@ -118,10 +119,7 @@ static bool forcibly_die=false;
|
||||
static opal_event_t term_handler;
|
||||
static opal_event_t epipe_handler;
|
||||
static int term_pipe[2];
|
||||
static opal_event_t sigusr1_handler;
|
||||
static opal_event_t sigusr2_handler;
|
||||
static opal_event_t sigtstp_handler;
|
||||
static opal_event_t sigcont_handler;
|
||||
static opal_event_t *forward_signals_events = NULL;
|
||||
|
||||
static void abort_signal_callback(int signal);
|
||||
static void clean_abort(int fd, short flags, void *arg);
|
||||
@ -151,6 +149,7 @@ static int rte_init(void)
|
||||
int idx;
|
||||
orte_topology_t *t;
|
||||
opal_list_t transports;
|
||||
ess_hnp_signal_t *sig;
|
||||
|
||||
/* run the prolog */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
|
||||
@ -193,11 +192,20 @@ static int rte_init(void)
|
||||
signal(SIGINT, abort_signal_callback);
|
||||
signal(SIGHUP, abort_signal_callback);
|
||||
|
||||
/** setup callbacks for signals we should foward */
|
||||
setup_sighandler(SIGUSR1, &sigusr1_handler, signal_forward_callback);
|
||||
setup_sighandler(SIGUSR2, &sigusr2_handler, signal_forward_callback);
|
||||
setup_sighandler(SIGTSTP, &sigtstp_handler, signal_forward_callback);
|
||||
setup_sighandler(SIGCONT, &sigcont_handler, signal_forward_callback);
|
||||
/** setup callbacks for signals we should forward */
|
||||
if (0 < (idx = opal_list_get_size(&mca_ess_hnp_component.signals))) {
|
||||
forward_signals_events = (opal_event_t*)malloc(sizeof(opal_event_t) * idx);
|
||||
if (NULL == forward_signals_events) {
|
||||
ret = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
error = "unable to malloc";
|
||||
goto error;
|
||||
}
|
||||
idx = 0;
|
||||
OPAL_LIST_FOREACH(sig, &mca_ess_hnp_component.signals, ess_hnp_signal_t) {
|
||||
setup_sighandler(sig->signal, forward_signals_events + idx, signal_forward_callback);
|
||||
++idx;
|
||||
}
|
||||
}
|
||||
signals_set = true;
|
||||
|
||||
/* get the local topology */
|
||||
@ -782,6 +790,8 @@ static int rte_finalize(void)
|
||||
char *contact_path;
|
||||
orte_job_t *jdata;
|
||||
uint32_t key;
|
||||
ess_hnp_signal_t *sig;
|
||||
unsigned int i;
|
||||
|
||||
if (signals_set) {
|
||||
/* Remove the epipe handler */
|
||||
@ -789,10 +799,13 @@ static int rte_finalize(void)
|
||||
/* remove the term handler */
|
||||
opal_event_del(&term_handler);
|
||||
/** Remove the USR signal handlers */
|
||||
opal_event_signal_del(&sigusr1_handler);
|
||||
opal_event_signal_del(&sigusr2_handler);
|
||||
opal_event_signal_del(&sigtstp_handler);
|
||||
opal_event_signal_del(&sigcont_handler);
|
||||
i = 0;
|
||||
OPAL_LIST_FOREACH(sig, &mca_ess_hnp_component.signals, ess_hnp_signal_t) {
|
||||
opal_event_signal_del(forward_signals_events + i);
|
||||
++i;
|
||||
}
|
||||
free (forward_signals_events);
|
||||
forward_signals_events = NULL;
|
||||
signals_set = false;
|
||||
}
|
||||
|
||||
|
27
orte/mca/ess/hnp/help-ess-hnp.txt
Обычный файл
27
orte/mca/ess/hnp/help-ess-hnp.txt
Обычный файл
@ -0,0 +1,27 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English help file for the ess hnp component.
|
||||
#
|
||||
[ess-hnp:cannot-forward]
|
||||
The system does not support trapping and forwarding of the
|
||||
specified signal:
|
||||
|
||||
signal: %s
|
||||
param: %s
|
||||
|
||||
Please remove that signal from the ess_hnp_forward_signals MCA parameter.
|
||||
[ess-hnp:unknown-signal]
|
||||
The following signal was included in the ess_hnp_forward_signals
|
||||
MCA parameter:
|
||||
|
||||
signal: %s
|
||||
param: %s
|
||||
|
||||
This is not a recognized signal value. Please fix or remove it.
|
Загрузка…
x
Ссылка в новой задаче
Block a user