Add new "command" notifier component. This component allows forking
any arbitrary command as a notifier, potentially allowing just about anything to be a notifier. This component forks a child during orte_init() to avoid forking problems with some OS-bypass networks. The following MCA parameters are available: notifier_command_cmd: Default: /sbin/initlog -f $s -n "Open MPI" -s "$S: $m (errorcode: $e)" Command to execute, with substitution. $s = integer severity; $S = string severity; $e = integer error code; $m = string message notifier_command_timeout: Default: 30 Timeout (in seconds) of the command This commit was SVN r21076.
Этот коммит содержится в:
родитель
e5103e1f3d
Коммит
b661f160ba
49
orte/mca/notifier/command/Makefile.am
Обычный файл
49
orte/mca/notifier/command/Makefile.am
Обычный файл
@ -0,0 +1,49 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = \
|
||||
help-orte-notifier-command.txt
|
||||
|
||||
sources = \
|
||||
notifier_command.h \
|
||||
notifier_command_fd.c \
|
||||
notifier_command_child.c \
|
||||
notifier_command_module.c \
|
||||
notifier_command_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_notifier_command_DSO
|
||||
component_noinst =
|
||||
component_install = mca_notifier_command.la
|
||||
else
|
||||
component_noinst = libmca_notifier_command.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_notifier_command_la_SOURCES = $(sources)
|
||||
mca_notifier_command_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_notifier_command_la_SOURCES =$(sources)
|
||||
libmca_notifier_command_la_LDFLAGS = -module -avoid-version
|
18
orte/mca/notifier/command/configure.m4
Обычный файл
18
orte/mca/notifier/command/configure.m4
Обычный файл
@ -0,0 +1,18 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2007 Sandia National Laboratories. All rights reserved.
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_notifier_command_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_notifier_command_CONFIG], [
|
||||
# We need fork() and pipe()
|
||||
AC_CHECK_FUNC([fork],
|
||||
[AC_CHECK_FUNC([pipe], [$1], [$2])], [$2])
|
||||
])
|
24
orte/mca/notifier/command/configure.params
Обычный файл
24
orte/mca/notifier/command/configure.params
Обычный файл
@ -0,0 +1,24 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_CONFIG_FILES="Makefile"
|
66
orte/mca/notifier/command/help-orte-notifier-command.txt
Обычный файл
66
orte/mca/notifier/command/help-orte-notifier-command.txt
Обычный файл
@ -0,0 +1,66 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English help file for Open MPI's command notifier support
|
||||
#
|
||||
[command not specified]
|
||||
Error: the Open MPI command notifier component had no command specified.
|
||||
#
|
||||
[bad command]
|
||||
Error: the command notifier component received a bad command in the
|
||||
notifier_command_cmd MCA parameter. This usually means that there
|
||||
are mismatched quotes in the command string. Your MPI job may
|
||||
continue, but the command notifier has been disabled.
|
||||
|
||||
Local host: %s
|
||||
Command: %s
|
||||
#
|
||||
[system call fail]
|
||||
Error: a system call failed during the setup of the command notifier
|
||||
component. Open MPI is now going to abort your job.
|
||||
|
||||
Local host: %s
|
||||
System call: %s
|
||||
Errno: %s (%d)
|
||||
#
|
||||
[grandchild fail]
|
||||
The command notifier process died with a non-zero exit status. This
|
||||
should not happen. Your MPI job will continue, however, and
|
||||
notifications will attempt to continue. But you may only see this
|
||||
message once, even if notifications continue to fail.
|
||||
|
||||
Local host: %s
|
||||
Command: %s
|
||||
Exit status: %s %d
|
||||
#
|
||||
[grandchild did not exit]
|
||||
ERROR: The command notifier process took too long, but was unable to be
|
||||
killed by Open MPI (Open MPI tried killing it with SIGTERM and
|
||||
SIGKILL). This should not happen; you should both check the host
|
||||
where this occurred to see if there are any notifier processes still
|
||||
running, and check your notifier command and ensure that it is
|
||||
functioning properly. Your MPI job will continue, however, and
|
||||
notifications will attempt to continue. But you may only see this
|
||||
message once, even if notifications continue to fail.
|
||||
|
||||
Local host: %s
|
||||
Command: %s
|
||||
Timeout (sec): %d
|
||||
#
|
||||
[grandchild timeout]
|
||||
The command notifier process took too long and was killed by Open MPI.
|
||||
This should not happen; you should check your notifier command and
|
||||
ensure that it is functioning properly. Your MPI job will continue,
|
||||
however, and notifications will attempt to continue. But you may only
|
||||
see this message once, even if notifications continue to fail.
|
||||
|
||||
Local host: %s
|
||||
Command: %s
|
||||
Timeout (sec): %d
|
||||
Exit status: %s %d
|
105
orte/mca/notifier/command/notifier_command.h
Обычный файл
105
orte/mca/notifier/command/notifier_command.h
Обычный файл
@ -0,0 +1,105 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
#ifndef NOTIFIER_COMMAND_H
|
||||
#define NOTIFIER_COMMAND_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include "orte/types.h"
|
||||
#include "orte/mca/notifier/notifier.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
typedef struct {
|
||||
orte_notifier_base_component_t super;
|
||||
|
||||
/* Command to execute */
|
||||
char *cmd;
|
||||
|
||||
/* Timeout of the command (seconds) */
|
||||
int timeout;
|
||||
|
||||
/* Priority of this component */
|
||||
int priority;
|
||||
|
||||
/* Child PID */
|
||||
pid_t child_pid;
|
||||
|
||||
/* Pipe to the child */
|
||||
int to_child[2];
|
||||
|
||||
/* Pipe to the parent */
|
||||
int to_parent[2];
|
||||
} orte_notifier_command_component_t;
|
||||
|
||||
|
||||
/*
|
||||
* Notifier interfaces
|
||||
*/
|
||||
ORTE_MODULE_DECLSPEC extern orte_notifier_command_component_t
|
||||
mca_notifier_command_component;
|
||||
extern orte_notifier_base_module_t orte_notifier_command_module;
|
||||
|
||||
/*
 * Requests that the parent sends down the pipe to the helper child
 */
typedef enum {
    /* Run the notifier command (fork/exec) */
    CMD_EXEC,

    /* Shut the helper child down */
    CMD_TIME_TO_QUIT,

    /* Sentinel value; not a real request */
    CMD_MAX
} orte_notifier_command_pipe_cmd_t;
|
||||
|
||||
|
||||
/**
|
||||
* Simple blocking function to read a specific number of bytes from an
|
||||
* fd.
|
||||
*/
|
||||
int orte_notifier_command_read_fd(int fd, int len, void *buffer);
|
||||
|
||||
/**
|
||||
* Simple blocking function to write a specific number of bytes to an
|
||||
* fd.
|
||||
*/
|
||||
int orte_notifier_command_write_fd(int fd, int len, void *buffer);
|
||||
|
||||
/**
|
||||
* Main entry point for child
|
||||
*/
|
||||
void orte_notifier_command_child_main(void);
|
||||
|
||||
/**
|
||||
* Function to split a string into argv, honoring quoting, etc. (and do
|
||||
* some error checking of the string)
|
||||
*/
|
||||
int orte_notifier_command_split(const char *cmd, char ***argv);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
358
orte/mca/notifier/command/notifier_command_child.c
Обычный файл
358
orte/mca/notifier/command/notifier_command_child.c
Обычный файл
@ -0,0 +1,358 @@
|
||||
/*
|
||||
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* Note: this file is a little fast-n-loose with OMPI_HAVE_THREADS --
|
||||
* it uses this value in run-time "if" conditionals (vs. compile-time
|
||||
* #if conditionals). We also don't protect including <pthread.h>.
|
||||
* That's because this component currently only compiles on Linux and
|
||||
* Solaris, and both of these OS's have pthreads. Using the run-time
|
||||
* conditionals gives us better compile-time checking, even of code
|
||||
* that isn't activated.
|
||||
*
|
||||
* Note, too, that the functionality in this file does *not* require
|
||||
* all the heavyweight OMPI thread infrastructure (e.g., from
|
||||
* --enable-mpi-threads or --enable-progress-threads). All work that
|
||||
* is done in a separate progress thread is very carefully segregated
|
||||
* from that of the main thread, and communication back to the main
|
||||
* thread
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <string.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_ERRNO_H
|
||||
#include <errno.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif
|
||||
#include <ctype.h>
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "notifier_command.h"
|
||||
|
||||
|
||||
int orte_notifier_command_split(const char *cmd_arg, char ***argv_arg)
|
||||
{
|
||||
int i;
|
||||
char *cmd, *p, *q, *token_start, **argv = NULL;
|
||||
bool in_space, in_quote, in_2quote;
|
||||
|
||||
*argv_arg = NULL;
|
||||
cmd = strdup(cmd_arg);
|
||||
if (NULL == cmd) {
|
||||
return ORTE_ERR_IN_ERRNO;
|
||||
}
|
||||
|
||||
in_space = in_quote = in_2quote = false;
|
||||
for (token_start = p = cmd; '\0' != *p; ++p) {
|
||||
/* If we're in a quoted string, all we're doing it looking for
|
||||
the matching end quote. Note that finding the end quote
|
||||
does not necessarily mean the end of the token! So use the
|
||||
normal "I found a space [outside of a quote]" processing to
|
||||
find the end of the token. */
|
||||
if (in_quote &&
|
||||
('\'' == *p && p > token_start && '\\' != *(p - 1))) {
|
||||
in_quote = false;
|
||||
} else if (in_2quote &&
|
||||
('\"' == *p && p > token_start && '\\' != *(p - 1))) {
|
||||
in_2quote = false;
|
||||
}
|
||||
|
||||
/* If we hit a space, it could be the end of a token -- unless
|
||||
we're already in a series of spaces. */
|
||||
else if (!in_quote && !in_2quote && isspace(*p)) {
|
||||
if (!in_space) {
|
||||
/* We weren't in a series of spaces, so this was the
|
||||
end of a token. Save it. */
|
||||
in_space = true;
|
||||
*p = '\0';
|
||||
opal_argv_append_nosize(&argv, token_start);
|
||||
token_start = p + 1;
|
||||
} else {
|
||||
/* We're in a series of spaces, so just move
|
||||
token_start up to the next character. */
|
||||
token_start = p + 1;
|
||||
}
|
||||
} else {
|
||||
/* We're not in a series of spaces. We only need to check
|
||||
if we find ' or " to start a quoted string (in which
|
||||
case spaces no longer mark the end of a string). */
|
||||
in_space = false;
|
||||
if ('\'' == *p) {
|
||||
in_quote = true;
|
||||
} else if ('"' == *p) {
|
||||
in_2quote = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (in_quote || in_2quote) {
|
||||
free(cmd);
|
||||
opal_argv_free(argv);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
/* Get the last token, if there is one */
|
||||
if (!in_space) {
|
||||
opal_argv_append_nosize(&argv, token_start);
|
||||
}
|
||||
|
||||
/* Replace escapes and non-escaped quotes */
|
||||
for (i = 0; NULL != argv[i]; ++i) {
|
||||
for (p = q = argv[i]; '\0' != *p; ++p) {
|
||||
if ('\\' == *p) {
|
||||
switch (*(p + 1)) {
|
||||
/* For quotes, just copy them over and
|
||||
double-increment p */
|
||||
case '\'': *q = *(p + 1); ++p; break;
|
||||
case '"': *q = *(p + 1); ++p; break;
|
||||
|
||||
/* For other normal escapes, insert the right code
|
||||
and double-increment p */
|
||||
case 'a': *q = '\a'; ++p; break;
|
||||
case 'b': *q = '\b'; ++p; break;
|
||||
case 'f': *q = '\f'; ++p; break;
|
||||
case 'n': *q = '\n'; ++p; break;
|
||||
case 'r': *q = '\r'; ++p; break;
|
||||
case 't': *q = '\t'; ++p; break;
|
||||
case 'v': *q = '\v'; ++p; break;
|
||||
|
||||
/* For un-terminated escape, just put in a \. Do
|
||||
*not* double increment p; it's the end of the
|
||||
string! */
|
||||
case '\0': *q = '\\'; break;
|
||||
|
||||
/* Otherwise, just copy and double increment */
|
||||
default: *q = *p; ++p; break;
|
||||
}
|
||||
++q;
|
||||
} else {
|
||||
/* Don't copy un-escaped quotes */
|
||||
if ('\'' != *p && '"' != *p) {
|
||||
*q = *p;
|
||||
++q;
|
||||
}
|
||||
}
|
||||
}
|
||||
*q = '\0';
|
||||
}
|
||||
|
||||
*argv_arg = argv;
|
||||
free(cmd);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Die nicely
|
||||
*/
|
||||
static void diediedie(int status)
|
||||
{
|
||||
/* We don't really have any way to report anything, so just close
|
||||
the pipe fd and die */
|
||||
close(mca_notifier_command_component.to_child[0]);
|
||||
close(mca_notifier_command_component.to_parent[1]);
|
||||
_exit(status);
|
||||
}
|
||||
|
||||
/*
|
||||
* Loop over waiting for a child to die
|
||||
*/
|
||||
static int do_wait(pid_t pid, int timeout, int *status, bool *exited)
|
||||
{
|
||||
pid_t pid2;
|
||||
time_t t1, t2;
|
||||
|
||||
t2 = t1 = time(NULL);
|
||||
*exited = false;
|
||||
while (timeout <= 0 || t2 - t1 < timeout) {
|
||||
pid2 = waitpid(pid, status, WNOHANG);
|
||||
if (pid2 == pid) {
|
||||
*exited = true;
|
||||
return ORTE_SUCCESS;
|
||||
} else if (pid2 < 0 && EINTR != errno) {
|
||||
if (ECHILD == errno) {
|
||||
*exited = true;
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* What else can we do? */
|
||||
diediedie(10);
|
||||
}
|
||||
|
||||
/* Let the child run a bit */
|
||||
usleep(100);
|
||||
t2 = time(NULL);
|
||||
}
|
||||
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fork/exec a command from the parent
|
||||
*/
|
||||
static void do_exec(void)
|
||||
{
|
||||
pid_t pid;
|
||||
bool exited, killed;
|
||||
int sel[3], status;
|
||||
char *msg, *p, *cmd, **argv = NULL;
|
||||
orte_notifier_command_component_t *c = &mca_notifier_command_component;
|
||||
|
||||
/* First three items on the pipe are: severity, errcode, and
|
||||
string length (sel = Severity, Errcode, string Length. */
|
||||
if (ORTE_SUCCESS !=
|
||||
orte_notifier_command_read_fd(c->to_child[0], sizeof(sel), sel)) {
|
||||
diediedie(1);
|
||||
}
|
||||
|
||||
/* Malloc out enough space for the string to read */
|
||||
msg = malloc(sel[2] + 1);
|
||||
if (NULL == msg) {
|
||||
diediedie(2);
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
orte_notifier_command_read_fd(c->to_child[0], sel[2] + 1, msg)) {
|
||||
diediedie(3);
|
||||
/* What else can we do? */
|
||||
}
|
||||
|
||||
/* We have all the info. Now build up the string command to
|
||||
exec. Do the $<foo> replacements. */
|
||||
cmd = strdup(c->cmd);
|
||||
if ('\0' != *cmd) {
|
||||
char *temp;
|
||||
|
||||
while (NULL != (p = strstr(cmd, "$s"))) {
|
||||
*p = '\0';
|
||||
asprintf(&temp, "%s%d%s", cmd, sel[0], p + 2);
|
||||
free(cmd);
|
||||
cmd = temp;
|
||||
}
|
||||
|
||||
while (NULL != (p = strstr(cmd, "$S"))) {
|
||||
*p = '\0';
|
||||
asprintf(&temp, "%s%s%s", cmd,
|
||||
((ORTE_NOTIFIER_INFRA == sel[0]) ? "INFRA" :
|
||||
((ORTE_NOTIFIER_WARNING == sel[0]) ? "WARNING" :
|
||||
((ORTE_NOTIFIER_NOTICE == sel[0]) ? "NOTICE" :
|
||||
"UNKNOWN"))), p + 2);
|
||||
free(cmd);
|
||||
cmd = temp;
|
||||
}
|
||||
|
||||
while (NULL != (p = strstr(cmd, "$e"))) {
|
||||
*p = '\0';
|
||||
asprintf(&temp, "%s%d%s", cmd, sel[1], p + 2);
|
||||
free(cmd);
|
||||
cmd = temp;
|
||||
}
|
||||
|
||||
while (NULL != (p = strstr(cmd, "$m"))) {
|
||||
*p = '\0';
|
||||
asprintf(&temp, "%s%s%s", cmd, msg, p + 2);
|
||||
free(cmd);
|
||||
cmd = temp;
|
||||
}
|
||||
}
|
||||
|
||||
/* Now break it up into a list of argv */
|
||||
if (ORTE_SUCCESS != orte_notifier_command_split(cmd, &argv)) {
|
||||
diediedie(7);
|
||||
/* What else can we do? */
|
||||
}
|
||||
|
||||
/* Fork off the child and run the command */
|
||||
pid = fork();
|
||||
if (pid < 0) {
|
||||
diediedie(8);
|
||||
} else if (pid == 0) {
|
||||
int i;
|
||||
int fdmax = sysconf(_SC_OPEN_MAX);
|
||||
for (i = 3; i < fdmax; ++i) {
|
||||
close(i);
|
||||
}
|
||||
|
||||
/* Run it! */
|
||||
execvp(argv[0], argv);
|
||||
/* If we get here, bad */
|
||||
diediedie(9);
|
||||
}
|
||||
free(cmd);
|
||||
free(msg);
|
||||
opal_argv_free(argv);
|
||||
|
||||
/* Parent: wait for / reap the child. */
|
||||
do_wait(pid, mca_notifier_command_component.timeout, &status, &exited);
|
||||
|
||||
/* If it didn't die, try killing it nicely. If that fails, kill
|
||||
it dead. */
|
||||
killed = false;
|
||||
if (!exited) {
|
||||
killed = true;
|
||||
kill(pid, SIGTERM);
|
||||
do_wait(pid, mca_notifier_command_component.timeout, &status, &exited);
|
||||
if (!exited) {
|
||||
kill(pid, SIGKILL);
|
||||
do_wait(pid, mca_notifier_command_component.timeout, &status,
|
||||
&exited);
|
||||
}
|
||||
}
|
||||
|
||||
/* Handshake back up to the parent: just send the status value
|
||||
back up to the parent and let all interpretation occur up
|
||||
there. */
|
||||
sel[0] = (int) exited;
|
||||
sel[1] = (int) killed;
|
||||
sel[2] = status;
|
||||
if (ORTE_SUCCESS !=
|
||||
orte_notifier_command_write_fd(mca_notifier_command_component.to_parent[1],
|
||||
sizeof(sel), sel)) {
|
||||
diediedie(11);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Main entry point for child
|
||||
*/
|
||||
void orte_notifier_command_child_main(void)
|
||||
{
|
||||
orte_notifier_command_pipe_cmd_t cmd;
|
||||
orte_notifier_command_component_t *c = &mca_notifier_command_component;
|
||||
|
||||
while (1) {
|
||||
/* Block waiting for a command */
|
||||
cmd = -3;
|
||||
if (ORTE_SUCCESS !=
|
||||
orte_notifier_command_read_fd(c->to_child[0], sizeof(cmd), &cmd)) {
|
||||
diediedie(4);
|
||||
}
|
||||
|
||||
switch (cmd) {
|
||||
case CMD_EXEC:
|
||||
do_exec();
|
||||
break;
|
||||
|
||||
case CMD_TIME_TO_QUIT:
|
||||
diediedie(0);
|
||||
|
||||
default:
|
||||
diediedie(cmd + 50);
|
||||
}
|
||||
}
|
||||
}
|
240
orte/mca/notifier/command/notifier_command_component.c
Обычный файл
240
orte/mca/notifier/command/notifier_command_component.c
Обычный файл
@ -0,0 +1,240 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/*
|
||||
* Simple command notifier
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/argv.h"
|
||||
|
||||
#include "orte/constants.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "notifier_command.h"
|
||||
|
||||
static int command_open(void);
|
||||
static int command_component_query(mca_base_module_t **module, int *priority);
|
||||
static int command_close(void);
|
||||
static int command_register(void);
|
||||
|
||||
/*
|
||||
* Struct of function pointers that need to be initialized
|
||||
*/
|
||||
orte_notifier_command_component_t mca_notifier_command_component = {
|
||||
{
|
||||
{
|
||||
ORTE_NOTIFIER_BASE_VERSION_1_0_0,
|
||||
|
||||
"command",
|
||||
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
command_open,
|
||||
command_close,
|
||||
command_component_query,
|
||||
command_register,
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
},
|
||||
|
||||
/* Command command to run */
|
||||
"/sbin/initlog -f $s -n \"Open MPI\" -s \"$S: $m (errorcode: $e)\"",
|
||||
|
||||
/* Timeout */
|
||||
30,
|
||||
|
||||
/* Priority */
|
||||
10,
|
||||
|
||||
/* PID of child */
|
||||
-1,
|
||||
|
||||
/* To-child pipe FDs */
|
||||
{ -1, -1 },
|
||||
|
||||
/* To-parent pipe FDs */
|
||||
{ -1, -1 },
|
||||
};
|
||||
|
||||
/* Safety to ensure we don't try to write down a dead pipe */
|
||||
static void child_death_cb(pid_t pid, int status, void *data)
|
||||
{
|
||||
if (pid == mca_notifier_command_component.child_pid) {
|
||||
OPAL_OUTPUT((0, "Command notifier: child unexpectedly died! Exited, %d, exitstatus %d", WIFEXITED(status), WEXITSTATUS(status)));
|
||||
mca_notifier_command_component.child_pid = 0;
|
||||
mca_notifier_command_component.to_child[1] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
static int command_register(void)
|
||||
{
|
||||
mca_base_param_reg_string(&mca_notifier_command_component.super.base_version,
|
||||
"cmd",
|
||||
"Command to execute, with substitution. $s = integer severity; $S = string severity; $e = integer error code; $m = string message",
|
||||
false, false,
|
||||
mca_notifier_command_component.cmd,
|
||||
&mca_notifier_command_component.cmd);
|
||||
|
||||
mca_base_param_reg_int(&mca_notifier_command_component.super.base_version,
|
||||
"timeout",
|
||||
"Timeout (in seconds) of the command",
|
||||
false, false,
|
||||
mca_notifier_command_component.timeout,
|
||||
&mca_notifier_command_component.timeout);
|
||||
|
||||
/* Priority */
|
||||
mca_base_param_reg_int(&mca_notifier_command_component.super.base_version,
|
||||
"priority",
|
||||
"Priority of this component",
|
||||
false, false,
|
||||
mca_notifier_command_component.priority,
|
||||
&mca_notifier_command_component.priority);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int command_open(void)
|
||||
{
|
||||
/* Nothing to do */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int command_close(void)
|
||||
{
|
||||
if (NULL != mca_notifier_command_component.cmd) {
|
||||
free(mca_notifier_command_component.cmd);
|
||||
}
|
||||
|
||||
/* Tell the child process to die */
|
||||
if (0 != mca_notifier_command_component.child_pid &&
|
||||
-1 != mca_notifier_command_component.to_child[1]) {
|
||||
orte_notifier_command_pipe_cmd_t cmd = CMD_TIME_TO_QUIT;
|
||||
orte_notifier_command_write_fd(mca_notifier_command_component.to_child[1],
|
||||
sizeof(cmd), &cmd);
|
||||
|
||||
close(mca_notifier_command_component.to_child[1]);
|
||||
mca_notifier_command_component.to_child[1] = -1;
|
||||
|
||||
close(mca_notifier_command_component.to_parent[0]);
|
||||
mca_notifier_command_component.to_parent[0] = -1;
|
||||
}
|
||||
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Close both ends of a pipe (those that are open) and mark them as
 * closed.
 */
static void command_close_pipe(int fds[2])
{
    int i;

    for (i = 0; i < 2; ++i) {
        if (-1 != fds[i]) {
            close(fds[i]);
            fds[i] = -1;
        }
    }
}

/*
 * Component query function: decide whether this component can be
 * used and, if so, set up the helper child.
 *
 * The configured command must be non-empty and must parse into an
 * argv.  Two pipes are then created and a child is forked that sits
 * on the other end of them running
 * orte_notifier_command_child_main().
 *
 * module:   receives the module to use, or NULL on failure
 * priority: receives the component's configured priority, or 0 on
 *           failure
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_NOT_FOUND (no command),
 * ORTE_ERR_BAD_PARAM (command could not be parsed) or
 * ORTE_ERR_IN_ERRNO (pipe() or fork() failed; errno is preserved).
 */
static int command_component_query(mca_base_module_t **module, int *priority)
{
    char **argv = NULL;
    int save;
    orte_notifier_command_component_t *c = &mca_notifier_command_component;

    *priority = 0;
    *module = NULL;

    /* If there's no command, there's no love */
    if (NULL == c->cmd || '\0' == c->cmd[0]) {
        orte_show_help("help-orte-notifier-command.txt",
                       "command not specified", true);
        return ORTE_ERR_NOT_FOUND;
    }

    /* Attempt to parse the command into argv, just as a basic sanity
       check to ensure that it seems to be ok. */
    if (ORTE_SUCCESS != orte_notifier_command_split(c->cmd, &argv)) {
        orte_show_help("help-orte-notifier-command.txt",
                       "bad command", true, orte_process_info.nodename,
                       c->cmd);
        return ORTE_ERR_BAD_PARAM;
    }
    opal_argv_free(argv);

    /* Create the pipes to be used (they'll be closed in component
       close if we're not selected).  If the second pipe cannot be
       created, don't leak the first one. */
    if (0 != pipe(c->to_child) ||
        0 != pipe(c->to_parent)) {
        save = errno;
        command_close_pipe(c->to_child);
        /* The help message prints "Errno: %s (%d)", so the string
           comes before the number */
        orte_show_help("help-orte-notifier-command.txt",
                       "system call fail", true, orte_process_info.nodename,
                       "pipe", strerror(save), save);
        errno = save;
        return ORTE_ERR_IN_ERRNO;
    }

    /* Create the child (it'll be shut down in component close if
       we're not selected).  We create the child very early so that we
       do it before any MPI networks are initialized that have
       problems with fork().  The child sits on the other end of a
       pipe and waits for commands from this main process.  Commands
       include telling the child to fork/exec a process and shutting
       down. */
    c->child_pid = fork();
    if (c->child_pid < 0) {
        save = errno;
        /* No child will ever use the pipes */
        command_close_pipe(c->to_child);
        command_close_pipe(c->to_parent);
        orte_show_help("help-orte-notifier-command.txt",
                       "system call fail", true, orte_process_info.nodename,
                       "fork", strerror(save), save);
        errno = save;
        return ORTE_ERR_IN_ERRNO;
    }

    /* Child: close all fd's except stdin/stdout/stderr and the
       child's ends of the two pipes, then call the child main
       routine */
    if (0 == c->child_pid) {
        int i;
        int fdmax = sysconf(_SC_OPEN_MAX);

        for (i = 3; i < fdmax; ++i) {
            if (i != c->to_child[0] && i != c->to_parent[1]) {
                close(i);
            }
        }

        orte_notifier_command_child_main();
        /* Never returns */
    }

    /* Parent: close other ends of pipes */
    close(c->to_child[0]);
    c->to_child[0] = -1;
    close(c->to_parent[1]);
    c->to_parent[1] = -1;

    /* Let's find out if the child unexpectedly dies */
    orte_wait_cb(c->child_pid, child_death_cb, NULL);

    /* Report the priority from the "priority" MCA parameter */
    *priority = c->priority;
    *module = (mca_base_module_t *) &orte_notifier_command_module;
    return ORTE_SUCCESS;
}
|
82
orte/mca/notifier/command/notifier_command_fd.c
Обычный файл
82
orte/mca/notifier/command/notifier_command_fd.c
Обычный файл
@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* Note: this file is a little fast-n-loose with OMPI_HAVE_THREADS --
|
||||
* it uses this value in run-time "if" conditionals (vs. compile-time
|
||||
* #if conditionals). We also don't protect including <pthread.h>.
|
||||
* That's because this component currently only compiles on Linux and
|
||||
* Solaris, and both of these OS's have pthreads. Using the run-time
|
||||
* conditionals gives us better compile-time checking, even of code
|
||||
* that isn't activated.
|
||||
*
|
||||
* Note, too, that the functionality in this file does *not* require
|
||||
* all the heavyweight OMPI thread infrastructure (e.g., from
|
||||
* --enable-mpi-threads or --enable-progress-threads). All work that
|
||||
* is done in a separate progress thread is very carefully segregated
|
||||
* from that of the main thread, and communication back to the main
|
||||
* thread
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "notifier_command.h"
|
||||
|
||||
|
||||
/*
|
||||
* Simple loop over reading from a fd
|
||||
*/
|
||||
int orte_notifier_command_read_fd(int fd, int len, void *buffer)
|
||||
{
|
||||
int rc;
|
||||
char *b = buffer;
|
||||
|
||||
while (len > 0) {
|
||||
rc = read(fd, b, len);
|
||||
if (rc < 0 && EAGAIN == errno) {
|
||||
continue;
|
||||
} else if (rc > 0) {
|
||||
len -= rc;
|
||||
b += rc;
|
||||
} else {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Simple loop over writing to an fd
|
||||
*/
|
||||
int orte_notifier_command_write_fd(int fd, int len, void *buffer)
|
||||
{
|
||||
int rc;
|
||||
char *b = buffer;
|
||||
|
||||
while (len > 0) {
|
||||
rc = write(fd, b, len);
|
||||
if (rc < 0 && EAGAIN == errno) {
|
||||
continue;
|
||||
} else if (rc > 0) {
|
||||
len -= rc;
|
||||
b += rc;
|
||||
} else {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
222
orte/mca/notifier/command/notifier_command_module.c
Обычный файл
222
orte/mca/notifier/command/notifier_command_module.c
Обычный файл
@ -0,0 +1,222 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/*
|
||||
* Run a user-specified command upon notifier events.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#ifdef HAVE_STDARG_H
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_SIGNAL_H
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/error.h"
|
||||
|
||||
#include "orte/constants.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/notifier/base/base.h"
|
||||
|
||||
#include "notifier_command.h"
|
||||
|
||||
|
||||
static void command_log(int severity, int errcode, const char *msg, ...);
|
||||
static void command_help(int severity, int errcode, const char *filename,
|
||||
const char *topic, ...);
|
||||
static void command_peer(int severity, int errcode,
|
||||
orte_process_name_t *peer_proc,
|
||||
const char *msg, ...);
|
||||
|
||||
/* Module */
|
||||
orte_notifier_base_module_t orte_notifier_command_module = {
|
||||
NULL,
|
||||
NULL,
|
||||
command_log,
|
||||
command_help,
|
||||
command_peer
|
||||
};
|
||||
|
||||
/*
|
||||
* Back-end function to actually tell the child to fork the command
|
||||
*/
|
||||
static int send_command(int severity, int errcode, char *msg)
|
||||
{
|
||||
/* csel = Command, Severity, Errcode, string Length */
|
||||
int rc, csel[4];
|
||||
csel[0] = CMD_EXEC;
|
||||
csel[1] = severity;
|
||||
csel[2] = errcode;
|
||||
csel[3] = strlen(msg);
|
||||
|
||||
/* Write the severity, errcode, and string length */
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_notifier_command_write_fd(mca_notifier_command_component.to_child[1],
|
||||
sizeof(csel), csel))) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Now write the message itself */
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_notifier_command_write_fd(mca_notifier_command_component.to_child[1],
|
||||
csel[3] + 1, msg))) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Now read back the grandchild's exit status from the child:
|
||||
0 = 0/1 indicating whether the grandchild exited or not
|
||||
1 = 0/1 indicating whether the grandchild timed out/was killed or not
|
||||
2 = exit status returned by waitpid() (only relevant if exited==1) */
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_notifier_command_read_fd(mca_notifier_command_component.to_parent[0],
|
||||
sizeof(int) * 2, csel))) {
|
||||
goto error;
|
||||
}
|
||||
/* Did the grandchild exit? */
|
||||
if (0 == csel[0]) {
|
||||
orte_show_help("help-orte-notifier-command.txt",
|
||||
"grandchild did not exit", true,
|
||||
orte_process_info.nodename,
|
||||
mca_notifier_command_component.cmd,
|
||||
mca_notifier_command_component.timeout);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* Did the grandchild timeout? */
|
||||
if (1 == csel[1]) {
|
||||
orte_show_help("help-orte-notifier-command.txt",
|
||||
"grandchild timeout", true,
|
||||
orte_process_info.nodename,
|
||||
mca_notifier_command_component.cmd,
|
||||
mca_notifier_command_component.timeout,
|
||||
WIFEXITED(csel[0]) ? "Exit status" : "Signal",
|
||||
WIFEXITED(csel[0]) ? WEXITSTATUS(csel[0]) : WTERMSIG(csel[0]));
|
||||
return ORTE_ERR_TIMEOUT;
|
||||
}
|
||||
|
||||
/* The grandchild exited in less than the timeout -- yay. Did it
|
||||
exit cleanly? */
|
||||
if (WIFEXITED(csel[1]) && 0 == WEXITSTATUS(csel[1])) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Nope -- didn't exit cleanly, so print a warning. */
|
||||
orte_show_help("help-orte-notifier-command.txt",
|
||||
"grandchild fail", true, orte_process_info.nodename,
|
||||
mca_notifier_command_component.cmd,
|
||||
WIFEXITED(csel[0]) ? "Exit status" : "Signal",
|
||||
WIFEXITED(csel[0]) ? WEXITSTATUS(csel[0]) : WTERMSIG(csel[0]));
|
||||
return ORTE_ERROR;
|
||||
|
||||
error:
|
||||
orte_show_help("help-orte-notifier-command.txt",
|
||||
"system call fail", true, orte_process_info.nodename,
|
||||
"write", opal_strerror(rc), rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
 * Format a printf-style message and hand it to send_command().
 *
 * severity / errcode are passed through unchanged.  Nothing is sent
 * if msg is NULL or if the message cannot be formatted.
 */
static void command_log(int severity, int errcode, const char *msg, ...)
{
    char *output = NULL;
    va_list arglist;
    int rc;

    /* No format string means there is no message to output */
    if (NULL == msg) {
        return;
    }

    va_start(arglist, msg);
    rc = vasprintf(&output, msg, arglist);
    va_end(arglist);

    /* When vasprintf() fails the contents of output are undefined,
       so it must be neither used nor freed */
    if (rc < 0 || NULL == output) {
        return;
    }

    send_command(severity, errcode, output);
    free(output);
}
|
||||
|
||||
static void command_help(int severity, int errcode, const char *filename,
|
||||
const char *topic, ...)
|
||||
{
|
||||
va_list arglist;
|
||||
char *output;
|
||||
|
||||
va_start(arglist, topic);
|
||||
output = opal_show_help_vstring(filename, topic, false, arglist);
|
||||
va_end(arglist);
|
||||
|
||||
if (NULL != output) {
|
||||
send_command(severity, errcode, output);
|
||||
free(output);
|
||||
}
|
||||
}
|
||||
|
||||
static void command_peer(int severity, int errcode,
|
||||
orte_process_name_t *peer_proc, const char *msg, ...)
|
||||
{
|
||||
va_list arglist;
|
||||
char buf[ORTE_NOTIFIER_MAX_BUF + 1];
|
||||
char *peer_host = NULL, *peer_name = NULL;
|
||||
char *pos = buf;
|
||||
char *errstr = (char*)orte_err2str(errcode);
|
||||
int len, space = ORTE_NOTIFIER_MAX_BUF;
|
||||
|
||||
if (peer_proc) {
|
||||
peer_host = orte_ess.proc_get_hostname(peer_proc);
|
||||
peer_name = ORTE_NAME_PRINT(peer_proc);
|
||||
}
|
||||
|
||||
len = snprintf(pos, space,
|
||||
"While communicating to proc %s on node %s,"
|
||||
" proc %s on node %s encountered an error ",
|
||||
peer_name ? peer_name : "UNKNOWN",
|
||||
peer_host ? peer_host : "UNKNOWN",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_process_info.nodename);
|
||||
space -= len;
|
||||
pos += len;
|
||||
|
||||
if (0 < space) {
|
||||
if (errstr) {
|
||||
len = snprintf(pos, space, "'%s':", errstr);
|
||||
} else {
|
||||
len = snprintf(pos, space, "(%d):", errcode);
|
||||
}
|
||||
space -= len;
|
||||
pos += len;
|
||||
}
|
||||
|
||||
if (0 < space) {
|
||||
va_start(arglist, msg);
|
||||
vsnprintf(pos, space, msg, arglist);
|
||||
va_end(arglist);
|
||||
}
|
||||
|
||||
buf[ORTE_NOTIFIER_MAX_BUF] = '\0';
|
||||
send_command(severity, errcode, buf);
|
||||
}
|
Загрузка…
Ссылка в новой задаче
Block a user