1
1

Add new "command" notifier component. This component allows forking

any arbitrary command as a notifier, potentially allowing just about
anything to be a notifier.  This component forks a child during
orte_init() to avoid forking problems with some OS-bypass networks.

The following MCA parameters are available:

notifier_command_cmd:
  Default: /sbin/initlog -f $s -n "Open MPI" -s "$S: $m (errorcode: $e)"
  Command to execute, with substitution.  $s = integer severity; $S =
  string severity; $e = integer error code; $m = string message

notifier_command_timeout:
  Default: 30
  Timeout (in seconds) of the command

This commit was SVN r21076.
Этот коммит содержится в:
Jeff Squyres 2009-04-27 13:40:36 +00:00
родитель e5103e1f3d
Коммит b661f160ba
9 изменённых файлов: 1164 добавлений и 0 удалений

49
orte/mca/notifier/command/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,49 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help-message file shipped with (and installed alongside) the component.
dist_pkgdata_DATA = \
help-orte-notifier-command.txt
# Source files shared by the DSO and the static flavors of the component.
sources = \
notifier_command.h \
notifier_command_fd.c \
notifier_command_child.c \
notifier_command_module.c \
notifier_command_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_noinst / component_install is non-empty,
# depending on whether this component is built as a DSO.
if OMPI_BUILD_notifier_command_DSO
component_noinst =
component_install = mca_notifier_command.la
else
component_noinst = libmca_notifier_command.la
component_install =
endif
# DSO flavor: installed into $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_notifier_command_la_SOURCES = $(sources)
mca_notifier_command_la_LDFLAGS = -module -avoid-version
# Static flavor: built here but not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_notifier_command_la_SOURCES =$(sources)
libmca_notifier_command_la_LDFLAGS = -module -avoid-version

18
orte/mca/notifier/command/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,18 @@
# -*- shell-script -*-
#
# Copyright (c) 2007 Sandia National Laboratories. All rights reserved.
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_notifier_command_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Runs action-if-found only when both the fork and the pipe functions
# are available; if either check fails, runs action-if-not-found.
AC_DEFUN([MCA_notifier_command_CONFIG], [
# We need fork() and pipe()
AC_CHECK_FUNC([fork],
[AC_CHECK_FUNC([pipe], [$1], [$2])], [$2])
])

24
orte/mca/notifier/command/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,24 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -0,0 +1,66 @@
# -*- text -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for Open MPI's command notifier support
#
[command not specified]
Error: the Open MPI command notifier component had no command specified.
#
[bad command]
Error: the command notifier component received a bad command in the
notifier_command_cmd MCA parameter. This usually means that there
are mismatched quotes in the command string. Your MPI job may
continue, but the command notifier has been disabled.
Local host: %s
Command: %s
#
[system call fail]
Error: a system call failed during the setup of the command notifier
component. Open MPI is now going to abort your job.
Local host: %s
System call: %s
Errno: %s (%d)
#
[grandchild fail]
The command notifier process died with a non-zero exit status. This
should not happen. Your MPI job will continue, however, and
notifications will attempt to continue. But you may only see this
message once, even if notifications continue to fail.
Local host: %s
Command: %s
Exit status: %s %d
#
[grandchild did not exit]
ERROR: The command notifier process took too long, but was unable to be
killed by Open MPI (Open MPI tried killing it with SIGTERM and
SIGKILL). This should not happen; you should both check the host
where this occurred to see if there are any notifier processes still
running, and check your notifier command and ensure that it is
functioning properly. Your MPI job will continue, however, and
notifications will attempt to continue. But you may only see this
message once, even if notifications continue to fail.
Local host: %s
Command: %s
Timeout (sec): %d
#
[grandchild timeout]
The command notifier process took too long and was killed by Open MPI.
This should not happen; you should check your notifier command and
ensure that it is functioning properly. Your MPI job will continue,
however, and notifications will attempt to continue. But you may only
see this message once, even if notifications continue to fail.
Local host: %s
Command: %s
Timeout (sec): %d
Exit status: %s %d

105
orte/mca/notifier/command/notifier_command.h Обычный файл
Просмотреть файл

@ -0,0 +1,105 @@
/* -*- C -*-
*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#ifndef NOTIFIER_COMMAND_H
#define NOTIFIER_COMMAND_H
#include "orte_config.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "orte/types.h"
#include "orte/mca/notifier/notifier.h"
BEGIN_C_DECLS
/*
 * State of the "command" notifier component: the base notifier
 * component followed by the command configuration and the PID / pipe
 * file descriptors used to talk to the forked helper child.
 */
typedef struct {
orte_notifier_base_component_t super;
/* Command to execute (template; $s, $S, $e and $m are substituted) */
char *cmd;
/* Timeout of the command (seconds) */
int timeout;
/* Priority of this component */
int priority;
/* Child PID (initialized to -1; set from fork() in the component query) */
pid_t child_pid;
/* Pipe to the child: the child reads requests from [0], the parent
   writes them to [1]; -1 means "not open" */
int to_child[2];
/* Pipe to the parent: the parent reads results from [0], the child
   writes them to [1]; -1 means "not open" */
int to_parent[2];
} orte_notifier_command_component_t;
/*
* Notifier interfaces
*/
ORTE_MODULE_DECLSPEC extern orte_notifier_command_component_t
mca_notifier_command_component;
extern orte_notifier_base_module_t orte_notifier_command_module;
/*
* Pipe commands
*/
/* Request codes the parent writes down the to_child pipe; the child's
   main loop reads one of these and dispatches on it. */
typedef enum {
/* Fork/exec a command */
CMD_EXEC,
/* Time to quit */
CMD_TIME_TO_QUIT,
/* Sentinel value */
CMD_MAX
} orte_notifier_command_pipe_cmd_t;
/**
* Simple blocking function to read a specific number of bytes from an
* fd.
*/
int orte_notifier_command_read_fd(int fd, int len, void *buffer);
/**
* Simple blocking function to write a specific number of bytes to an
* fd.
*/
int orte_notifier_command_write_fd(int fd, int len, void *buffer);
/**
* Main entry point for child
*/
void orte_notifier_command_child_main(void);
/**
* Function to split a string into argv, honoring quoting, etc. (and do
* some error checking of the string)
*/
int orte_notifier_command_split(const char *cmd, char ***argv);
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,358 @@
/*
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* Note: this file is a little fast-n-loose with OMPI_HAVE_THREADS --
* it uses this value in run-time "if" conditionals (vs. compile-time
* #if conditionals). We also don't protect including <pthread.h>.
* That's because this component currently only compiles on Linux and
* Solaris, and both of these OS's have pthreads. Using the run-time
* conditionals gives us better compile-time checking, even of code
* that isn't activated.
*
* Note, too, that the functionality in this file does *not* require
* all the heavyweight OMPI thread infrastructure (e.g., from
* --enable-mpi-threads or --enable-progress-threads). All work that
* is done in a separate progress thread is very carefully segregated
* from that of the main thread, and communication back to the main
* thread
*/
#include "orte_config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif
#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#include <ctype.h>
#include "opal/util/argv.h"
#include "orte/constants.h"
#include "notifier_command.h"
/*
 * Split a command string into a NULL-terminated argv array.
 *
 * The string is scanned once, left to right:
 *
 *  - Unquoted whitespace separates tokens; leading, trailing and
 *    repeated whitespace never produces empty tokens.
 *  - '...' and "..." group text (including whitespace) into the
 *    current token.  The delimiting quotes are dropped; a quote
 *    character of the *other* kind inside a quoted section is ordinary
 *    text.  An explicitly quoted empty string ("" or '') yields an
 *    empty token.
 *  - A backslash escapes the next character everywhere: \a \b \f \n
 *    \r \t \v become the corresponding control character, any other
 *    escaped character (quotes, backslash, whitespace, ...) is taken
 *    literally.  A trailing lone backslash is kept as-is.
 *
 * @param cmd_arg   String to split.
 * @param argv_arg  On success receives the new argv, which the caller
 *                  releases with opal_argv_free(); set to NULL on
 *                  failure.
 *
 * @return ORTE_SUCCESS on success; ORTE_ERR_BAD_PARAM if cmd_arg is
 *         NULL, contains an unterminated quote, or contains no tokens
 *         at all; ORTE_ERR_IN_ERRNO if the scratch buffer cannot be
 *         allocated; otherwise the error returned by
 *         opal_argv_append_nosize().
 */
int orte_notifier_command_split(const char *cmd_arg, char ***argv_arg)
{
    int rc = ORTE_SUCCESS;
    const char *p;
    char *token, *out, **argv = NULL;
    char quote = '\0';          /* Active quote character, or '\0' */
    bool in_token = false;      /* Has the current token been started? */

    *argv_arg = NULL;
    if (NULL == cmd_arg) {
        return ORTE_ERR_BAD_PARAM;
    }

    /* Every output character consumes at least one input character,
       so no token can be longer than the whole input string. */
    token = malloc(strlen(cmd_arg) + 1);
    if (NULL == token) {
        return ORTE_ERR_IN_ERRNO;
    }
    out = token;

    for (p = cmd_arg; '\0' != *p; ++p) {
        if ('\\' == *p) {
            /* Escape sequence: valid inside and outside of quotes */
            in_token = true;
            switch (*(p + 1)) {
            case 'a': *out++ = '\a'; ++p; break;
            case 'b': *out++ = '\b'; ++p; break;
            case 'f': *out++ = '\f'; ++p; break;
            case 'n': *out++ = '\n'; ++p; break;
            case 'r': *out++ = '\r'; ++p; break;
            case 't': *out++ = '\t'; ++p; break;
            case 'v': *out++ = '\v'; ++p; break;
            /* Un-terminated escape: keep the backslash.  Do *not*
               skip ahead; this is the end of the string! */
            case '\0': *out++ = '\\'; break;
            /* Anything else (quotes, backslash, ...): take the
               escaped character literally */
            default: *out++ = *(p + 1); ++p; break;
            }
        } else if ('\0' != quote) {
            /* Inside a quoted section only the matching quote is
               special; it ends the section but not the token. */
            if (quote == *p) {
                quote = '\0';
            } else {
                *out++ = *p;
            }
        } else if ('\'' == *p || '"' == *p) {
            quote = *p;
            in_token = true;
        } else if (isspace((unsigned char) *p)) {
            /* Unquoted whitespace ends the current token, if any */
            if (in_token) {
                *out = '\0';
                /* NOTE(review): assumes the OPAL success code equals
                   ORTE_SUCCESS -- confirm against orte/constants.h */
                rc = opal_argv_append_nosize(&argv, token);
                if (ORTE_SUCCESS != rc) {
                    goto error;
                }
                out = token;
                in_token = false;
            }
        } else {
            *out++ = *p;
            in_token = true;
        }
    }

    /* Mismatched quotes */
    if ('\0' != quote) {
        rc = ORTE_ERR_BAD_PARAM;
        goto error;
    }

    /* Get the last token, if there is one */
    if (in_token) {
        *out = '\0';
        rc = opal_argv_append_nosize(&argv, token);
        if (ORTE_SUCCESS != rc) {
            goto error;
        }
    }

    /* A command with no tokens cannot be executed */
    if (NULL == argv) {
        rc = ORTE_ERR_BAD_PARAM;
        goto error;
    }

    free(token);
    *argv_arg = argv;
    return ORTE_SUCCESS;

 error:
    free(token);
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    return rc;
}
/*
* Die nicely
*/
static void diediedie(int status)
{
/* We don't really have any way to report anything, so just close
the pipe fd and die */
close(mca_notifier_command_component.to_child[0]);
close(mca_notifier_command_component.to_parent[1]);
_exit(status);
}
/*
 * Poll (non-blocking waitpid) until the given process has been reaped
 * or the timeout has elapsed.
 *
 * pid:     process to wait for
 * timeout: seconds to keep polling; a value <= 0 means no deadline
 * status:  receives the waitpid() status when the process is reaped
 * exited:  set to true when the process is gone (reaped here, or
 *          waitpid() reported ECHILD), false on timeout
 *
 * Returns ORTE_SUCCESS when the process was reaped, ORTE_ERR_NOT_FOUND
 * when waitpid() failed with ECHILD, and ORTE_ERROR on timeout.  Any
 * other waitpid() failure except EINTR terminates this process.
 */
static int do_wait(pid_t pid, int timeout, int *status, bool *exited)
{
    const time_t start = time(NULL);
    time_t now = start;

    *exited = false;
    for (;;) {
        pid_t reaped;

        /* Deadline check (skipped entirely for timeout <= 0) */
        if (timeout > 0 && now - start >= timeout) {
            break;
        }

        reaped = waitpid(pid, status, WNOHANG);
        if (reaped == pid) {
            *exited = true;
            return ORTE_SUCCESS;
        }
        if (reaped < 0) {
            if (ECHILD == errno) {
                *exited = true;
                return ORTE_ERR_NOT_FOUND;
            }
            if (EINTR != errno) {
                /* Nothing sensible left to do */
                diediedie(10);
            }
        }

        /* Give the process a little time to run before polling again */
        usleep(100);
        now = time(NULL);
    }

    return ORTE_ERROR;
}
/*
* Fork/exec a command from the parent
*/
static void do_exec(void)
{
pid_t pid;
bool exited, killed;
int sel[3], status;
char *msg, *p, *cmd, **argv = NULL;
orte_notifier_command_component_t *c = &mca_notifier_command_component;
/* First three items on the pipe are: severity, errcode, and
string length (sel = Severity, Errcode, string Length. */
if (ORTE_SUCCESS !=
orte_notifier_command_read_fd(c->to_child[0], sizeof(sel), sel)) {
diediedie(1);
}
/* Malloc out enough space for the string to read */
msg = malloc(sel[2] + 1);
if (NULL == msg) {
diediedie(2);
}
if (ORTE_SUCCESS !=
orte_notifier_command_read_fd(c->to_child[0], sel[2] + 1, msg)) {
diediedie(3);
/* What else can we do? */
}
/* We have all the info. Now build up the string command to
exec. Do the $<foo> replacements. */
cmd = strdup(c->cmd);
if ('\0' != *cmd) {
char *temp;
while (NULL != (p = strstr(cmd, "$s"))) {
*p = '\0';
asprintf(&temp, "%s%d%s", cmd, sel[0], p + 2);
free(cmd);
cmd = temp;
}
while (NULL != (p = strstr(cmd, "$S"))) {
*p = '\0';
asprintf(&temp, "%s%s%s", cmd,
((ORTE_NOTIFIER_INFRA == sel[0]) ? "INFRA" :
((ORTE_NOTIFIER_WARNING == sel[0]) ? "WARNING" :
((ORTE_NOTIFIER_NOTICE == sel[0]) ? "NOTICE" :
"UNKNOWN"))), p + 2);
free(cmd);
cmd = temp;
}
while (NULL != (p = strstr(cmd, "$e"))) {
*p = '\0';
asprintf(&temp, "%s%d%s", cmd, sel[1], p + 2);
free(cmd);
cmd = temp;
}
while (NULL != (p = strstr(cmd, "$m"))) {
*p = '\0';
asprintf(&temp, "%s%s%s", cmd, msg, p + 2);
free(cmd);
cmd = temp;
}
}
/* Now break it up into a list of argv */
if (ORTE_SUCCESS != orte_notifier_command_split(cmd, &argv)) {
diediedie(7);
/* What else can we do? */
}
/* Fork off the child and run the command */
pid = fork();
if (pid < 0) {
diediedie(8);
} else if (pid == 0) {
int i;
int fdmax = sysconf(_SC_OPEN_MAX);
for (i = 3; i < fdmax; ++i) {
close(i);
}
/* Run it! */
execvp(argv[0], argv);
/* If we get here, bad */
diediedie(9);
}
free(cmd);
free(msg);
opal_argv_free(argv);
/* Parent: wait for / reap the child. */
do_wait(pid, mca_notifier_command_component.timeout, &status, &exited);
/* If it didn't die, try killing it nicely. If that fails, kill
it dead. */
killed = false;
if (!exited) {
killed = true;
kill(pid, SIGTERM);
do_wait(pid, mca_notifier_command_component.timeout, &status, &exited);
if (!exited) {
kill(pid, SIGKILL);
do_wait(pid, mca_notifier_command_component.timeout, &status,
&exited);
}
}
/* Handshake back up to the parent: just send the status value
back up to the parent and let all interpretation occur up
there. */
sel[0] = (int) exited;
sel[1] = (int) killed;
sel[2] = status;
if (ORTE_SUCCESS !=
orte_notifier_command_write_fd(mca_notifier_command_component.to_parent[1],
sizeof(sel), sel)) {
diediedie(11);
}
}
/*
 * Main loop of the helper child: read one request code at a time
 * from the to_child pipe and act on it.  This function does not
 * return; every way out of the loop ends the process.
 */
void orte_notifier_command_child_main(void)
{
    orte_notifier_command_component_t *comp = &mca_notifier_command_component;
    orte_notifier_command_pipe_cmd_t request;

    for (;;) {
        /* Start from a value that is not a valid request, then block
           until the parent sends one */
        request = -3;
        if (ORTE_SUCCESS !=
            orte_notifier_command_read_fd(comp->to_child[0],
                                          sizeof(request), &request)) {
            diediedie(4);
        }

        if (CMD_EXEC == request) {
            do_exec();
        } else if (CMD_TIME_TO_QUIT == request) {
            diediedie(0);
        } else {
            /* Unknown request: encode it in the exit status */
            diediedie(request + 50);
        }
    }
}

Просмотреть файл

@ -0,0 +1,240 @@
/* -*- C -*-
*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
* Simple command notifier
*/
#include "orte_config.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
#include "orte/constants.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
#include "notifier_command.h"
static int command_open(void);
static int command_component_query(mca_base_module_t **module, int *priority);
static int command_close(void);
static int command_register(void);
/*
 * The component instance: the MCA base component data (version,
 * component name and the open / close / query / register entry
 * points), followed by the initial values of this component's own
 * fields.
 */
orte_notifier_command_component_t mca_notifier_command_component = {
{
{
ORTE_NOTIFIER_BASE_VERSION_1_0_0,
"command",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
command_open,
command_close,
command_component_query,
command_register,
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
},
/* Default command to run (used as the default of the "cmd" parameter) */
"/sbin/initlog -f $s -n \"Open MPI\" -s \"$S: $m (errorcode: $e)\"",
/* Timeout, in seconds (default of the "timeout" parameter) */
30,
/* Priority (default of the "priority" parameter) */
10,
/* PID of child (-1: no child has been forked) */
-1,
/* To-child pipe FDs (-1: not open) */
{ -1, -1 },
/* To-parent pipe FDs (-1: not open) */
{ -1, -1 },
};
/* Safety to ensure we don't try to write down a dead pipe */
static void child_death_cb(pid_t pid, int status, void *data)
{
if (pid == mca_notifier_command_component.child_pid) {
OPAL_OUTPUT((0, "Command notifier: child unexpectedly died! Exited, %d, exitstatus %d", WIFEXITED(status), WEXITSTATUS(status)));
mca_notifier_command_component.child_pid = 0;
mca_notifier_command_component.to_child[1] = -1;
}
}
static int command_register(void)
{
mca_base_param_reg_string(&mca_notifier_command_component.super.base_version,
"cmd",
"Command to execute, with substitution. $s = integer severity; $S = string severity; $e = integer error code; $m = string message",
false, false,
mca_notifier_command_component.cmd,
&mca_notifier_command_component.cmd);
mca_base_param_reg_int(&mca_notifier_command_component.super.base_version,
"timeout",
"Timeout (in seconds) of the command",
false, false,
mca_notifier_command_component.timeout,
&mca_notifier_command_component.timeout);
/* Priority */
mca_base_param_reg_int(&mca_notifier_command_component.super.base_version,
"priority",
"Priority of this component",
false, false,
mca_notifier_command_component.priority,
&mca_notifier_command_component.priority);
return ORTE_SUCCESS;
}
/*
 * Component open entry point.  This component has nothing to set up
 * at open time, so it always returns ORTE_SUCCESS.
 */
static int command_open(void)
{
/* Nothing to do */
return ORTE_SUCCESS;
}
static int command_close(void)
{
if (NULL != mca_notifier_command_component.cmd) {
free(mca_notifier_command_component.cmd);
}
/* Tell the child process to die */
if (0 != mca_notifier_command_component.child_pid &&
-1 != mca_notifier_command_component.to_child[1]) {
orte_notifier_command_pipe_cmd_t cmd = CMD_TIME_TO_QUIT;
orte_notifier_command_write_fd(mca_notifier_command_component.to_child[1],
sizeof(cmd), &cmd);
close(mca_notifier_command_component.to_child[1]);
mca_notifier_command_component.to_child[1] = -1;
close(mca_notifier_command_component.to_parent[0]);
mca_notifier_command_component.to_parent[0] = -1;
}
return ORTE_SUCCESS;
}
/*
 * Close both ends of a pipe (those that are open) and mark them as
 * not open.
 */
static void command_query_close_pipe(int fds[2])
{
    int i;

    for (i = 0; i < 2; ++i) {
        if (-1 != fds[i]) {
            close(fds[i]);
            fds[i] = -1;
        }
    }
}

/*
 * Component query entry point.
 *
 * Sanity-checks the configured command, creates the two pipes, and
 * forks the helper child that will later fork/exec the command on
 * request.  In the child this function never returns.
 *
 * module:   receives the notifier module on success, NULL otherwise
 * priority: receives the component's configured priority on success,
 *           0 otherwise
 *
 * Returns ORTE_SUCCESS on success; ORTE_ERR_NOT_FOUND if no command
 * is configured; ORTE_ERR_BAD_PARAM if the command cannot be parsed;
 * ORTE_ERR_IN_ERRNO if pipe() or fork() fails (errno is preserved and
 * any pipe already created is closed again).
 */
static int command_component_query(mca_base_module_t **module, int *priority)
{
    char **argv = NULL;
    int save;
    orte_notifier_command_component_t *c = &mca_notifier_command_component;

    *priority = 0;
    *module = NULL;

    /* If there's no command, there's no love */
    if (NULL == c->cmd || '\0' == c->cmd[0]) {
        orte_show_help("help-orte-notifier-command.txt",
                       "command not specified", true);
        return ORTE_ERR_NOT_FOUND;
    }

    /* Attempt to parse the command into argv, just as a basic sanity
       check to ensure that it seems to be ok. */
    if (ORTE_SUCCESS != orte_notifier_command_split(c->cmd, &argv)) {
        orte_show_help("help-orte-notifier-command.txt",
                       "bad command", true, orte_process_info.nodename,
                       c->cmd);
        return ORTE_ERR_BAD_PARAM;
    }
    opal_argv_free(argv);

    /* Create the pipes to be used.  The help message arguments must
       follow the "system call fail" template: host, call name, errno
       string, errno value. */
    if (0 != pipe(c->to_child)) {
        save = errno;
        c->to_child[0] = c->to_child[1] = -1;
        orte_show_help("help-orte-notifier-command.txt",
                       "system call fail", true, orte_process_info.nodename,
                       "pipe", strerror(save), save);
        errno = save;
        return ORTE_ERR_IN_ERRNO;
    }
    if (0 != pipe(c->to_parent)) {
        save = errno;
        c->to_parent[0] = c->to_parent[1] = -1;
        /* Don't leak the pipe that was created */
        command_query_close_pipe(c->to_child);
        orte_show_help("help-orte-notifier-command.txt",
                       "system call fail", true, orte_process_info.nodename,
                       "pipe", strerror(save), save);
        errno = save;
        return ORTE_ERR_IN_ERRNO;
    }

    /* Create the child (it'll be shut down in component close if
       we're not selected).  We create the child very early so that we
       do it before any MPI networks are initialized that have
       problems with fork().  The child sits on the other end of a
       pipe and waits for commands from this main process.  Commands
       include telling the child to fork/exec a process and shutting
       down. */
    c->child_pid = fork();
    if (c->child_pid < 0) {
        save = errno;
        command_query_close_pipe(c->to_child);
        command_query_close_pipe(c->to_parent);
        orte_show_help("help-orte-notifier-command.txt",
                       "system call fail", true, orte_process_info.nodename,
                       "fork", strerror(save), save);
        errno = save;
        return ORTE_ERR_IN_ERRNO;
    }

    /* Child: close all fd's except stdin/stdout/stderr and its two
       pipe ends, then call the child main routine */
    if (0 == c->child_pid) {
        int i;
        int fdmax = sysconf(_SC_OPEN_MAX);
        for (i = 3; i < fdmax; ++i) {
            if (i != c->to_child[0] && i != c->to_parent[1]) {
                close(i);
            }
        }
        orte_notifier_command_child_main();
        /* Never returns */
    }

    /* Parent: close the child's ends of the pipes, and remember that
       they are closed so nothing closes them a second time */
    close(c->to_child[0]);
    c->to_child[0] = -1;
    close(c->to_parent[1]);
    c->to_parent[1] = -1;

    /* Let's find out if the child unexpectedly dies */
    orte_wait_cb(c->child_pid, child_death_cb, NULL);

    /* Report the priority configured through the MCA parameter */
    *priority = c->priority;
    *module = (mca_base_module_t *) &orte_notifier_command_module;
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,82 @@
/*
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* Note: this file is a little fast-n-loose with OMPI_HAVE_THREADS --
* it uses this value in run-time "if" conditionals (vs. compile-time
* #if conditionals). We also don't protect including <pthread.h>.
* That's because this component currently only compiles on Linux and
* Solaris, and both of these OS's have pthreads. Using the run-time
* conditionals gives us better compile-time checking, even of code
* that isn't activated.
*
* Note, too, that the functionality in this file does *not* require
* all the heavyweight OMPI thread infrastructure (e.g., from
* --enable-mpi-threads or --enable-progress-threads). All work that
* is done in a separate progress thread is very carefully segregated
* from that of the main thread, and communication back to the main
* thread
*/
#include "orte_config.h"
#include <unistd.h>
#include <errno.h>
#include "orte/constants.h"
#include "notifier_command.h"
/*
 * Read exactly len bytes from fd into buffer, looping over short
 * reads.  Reads that fail with EINTR (interrupted by a signal) or
 * EAGAIN are simply retried.
 *
 * Returns ORTE_SUCCESS once len bytes have been read (immediately if
 * len <= 0), or ORTE_ERROR on end-of-file or any other read error.
 */
int orte_notifier_command_read_fd(int fd, int len, void *buffer)
{
    ssize_t rc;
    char *b = buffer;

    while (len > 0) {
        rc = read(fd, b, len);
        if (rc > 0) {
            len -= (int) rc;
            b += rc;
        } else if (rc < 0 && (EINTR == errno || EAGAIN == errno)) {
            /* Transient condition: try again */
            continue;
        } else {
            /* EOF (rc == 0) or a real error */
            return ORTE_ERROR;
        }
    }
    return ORTE_SUCCESS;
}
/*
 * Write exactly len bytes from buffer to fd, looping over short
 * writes.  Writes that fail with EINTR (interrupted by a signal) or
 * EAGAIN are simply retried.
 *
 * Returns ORTE_SUCCESS once len bytes have been written (immediately
 * if len <= 0), or ORTE_ERROR if write() returns 0 or fails with any
 * other error.
 */
int orte_notifier_command_write_fd(int fd, int len, void *buffer)
{
    ssize_t rc;
    const char *b = buffer;

    while (len > 0) {
        rc = write(fd, b, len);
        if (rc > 0) {
            len -= (int) rc;
            b += rc;
        } else if (rc < 0 && (EINTR == errno || EAGAIN == errno)) {
            /* Transient condition: try again */
            continue;
        } else {
            return ORTE_ERROR;
        }
    }
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,222 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
* Run a user-specified command upon notifier events.
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#ifdef HAVE_STDARG_H
#include <stdarg.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#include "opal/util/show_help.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/error.h"
#include "orte/constants.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/notifier/base/base.h"
#include "notifier_command.h"
static void command_log(int severity, int errcode, const char *msg, ...);
static void command_help(int severity, int errcode, const char *filename,
const char *topic, ...);
static void command_peer(int severity, int errcode,
orte_process_name_t *peer_proc,
const char *msg, ...);
/* Module: the notifier function table handed out by the component
   query.  The first two slots are left NULL (NOTE(review): presumably
   optional init/finalize hooks -- confirm against the
   orte_notifier_base_module_t definition); the remaining three are the
   log, help and peer notification entry points defined in this file. */
orte_notifier_base_module_t orte_notifier_command_module = {
NULL,
NULL,
command_log,
command_help,
command_peer
};
/*
* Back-end function to actually tell the child to fork the command
*/
static int send_command(int severity, int errcode, char *msg)
{
/* csel = Command, Severity, Errcode, string Length */
int rc, csel[4];
csel[0] = CMD_EXEC;
csel[1] = severity;
csel[2] = errcode;
csel[3] = strlen(msg);
/* Write the severity, errcode, and string length */
if (ORTE_SUCCESS !=
(rc = orte_notifier_command_write_fd(mca_notifier_command_component.to_child[1],
sizeof(csel), csel))) {
goto error;
}
/* Now write the message itself */
if (ORTE_SUCCESS !=
(rc = orte_notifier_command_write_fd(mca_notifier_command_component.to_child[1],
csel[3] + 1, msg))) {
goto error;
}
/* Now read back the grandchild's exit status from the child:
0 = 0/1 indicating whether the grandchild exited or not
1 = 0/1 indicating whether the grandchild timed out/was killed or not
2 = exit status returned by waitpid() (only relevant if exited==1) */
if (ORTE_SUCCESS !=
(rc = orte_notifier_command_read_fd(mca_notifier_command_component.to_parent[0],
sizeof(int) * 2, csel))) {
goto error;
}
/* Did the grandchild exit? */
if (0 == csel[0]) {
orte_show_help("help-orte-notifier-command.txt",
"grandchild did not exit", true,
orte_process_info.nodename,
mca_notifier_command_component.cmd,
mca_notifier_command_component.timeout);
return ORTE_ERROR;
}
/* Did the grandchild timeout? */
if (1 == csel[1]) {
orte_show_help("help-orte-notifier-command.txt",
"grandchild timeout", true,
orte_process_info.nodename,
mca_notifier_command_component.cmd,
mca_notifier_command_component.timeout,
WIFEXITED(csel[0]) ? "Exit status" : "Signal",
WIFEXITED(csel[0]) ? WEXITSTATUS(csel[0]) : WTERMSIG(csel[0]));
return ORTE_ERR_TIMEOUT;
}
/* The grandchild exited in less than the timeout -- yay. Did it
exit cleanly? */
if (WIFEXITED(csel[1]) && 0 == WEXITSTATUS(csel[1])) {
return ORTE_SUCCESS;
}
/* Nope -- didn't exit cleanly, so print a warning. */
orte_show_help("help-orte-notifier-command.txt",
"grandchild fail", true, orte_process_info.nodename,
mca_notifier_command_component.cmd,
WIFEXITED(csel[0]) ? "Exit status" : "Signal",
WIFEXITED(csel[0]) ? WEXITSTATUS(csel[0]) : WTERMSIG(csel[0]));
return ORTE_ERROR;
error:
orte_show_help("help-orte-notifier-command.txt",
"system call fail", true, orte_process_info.nodename,
"write", opal_strerror(rc), rc);
return rc;
}
/*
 * Notifier "log" entry point: format the printf-style message and
 * hand it to the command.
 *
 * severity: notifier severity
 * errcode:  error code to report
 * msg:      printf-style format string, followed by its arguments;
 *           nothing is sent if it is NULL or cannot be formatted
 */
static void command_log(int severity, int errcode, const char *msg, ...)
{
    char *output = NULL;
    va_list arglist;
    int rc;

    if (NULL == msg) {
        return;
    }

    va_start(arglist, msg);
    rc = vasprintf(&output, msg, arglist);
    va_end(arglist);

    /* On failure the contents of output are not reliable, so trust
       only the return value (and do not free anything). */
    if (rc < 0 || NULL == output) {
        return;
    }

    send_command(severity, errcode, output);
    free(output);
}
/*
 * Notifier "help" entry point: render the given help-file topic
 * (with its variadic arguments) into a string and hand that string
 * to the command.  Nothing is sent if the topic cannot be rendered.
 */
static void command_help(int severity, int errcode, const char *filename,
                         const char *topic, ...)
{
    char *text;
    va_list ap;

    va_start(ap, topic);
    text = opal_show_help_vstring(filename, topic, false, ap);
    va_end(ap);

    if (NULL == text) {
        return;
    }

    send_command(severity, errcode, text);
    free(text);
}
/*
 * Notifier "peer" entry point: build a message describing an error
 * encountered while communicating with a peer process and hand it to
 * the command.
 *
 * The message has three parts, each appended only while room remains
 * in a buffer of ORTE_NOTIFIER_MAX_BUF characters: who was talking to
 * whom, the error (as a string if one is known, otherwise as a
 * number), and the caller's printf-style message.  Text that does not
 * fit is truncated.
 *
 * severity:  notifier severity
 * errcode:   error code to report
 * peer_proc: the peer involved; may be NULL ("UNKNOWN" is printed)
 * msg:       printf-style format string plus arguments; may be NULL
 */
static void command_peer(int severity, int errcode,
                         orte_process_name_t *peer_proc, const char *msg, ...)
{
    va_list arglist;
    char buf[ORTE_NOTIFIER_MAX_BUF + 1];
    char *peer_host = NULL, *peer_name = NULL;
    char *pos = buf;
    const char *errstr = orte_err2str(errcode);
    int len, space = ORTE_NOTIFIER_MAX_BUF;

    buf[0] = '\0';
    if (peer_proc) {
        peer_host = orte_ess.proc_get_hostname(peer_proc);
        peer_name = ORTE_NAME_PRINT(peer_proc);
    }

    len = snprintf(pos, space,
                   "While communicating to proc %s on node %s,"
                   " proc %s on node %s encountered an error ",
                   peer_name ? peer_name : "UNKNOWN",
                   peer_host ? peer_host : "UNKNOWN",
                   ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                   orte_process_info.nodename);
    /* snprintf() returns the length it *wanted* to write (or a
       negative value on error); never move pos outside the buffer. */
    if (len < 0) {
        *pos = '\0';
    } else if (len >= space) {
        space = 0;
    } else {
        space -= len;
        pos += len;
    }

    if (0 < space) {
        if (errstr) {
            len = snprintf(pos, space, "'%s':", errstr);
        } else {
            len = snprintf(pos, space, "(%d):", errcode);
        }
        if (len < 0) {
            *pos = '\0';
        } else if (len >= space) {
            space = 0;
        } else {
            space -= len;
            pos += len;
        }
    }

    if (0 < space && NULL != msg) {
        va_start(arglist, msg);
        vsnprintf(pos, space, msg, arglist);
        va_end(arglist);
    }

    buf[ORTE_NOTIFIER_MAX_BUF] = '\0';
    send_command(severity, errcode, buf);
}