Add new "command" notifier component. This component allows forking
any arbitrary command as a notifier, potentially allowing just about anything to be a notifier. This component forks a child during orte_init() to avoid forking problems with some OS-bypass networks. The following MCA parameters are available: notifier_command_cmd: Default: /sbin/initlog -f $s -n "Open MPI" -s "$S: $m (errorcode: $e)" Command to execute, with substitution. $s = integer severity; $S = string severity; $e = integer error code; $m = string message notifier_command_timeout: Default: 30 Timeout (in seconds) of the command This commit was SVN r21076.
Этот коммит содержится в:
родитель
e5103e1f3d
Коммит
b661f160ba
49
orte/mca/notifier/command/Makefile.am
Обычный файл
49
orte/mca/notifier/command/Makefile.am
Обычный файл
@ -0,0 +1,49 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
# Help text shipped (and distributed) with the component
dist_pkgdata_DATA = \
        help-orte-notifier-command.txt

# Every file that makes up the component, for both build flavors below
sources = \
        notifier_command.h \
        notifier_command_fd.c \
        notifier_command_child.c \
        notifier_command_module.c \
        notifier_command_component.c

# The component is built in this directory under one of two names:
#   mca_<type>_<name>.la     when built as a DSO
#   libmca_<type>_<name>.la  when built statically
# Exactly one of component_install / component_noinst is non-empty.

if OMPI_BUILD_notifier_command_DSO
component_noinst =
component_install = mca_notifier_command.la
else
component_noinst = libmca_notifier_command.la
component_install =
endif

# DSO flavor
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_notifier_command_la_SOURCES = $(sources)
mca_notifier_command_la_LDFLAGS = -module -avoid-version

# Static flavor
noinst_LTLIBRARIES = $(component_noinst)
libmca_notifier_command_la_SOURCES = $(sources)
libmca_notifier_command_la_LDFLAGS = -module -avoid-version
|
18
orte/mca/notifier/command/configure.m4
Обычный файл
18
orte/mca/notifier/command/configure.m4
Обычный файл
@ -0,0 +1,18 @@
|
|||||||
|
# -*- shell-script -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2007 Sandia National Laboratories. All rights reserved.
|
||||||
|
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
# MCA_notifier_command_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Runs action-if-found only when both fork() and pipe() are
# available; otherwise runs action-if-not-found.
AC_DEFUN([MCA_notifier_command_CONFIG], [
    # fork() is checked first; pipe() is only checked if fork() exists
    AC_CHECK_FUNC([fork],
        [AC_CHECK_FUNC([pipe],
            [$1],
            [$2])],
        [$2])
])
|
24
orte/mca/notifier/command/configure.params
Обычный файл
24
orte/mca/notifier/command/configure.params
Обычный файл
@ -0,0 +1,24 @@
|
|||||||
|
# -*- shell-script -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||||
|
# reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
# Settings specific to this component: the only file configure has to
# generate here is the Makefile.

PARAM_CONFIG_FILES="Makefile"
|
66
orte/mca/notifier/command/help-orte-notifier-command.txt
Обычный файл
66
orte/mca/notifier/command/help-orte-notifier-command.txt
Обычный файл
@ -0,0 +1,66 @@
|
|||||||
|
# -*- text -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
# This is the US/English help file for Open MPI's command notifier support
|
||||||
|
#
|
||||||
|
[command not specified]
|
||||||
|
Error: the Open MPI command notifier component had no command specified.
|
||||||
|
#
|
||||||
|
[bad command]
|
||||||
|
Error: the command notifier component received a bad command in the
|
||||||
|
notifier_command_cmd MCA parameter. This usually means that there
|
||||||
|
are mismatched quotes in the command string. Your MPI job may
|
||||||
|
continue, but the command notifier has been disabled.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
Command: %s
|
||||||
|
#
|
||||||
|
[system call fail]
|
||||||
|
Error: a system call failed during the setup of the command notifier
|
||||||
|
component. Open MPI is now going to abort your job.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
System call: %s
|
||||||
|
Errno: %s (%d)
|
||||||
|
#
|
||||||
|
[grandchild fail]
|
||||||
|
The command notifier process died with a non-zero exit status. This
|
||||||
|
should not happen. Your MPI job will continue, however, and
|
||||||
|
notifications will attempt to continue. But you may only see this
|
||||||
|
message once, even if notifications continue to fail.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
Command: %s
|
||||||
|
Exit status: %s %d
|
||||||
|
#
|
||||||
|
[grandchild did not exit]
|
||||||
|
ERROR: The command notifier process took too long, but was unable to be
|
||||||
|
killed by Open MPI (Open MPI tried killing it with SIGTERM and
|
||||||
|
SIGKILL). This should not happen; you should both check the host
|
||||||
|
where this occurred to see if there are any notifier processes still
|
||||||
|
running, and check your notifier command and ensure that it is
|
||||||
|
functioning properly. Your MPI job will continue, however, and
|
||||||
|
notifications will attempt to continue. But you may only see this
|
||||||
|
message once, even if notifications continue to fail.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
Command: %s
|
||||||
|
Timeout (sec): %d
|
||||||
|
#
|
||||||
|
[grandchild timeout]
|
||||||
|
The command notifier process took too long and was killed by Open MPI.
|
||||||
|
This should not happen; you should check your notifier command and
|
||||||
|
ensure that it is functioning properly. Your MPI job will continue,
|
||||||
|
however, and notifications will attempt to continue. But you may only
|
||||||
|
see this message once, even if notifications continue to fail.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
Command: %s
|
||||||
|
Timeout (sec): %d
|
||||||
|
Exit status: %s %d
|
105
orte/mca/notifier/command/notifier_command.h
Обычный файл
105
orte/mca/notifier/command/notifier_command.h
Обычный файл
@ -0,0 +1,105 @@
|
|||||||
|
/* -*- C -*-
|
||||||
|
*
|
||||||
|
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
#ifndef NOTIFIER_COMMAND_H
|
||||||
|
#define NOTIFIER_COMMAND_H
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#ifdef HAVE_UNISTD_H
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "orte/types.h"
|
||||||
|
#include "orte/mca/notifier/notifier.h"
|
||||||
|
|
||||||
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
orte_notifier_base_component_t super;
|
||||||
|
|
||||||
|
/* Command to execute */
|
||||||
|
char *cmd;
|
||||||
|
|
||||||
|
/* Timeout of the command (seconds) */
|
||||||
|
int timeout;
|
||||||
|
|
||||||
|
/* Priority of this component */
|
||||||
|
int priority;
|
||||||
|
|
||||||
|
/* Child PID */
|
||||||
|
pid_t child_pid;
|
||||||
|
|
||||||
|
/* Pipe to the child */
|
||||||
|
int to_child[2];
|
||||||
|
|
||||||
|
/* Pipe to the parent */
|
||||||
|
int to_parent[2];
|
||||||
|
} orte_notifier_command_component_t;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Notifier interfaces
|
||||||
|
*/
|
||||||
|
ORTE_MODULE_DECLSPEC extern orte_notifier_command_component_t
|
||||||
|
mca_notifier_command_component;
|
||||||
|
extern orte_notifier_base_module_t orte_notifier_command_module;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Pipe commands
|
||||||
|
*/
|
||||||
|
typedef enum {
    /* Ask the child to fork/exec the notifier command */
    CMD_EXEC,

    /* Ask the child to exit */
    CMD_TIME_TO_QUIT,

    /* Not a command: one past the last valid value */
    CMD_MAX
} orte_notifier_command_pipe_cmd_t;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple blocking function to read a specific number of bytes from an
|
||||||
|
* fd.
|
||||||
|
*/
|
||||||
|
int orte_notifier_command_read_fd(int fd, int len, void *buffer);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple blocking function to write a specific number of bytes to an
|
||||||
|
* fd.
|
||||||
|
*/
|
||||||
|
int orte_notifier_command_write_fd(int fd, int len, void *buffer);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main entry point for child
|
||||||
|
*/
|
||||||
|
void orte_notifier_command_child_main(void);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Function to split a string into argv, honoring quoting, etc. (and do
|
||||||
|
* some error checking of the string)
|
||||||
|
*/
|
||||||
|
int orte_notifier_command_split(const char *cmd, char ***argv);
|
||||||
|
|
||||||
|
END_C_DECLS
|
||||||
|
|
||||||
|
#endif
|
358
orte/mca/notifier/command/notifier_command_child.c
Обычный файл
358
orte/mca/notifier/command/notifier_command_child.c
Обычный файл
@ -0,0 +1,358 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
|
||||||
|
*
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Note: this file is a little fast-n-loose with OMPI_HAVE_THREADS --
|
||||||
|
* it uses this value in run-time "if" conditionals (vs. compile-time
|
||||||
|
* #if conditionals). We also don't protect including <pthread.h>.
|
||||||
|
* That's because this component currently only compiles on Linux and
|
||||||
|
* Solaris, and both of these OS's have pthreads. Using the run-time
|
||||||
|
* conditionals gives us better compile-time checking, even of code
|
||||||
|
* that isn't activated.
|
||||||
|
*
|
||||||
|
* Note, too, that the functionality in this file does *not* require
|
||||||
|
* all the heavyweight OMPI thread infrastructure (e.g., from
|
||||||
|
* --enable-mpi-threads or --enable-progress-threads). All work that
|
||||||
|
* is done in a separate progress thread is very carefully segregated
|
||||||
|
* from that of the main thread, and communication back to the main
|
||||||
|
* thread
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include <string.h>
|
||||||
|
#ifdef HAVE_UNISTD_H
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_ERRNO_H
|
||||||
|
#include <errno.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_SYS_WAIT_H
|
||||||
|
#include <sys/wait.h>
|
||||||
|
#endif
|
||||||
|
#include <ctype.h>
|
||||||
|
|
||||||
|
#include "opal/util/argv.h"
|
||||||
|
|
||||||
|
#include "orte/constants.h"
|
||||||
|
|
||||||
|
#include "notifier_command.h"
|
||||||
|
|
||||||
|
|
||||||
|
int orte_notifier_command_split(const char *cmd_arg, char ***argv_arg)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
char *cmd, *p, *q, *token_start, **argv = NULL;
|
||||||
|
bool in_space, in_quote, in_2quote;
|
||||||
|
|
||||||
|
*argv_arg = NULL;
|
||||||
|
cmd = strdup(cmd_arg);
|
||||||
|
if (NULL == cmd) {
|
||||||
|
return ORTE_ERR_IN_ERRNO;
|
||||||
|
}
|
||||||
|
|
||||||
|
in_space = in_quote = in_2quote = false;
|
||||||
|
for (token_start = p = cmd; '\0' != *p; ++p) {
|
||||||
|
/* If we're in a quoted string, all we're doing it looking for
|
||||||
|
the matching end quote. Note that finding the end quote
|
||||||
|
does not necessarily mean the end of the token! So use the
|
||||||
|
normal "I found a space [outside of a quote]" processing to
|
||||||
|
find the end of the token. */
|
||||||
|
if (in_quote &&
|
||||||
|
('\'' == *p && p > token_start && '\\' != *(p - 1))) {
|
||||||
|
in_quote = false;
|
||||||
|
} else if (in_2quote &&
|
||||||
|
('\"' == *p && p > token_start && '\\' != *(p - 1))) {
|
||||||
|
in_2quote = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If we hit a space, it could be the end of a token -- unless
|
||||||
|
we're already in a series of spaces. */
|
||||||
|
else if (!in_quote && !in_2quote && isspace(*p)) {
|
||||||
|
if (!in_space) {
|
||||||
|
/* We weren't in a series of spaces, so this was the
|
||||||
|
end of a token. Save it. */
|
||||||
|
in_space = true;
|
||||||
|
*p = '\0';
|
||||||
|
opal_argv_append_nosize(&argv, token_start);
|
||||||
|
token_start = p + 1;
|
||||||
|
} else {
|
||||||
|
/* We're in a series of spaces, so just move
|
||||||
|
token_start up to the next character. */
|
||||||
|
token_start = p + 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/* We're not in a series of spaces. We only need to check
|
||||||
|
if we find ' or " to start a quoted string (in which
|
||||||
|
case spaces no longer mark the end of a string). */
|
||||||
|
in_space = false;
|
||||||
|
if ('\'' == *p) {
|
||||||
|
in_quote = true;
|
||||||
|
} else if ('"' == *p) {
|
||||||
|
in_2quote = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (in_quote || in_2quote) {
|
||||||
|
free(cmd);
|
||||||
|
opal_argv_free(argv);
|
||||||
|
return ORTE_ERR_BAD_PARAM;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get the last token, if there is one */
|
||||||
|
if (!in_space) {
|
||||||
|
opal_argv_append_nosize(&argv, token_start);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Replace escapes and non-escaped quotes */
|
||||||
|
for (i = 0; NULL != argv[i]; ++i) {
|
||||||
|
for (p = q = argv[i]; '\0' != *p; ++p) {
|
||||||
|
if ('\\' == *p) {
|
||||||
|
switch (*(p + 1)) {
|
||||||
|
/* For quotes, just copy them over and
|
||||||
|
double-increment p */
|
||||||
|
case '\'': *q = *(p + 1); ++p; break;
|
||||||
|
case '"': *q = *(p + 1); ++p; break;
|
||||||
|
|
||||||
|
/* For other normal escapes, insert the right code
|
||||||
|
and double-increment p */
|
||||||
|
case 'a': *q = '\a'; ++p; break;
|
||||||
|
case 'b': *q = '\b'; ++p; break;
|
||||||
|
case 'f': *q = '\f'; ++p; break;
|
||||||
|
case 'n': *q = '\n'; ++p; break;
|
||||||
|
case 'r': *q = '\r'; ++p; break;
|
||||||
|
case 't': *q = '\t'; ++p; break;
|
||||||
|
case 'v': *q = '\v'; ++p; break;
|
||||||
|
|
||||||
|
/* For un-terminated escape, just put in a \. Do
|
||||||
|
*not* double increment p; it's the end of the
|
||||||
|
string! */
|
||||||
|
case '\0': *q = '\\'; break;
|
||||||
|
|
||||||
|
/* Otherwise, just copy and double increment */
|
||||||
|
default: *q = *p; ++p; break;
|
||||||
|
}
|
||||||
|
++q;
|
||||||
|
} else {
|
||||||
|
/* Don't copy un-escaped quotes */
|
||||||
|
if ('\'' != *p && '"' != *p) {
|
||||||
|
*q = *p;
|
||||||
|
++q;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*q = '\0';
|
||||||
|
}
|
||||||
|
|
||||||
|
*argv_arg = argv;
|
||||||
|
free(cmd);
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Die nicely
|
||||||
|
*/
|
||||||
|
static void diediedie(int status)
|
||||||
|
{
|
||||||
|
/* We don't really have any way to report anything, so just close
|
||||||
|
the pipe fd and die */
|
||||||
|
close(mca_notifier_command_component.to_child[0]);
|
||||||
|
close(mca_notifier_command_component.to_parent[1]);
|
||||||
|
_exit(status);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Loop over waiting for a child to die
|
||||||
|
*/
|
||||||
|
static int do_wait(pid_t pid, int timeout, int *status, bool *exited)
|
||||||
|
{
|
||||||
|
pid_t pid2;
|
||||||
|
time_t t1, t2;
|
||||||
|
|
||||||
|
t2 = t1 = time(NULL);
|
||||||
|
*exited = false;
|
||||||
|
while (timeout <= 0 || t2 - t1 < timeout) {
|
||||||
|
pid2 = waitpid(pid, status, WNOHANG);
|
||||||
|
if (pid2 == pid) {
|
||||||
|
*exited = true;
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
} else if (pid2 < 0 && EINTR != errno) {
|
||||||
|
if (ECHILD == errno) {
|
||||||
|
*exited = true;
|
||||||
|
return ORTE_ERR_NOT_FOUND;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* What else can we do? */
|
||||||
|
diediedie(10);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Let the child run a bit */
|
||||||
|
usleep(100);
|
||||||
|
t2 = time(NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ORTE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Fork/exec a command from the parent
|
||||||
|
*/
|
||||||
|
static void do_exec(void)
|
||||||
|
{
|
||||||
|
pid_t pid;
|
||||||
|
bool exited, killed;
|
||||||
|
int sel[3], status;
|
||||||
|
char *msg, *p, *cmd, **argv = NULL;
|
||||||
|
orte_notifier_command_component_t *c = &mca_notifier_command_component;
|
||||||
|
|
||||||
|
/* First three items on the pipe are: severity, errcode, and
|
||||||
|
string length (sel = Severity, Errcode, string Length. */
|
||||||
|
if (ORTE_SUCCESS !=
|
||||||
|
orte_notifier_command_read_fd(c->to_child[0], sizeof(sel), sel)) {
|
||||||
|
diediedie(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Malloc out enough space for the string to read */
|
||||||
|
msg = malloc(sel[2] + 1);
|
||||||
|
if (NULL == msg) {
|
||||||
|
diediedie(2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ORTE_SUCCESS !=
|
||||||
|
orte_notifier_command_read_fd(c->to_child[0], sel[2] + 1, msg)) {
|
||||||
|
diediedie(3);
|
||||||
|
/* What else can we do? */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* We have all the info. Now build up the string command to
|
||||||
|
exec. Do the $<foo> replacements. */
|
||||||
|
cmd = strdup(c->cmd);
|
||||||
|
if ('\0' != *cmd) {
|
||||||
|
char *temp;
|
||||||
|
|
||||||
|
while (NULL != (p = strstr(cmd, "$s"))) {
|
||||||
|
*p = '\0';
|
||||||
|
asprintf(&temp, "%s%d%s", cmd, sel[0], p + 2);
|
||||||
|
free(cmd);
|
||||||
|
cmd = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (NULL != (p = strstr(cmd, "$S"))) {
|
||||||
|
*p = '\0';
|
||||||
|
asprintf(&temp, "%s%s%s", cmd,
|
||||||
|
((ORTE_NOTIFIER_INFRA == sel[0]) ? "INFRA" :
|
||||||
|
((ORTE_NOTIFIER_WARNING == sel[0]) ? "WARNING" :
|
||||||
|
((ORTE_NOTIFIER_NOTICE == sel[0]) ? "NOTICE" :
|
||||||
|
"UNKNOWN"))), p + 2);
|
||||||
|
free(cmd);
|
||||||
|
cmd = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (NULL != (p = strstr(cmd, "$e"))) {
|
||||||
|
*p = '\0';
|
||||||
|
asprintf(&temp, "%s%d%s", cmd, sel[1], p + 2);
|
||||||
|
free(cmd);
|
||||||
|
cmd = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (NULL != (p = strstr(cmd, "$m"))) {
|
||||||
|
*p = '\0';
|
||||||
|
asprintf(&temp, "%s%s%s", cmd, msg, p + 2);
|
||||||
|
free(cmd);
|
||||||
|
cmd = temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Now break it up into a list of argv */
|
||||||
|
if (ORTE_SUCCESS != orte_notifier_command_split(cmd, &argv)) {
|
||||||
|
diediedie(7);
|
||||||
|
/* What else can we do? */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Fork off the child and run the command */
|
||||||
|
pid = fork();
|
||||||
|
if (pid < 0) {
|
||||||
|
diediedie(8);
|
||||||
|
} else if (pid == 0) {
|
||||||
|
int i;
|
||||||
|
int fdmax = sysconf(_SC_OPEN_MAX);
|
||||||
|
for (i = 3; i < fdmax; ++i) {
|
||||||
|
close(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Run it! */
|
||||||
|
execvp(argv[0], argv);
|
||||||
|
/* If we get here, bad */
|
||||||
|
diediedie(9);
|
||||||
|
}
|
||||||
|
free(cmd);
|
||||||
|
free(msg);
|
||||||
|
opal_argv_free(argv);
|
||||||
|
|
||||||
|
/* Parent: wait for / reap the child. */
|
||||||
|
do_wait(pid, mca_notifier_command_component.timeout, &status, &exited);
|
||||||
|
|
||||||
|
/* If it didn't die, try killing it nicely. If that fails, kill
|
||||||
|
it dead. */
|
||||||
|
killed = false;
|
||||||
|
if (!exited) {
|
||||||
|
killed = true;
|
||||||
|
kill(pid, SIGTERM);
|
||||||
|
do_wait(pid, mca_notifier_command_component.timeout, &status, &exited);
|
||||||
|
if (!exited) {
|
||||||
|
kill(pid, SIGKILL);
|
||||||
|
do_wait(pid, mca_notifier_command_component.timeout, &status,
|
||||||
|
&exited);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Handshake back up to the parent: just send the status value
|
||||||
|
back up to the parent and let all interpretation occur up
|
||||||
|
there. */
|
||||||
|
sel[0] = (int) exited;
|
||||||
|
sel[1] = (int) killed;
|
||||||
|
sel[2] = status;
|
||||||
|
if (ORTE_SUCCESS !=
|
||||||
|
orte_notifier_command_write_fd(mca_notifier_command_component.to_parent[1],
|
||||||
|
sizeof(sel), sel)) {
|
||||||
|
diediedie(11);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Main entry point for child
|
||||||
|
*/
|
||||||
|
void orte_notifier_command_child_main(void)
|
||||||
|
{
|
||||||
|
orte_notifier_command_pipe_cmd_t cmd;
|
||||||
|
orte_notifier_command_component_t *c = &mca_notifier_command_component;
|
||||||
|
|
||||||
|
while (1) {
|
||||||
|
/* Block waiting for a command */
|
||||||
|
cmd = -3;
|
||||||
|
if (ORTE_SUCCESS !=
|
||||||
|
orte_notifier_command_read_fd(c->to_child[0], sizeof(cmd), &cmd)) {
|
||||||
|
diediedie(4);
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (cmd) {
|
||||||
|
case CMD_EXEC:
|
||||||
|
do_exec();
|
||||||
|
break;
|
||||||
|
|
||||||
|
case CMD_TIME_TO_QUIT:
|
||||||
|
diediedie(0);
|
||||||
|
|
||||||
|
default:
|
||||||
|
diediedie(cmd + 50);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
240
orte/mca/notifier/command/notifier_command_component.c
Обычный файл
240
orte/mca/notifier/command/notifier_command_component.c
Обычный файл
@ -0,0 +1,240 @@
|
|||||||
|
/* -*- C -*-
|
||||||
|
*
|
||||||
|
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Simple command notifier
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#ifdef HAVE_UNISTD_H
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_STRING_H
|
||||||
|
#include <string.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
|
#include "opal/util/argv.h"
|
||||||
|
|
||||||
|
#include "orte/constants.h"
|
||||||
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
#include "orte/util/show_help.h"
|
||||||
|
|
||||||
|
#include "notifier_command.h"
|
||||||
|
|
||||||
|
static int command_open(void);
|
||||||
|
static int command_component_query(mca_base_module_t **module, int *priority);
|
||||||
|
static int command_close(void);
|
||||||
|
static int command_register(void);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Struct of function pointers that need to be initialized
|
||||||
|
*/
|
||||||
|
orte_notifier_command_component_t mca_notifier_command_component = {
|
||||||
|
{
|
||||||
|
{
|
||||||
|
ORTE_NOTIFIER_BASE_VERSION_1_0_0,
|
||||||
|
|
||||||
|
"command",
|
||||||
|
|
||||||
|
ORTE_MAJOR_VERSION,
|
||||||
|
ORTE_MINOR_VERSION,
|
||||||
|
ORTE_RELEASE_VERSION,
|
||||||
|
|
||||||
|
command_open,
|
||||||
|
command_close,
|
||||||
|
command_component_query,
|
||||||
|
command_register,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* The component is checkpoint ready */
|
||||||
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
/* Command command to run */
|
||||||
|
"/sbin/initlog -f $s -n \"Open MPI\" -s \"$S: $m (errorcode: $e)\"",
|
||||||
|
|
||||||
|
/* Timeout */
|
||||||
|
30,
|
||||||
|
|
||||||
|
/* Priority */
|
||||||
|
10,
|
||||||
|
|
||||||
|
/* PID of child */
|
||||||
|
-1,
|
||||||
|
|
||||||
|
/* To-child pipe FDs */
|
||||||
|
{ -1, -1 },
|
||||||
|
|
||||||
|
/* To-parent pipe FDs */
|
||||||
|
{ -1, -1 },
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Safety to ensure we don't try to write down a dead pipe */
|
||||||
|
static void child_death_cb(pid_t pid, int status, void *data)
|
||||||
|
{
|
||||||
|
if (pid == mca_notifier_command_component.child_pid) {
|
||||||
|
OPAL_OUTPUT((0, "Command notifier: child unexpectedly died! Exited, %d, exitstatus %d", WIFEXITED(status), WEXITSTATUS(status)));
|
||||||
|
mca_notifier_command_component.child_pid = 0;
|
||||||
|
mca_notifier_command_component.to_child[1] = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int command_register(void)
|
||||||
|
{
|
||||||
|
mca_base_param_reg_string(&mca_notifier_command_component.super.base_version,
|
||||||
|
"cmd",
|
||||||
|
"Command to execute, with substitution. $s = integer severity; $S = string severity; $e = integer error code; $m = string message",
|
||||||
|
false, false,
|
||||||
|
mca_notifier_command_component.cmd,
|
||||||
|
&mca_notifier_command_component.cmd);
|
||||||
|
|
||||||
|
mca_base_param_reg_int(&mca_notifier_command_component.super.base_version,
|
||||||
|
"timeout",
|
||||||
|
"Timeout (in seconds) of the command",
|
||||||
|
false, false,
|
||||||
|
mca_notifier_command_component.timeout,
|
||||||
|
&mca_notifier_command_component.timeout);
|
||||||
|
|
||||||
|
/* Priority */
|
||||||
|
mca_base_param_reg_int(&mca_notifier_command_component.super.base_version,
|
||||||
|
"priority",
|
||||||
|
"Priority of this component",
|
||||||
|
false, false,
|
||||||
|
mca_notifier_command_component.priority,
|
||||||
|
&mca_notifier_command_component.priority);
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int command_open(void)
|
||||||
|
{
|
||||||
|
/* Nothing to do */
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int command_close(void)
|
||||||
|
{
|
||||||
|
if (NULL != mca_notifier_command_component.cmd) {
|
||||||
|
free(mca_notifier_command_component.cmd);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Tell the child process to die */
|
||||||
|
if (0 != mca_notifier_command_component.child_pid &&
|
||||||
|
-1 != mca_notifier_command_component.to_child[1]) {
|
||||||
|
orte_notifier_command_pipe_cmd_t cmd = CMD_TIME_TO_QUIT;
|
||||||
|
orte_notifier_command_write_fd(mca_notifier_command_component.to_child[1],
|
||||||
|
sizeof(cmd), &cmd);
|
||||||
|
|
||||||
|
close(mca_notifier_command_component.to_child[1]);
|
||||||
|
mca_notifier_command_component.to_child[1] = -1;
|
||||||
|
|
||||||
|
close(mca_notifier_command_component.to_parent[0]);
|
||||||
|
mca_notifier_command_component.to_parent[0] = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Close whichever ends of a pipe are open and mark them as closed.
 */
static void command_close_pipe(int fds[2])
{
    int i;

    for (i = 0; i < 2; ++i) {
        if (-1 != fds[i]) {
            close(fds[i]);
            fds[i] = -1;
        }
    }
}

/*
 * Component query: validate the configured command, create the two
 * pipes, and fork the helper child that will later fork/exec the
 * notifier command on request.
 *
 * @param module    Set to the command notifier module on success,
 *                  NULL otherwise
 * @param priority  Set to this component's priority on success,
 *                  0 otherwise
 *
 * @return ORTE_SUCCESS on success; ORTE_ERR_NOT_FOUND if no command
 *         is configured; ORTE_ERR_BAD_PARAM if the command cannot be
 *         parsed; ORTE_ERR_IN_ERRNO if pipe() or fork() failed.
 */
static int command_component_query(mca_base_module_t **module, int *priority)
{
    char **argv = NULL;
    orte_notifier_command_component_t *c = &mca_notifier_command_component;

    *priority = 0;
    *module = NULL;

    /* If there's no command, there's no love */
    if (NULL == c->cmd || '\0' == c->cmd[0]) {
        orte_show_help("help-orte-notifier-command.txt",
                       "command not specified", true);
        return ORTE_ERR_NOT_FOUND;
    }

    /* Attempt to parse the command into argv, just as a basic sanity
       check to ensure that it seems to be ok. */
    if (ORTE_SUCCESS != orte_notifier_command_split(c->cmd, &argv)) {
        orte_show_help("help-orte-notifier-command.txt",
                       "bad command", true, orte_process_info.nodename,
                       c->cmd);
        return ORTE_ERR_BAD_PARAM;
    }
    opal_argv_free(argv);

    /* Create the pipes to be used */
    if (0 != pipe(c->to_child) || 0 != pipe(c->to_parent)) {
        int save = errno;

        /* The first pipe may have been created before the second one
           failed; don't leak it */
        command_close_pipe(c->to_child);
        command_close_pipe(c->to_parent);

        /* The help message expects the error string first, then the
           errno value ("Errno: %s (%d)") */
        orte_show_help("help-orte-notifier-command.txt",
                       "system call fail", true, orte_process_info.nodename,
                       "pipe", strerror(save), save);
        errno = save;
        return ORTE_ERR_IN_ERRNO;
    }

    /* Create the child (it'll be shut down in component close if
       we're not selected).  We create the child very early so that we
       do it before any MPI networks are initialized that have
       problems with fork().  The child sits on the other end of a
       pipe and waits for commands from this main process.  Commands
       include telling the child to fork/exec a process and shutting
       down. */
    c->child_pid = fork();
    if (c->child_pid < 0) {
        int save = errno;

        command_close_pipe(c->to_child);
        command_close_pipe(c->to_parent);
        orte_show_help("help-orte-notifier-command.txt",
                       "system call fail", true, orte_process_info.nodename,
                       "fork", strerror(save), save);
        errno = save;
        return ORTE_ERR_IN_ERRNO;
    }

    /* Child: close all fd's except stdin/stdout/stderr and the two
       pipe ends the child uses, then call the child main routine */
    if (0 == c->child_pid) {
        int i;
        int fdmax = sysconf(_SC_OPEN_MAX);

        for (i = 3; i < fdmax; ++i) {
            if (i != c->to_child[0] && i != c->to_parent[1]) {
                close(i);
            }
        }

        orte_notifier_command_child_main();
        /* Never returns */
    }

    /* Parent: close other ends of pipes, and forget the fd numbers
       so that they can never be closed a second time */
    close(c->to_child[0]);
    c->to_child[0] = -1;
    close(c->to_parent[1]);
    c->to_parent[1] = -1;

    /* Let's find out if the child unexpectedly dies */
    orte_wait_cb(c->child_pid, child_death_cb, 0);

    /* Honor the "priority" MCA parameter (which defaults to 10) */
    *priority = c->priority;
    *module = (mca_base_module_t *) &orte_notifier_command_module;
    return ORTE_SUCCESS;
}
|
82
orte/mca/notifier/command/notifier_command_fd.c
Обычный файл
82
orte/mca/notifier/command/notifier_command_fd.c
Обычный файл
@ -0,0 +1,82 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
|
||||||
|
*
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Note: this file is a little fast-n-loose with OMPI_HAVE_THREADS --
|
||||||
|
* it uses this value in run-time "if" conditionals (vs. compile-time
|
||||||
|
* #if conditionals). We also don't protect including <pthread.h>.
|
||||||
|
* That's because this component currently only compiles on Linux and
|
||||||
|
* Solaris, and both of these OS's have pthreads. Using the run-time
|
||||||
|
* conditionals gives us better compile-time checking, even of code
|
||||||
|
* that isn't activated.
|
||||||
|
*
|
||||||
|
* Note, too, that the functionality in this file does *not* require
|
||||||
|
* all the heavyweight OMPI thread infrastructure (e.g., from
|
||||||
|
* --enable-mpi-threads or --enable-progress-threads). All work that
|
||||||
|
* is done in a separate progress thread is very carefully segregated
|
||||||
|
* from that of the main thread, and communication back to the main
|
||||||
|
* thread
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <errno.h>
|
||||||
|
|
||||||
|
#include "orte/constants.h"
|
||||||
|
|
||||||
|
#include "notifier_command.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Simple loop over reading from a fd
|
||||||
|
*/
|
||||||
|
int orte_notifier_command_read_fd(int fd, int len, void *buffer)
|
||||||
|
{
|
||||||
|
int rc;
|
||||||
|
char *b = buffer;
|
||||||
|
|
||||||
|
while (len > 0) {
|
||||||
|
rc = read(fd, b, len);
|
||||||
|
if (rc < 0 && EAGAIN == errno) {
|
||||||
|
continue;
|
||||||
|
} else if (rc > 0) {
|
||||||
|
len -= rc;
|
||||||
|
b += rc;
|
||||||
|
} else {
|
||||||
|
return ORTE_ERROR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Simple loop over writing to an fd
|
||||||
|
*/
|
||||||
|
int orte_notifier_command_write_fd(int fd, int len, void *buffer)
|
||||||
|
{
|
||||||
|
int rc;
|
||||||
|
char *b = buffer;
|
||||||
|
|
||||||
|
while (len > 0) {
|
||||||
|
rc = write(fd, b, len);
|
||||||
|
if (rc < 0 && EAGAIN == errno) {
|
||||||
|
continue;
|
||||||
|
} else if (rc > 0) {
|
||||||
|
len -= rc;
|
||||||
|
b += rc;
|
||||||
|
} else {
|
||||||
|
return ORTE_ERROR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
222
orte/mca/notifier/command/notifier_command_module.c
Обычный файл
222
orte/mca/notifier/command/notifier_command_module.c
Обычный файл
@ -0,0 +1,222 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||||
|
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Run an arbitrary command upon notifier events.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#ifdef HAVE_STDARG_H
|
||||||
|
#include <stdarg.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_UNISTD_H
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_SIGNAL_H
|
||||||
|
#include <signal.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
|
#include "opal/util/output.h"
|
||||||
|
#include "opal/util/argv.h"
|
||||||
|
#include "opal/util/error.h"
|
||||||
|
|
||||||
|
#include "orte/constants.h"
|
||||||
|
#include "orte/mca/ess/ess.h"
|
||||||
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
#include "orte/mca/notifier/base/base.h"
|
||||||
|
|
||||||
|
#include "notifier_command.h"
|
||||||
|
|
||||||
|
|
||||||
|
static void command_log(int severity, int errcode, const char *msg, ...);
|
||||||
|
static void command_help(int severity, int errcode, const char *filename,
|
||||||
|
const char *topic, ...);
|
||||||
|
static void command_peer(int severity, int errcode,
|
||||||
|
orte_process_name_t *peer_proc,
|
||||||
|
const char *msg, ...);
|
||||||
|
|
||||||
|
/* Module: the function table for the "command" notifier.  Its address
   is what the component stores into its *module output argument.  The
   first two slots are left NULL (presumably optional init/finalize
   hooks -- confirm against the orte_notifier_base_module_t
   declaration); the remaining three slots are the log / help / peer
   entry points implemented later in this file. */
|
||||||
|
orte_notifier_base_module_t orte_notifier_command_module = {
|
||||||
|
NULL,
|
||||||
|
NULL,
|
||||||
|
command_log,
|
||||||
|
command_help,
|
||||||
|
command_peer
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Back-end function to actually tell the child to fork the command
|
||||||
|
*/
|
||||||
|
static int send_command(int severity, int errcode, char *msg)
|
||||||
|
{
|
||||||
|
/* csel = Command, Severity, Errcode, string Length */
|
||||||
|
int rc, csel[4];
|
||||||
|
csel[0] = CMD_EXEC;
|
||||||
|
csel[1] = severity;
|
||||||
|
csel[2] = errcode;
|
||||||
|
csel[3] = strlen(msg);
|
||||||
|
|
||||||
|
/* Write the severity, errcode, and string length */
|
||||||
|
if (ORTE_SUCCESS !=
|
||||||
|
(rc = orte_notifier_command_write_fd(mca_notifier_command_component.to_child[1],
|
||||||
|
sizeof(csel), csel))) {
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Now write the message itself */
|
||||||
|
if (ORTE_SUCCESS !=
|
||||||
|
(rc = orte_notifier_command_write_fd(mca_notifier_command_component.to_child[1],
|
||||||
|
csel[3] + 1, msg))) {
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Now read back the grandchild's exit status from the child:
|
||||||
|
0 = 0/1 indicating whether the grandchild exited or not
|
||||||
|
1 = 0/1 indicating whether the grandchild timed out/was killed or not
|
||||||
|
2 = exit status returned by waitpid() (only relevant if exited==1) */
|
||||||
|
if (ORTE_SUCCESS !=
|
||||||
|
(rc = orte_notifier_command_read_fd(mca_notifier_command_component.to_parent[0],
|
||||||
|
sizeof(int) * 2, csel))) {
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
/* Did the grandchild exit? */
|
||||||
|
if (0 == csel[0]) {
|
||||||
|
orte_show_help("help-orte-notifier-command.txt",
|
||||||
|
"grandchild did not exit", true,
|
||||||
|
orte_process_info.nodename,
|
||||||
|
mca_notifier_command_component.cmd,
|
||||||
|
mca_notifier_command_component.timeout);
|
||||||
|
return ORTE_ERROR;
|
||||||
|
}
|
||||||
|
/* Did the grandchild timeout? */
|
||||||
|
if (1 == csel[1]) {
|
||||||
|
orte_show_help("help-orte-notifier-command.txt",
|
||||||
|
"grandchild timeout", true,
|
||||||
|
orte_process_info.nodename,
|
||||||
|
mca_notifier_command_component.cmd,
|
||||||
|
mca_notifier_command_component.timeout,
|
||||||
|
WIFEXITED(csel[0]) ? "Exit status" : "Signal",
|
||||||
|
WIFEXITED(csel[0]) ? WEXITSTATUS(csel[0]) : WTERMSIG(csel[0]));
|
||||||
|
return ORTE_ERR_TIMEOUT;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The grandchild exited in less than the timeout -- yay. Did it
|
||||||
|
exit cleanly? */
|
||||||
|
if (WIFEXITED(csel[1]) && 0 == WEXITSTATUS(csel[1])) {
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Nope -- didn't exit cleanly, so print a warning. */
|
||||||
|
orte_show_help("help-orte-notifier-command.txt",
|
||||||
|
"grandchild fail", true, orte_process_info.nodename,
|
||||||
|
mca_notifier_command_component.cmd,
|
||||||
|
WIFEXITED(csel[0]) ? "Exit status" : "Signal",
|
||||||
|
WIFEXITED(csel[0]) ? WEXITSTATUS(csel[0]) : WTERMSIG(csel[0]));
|
||||||
|
return ORTE_ERROR;
|
||||||
|
|
||||||
|
error:
|
||||||
|
orte_show_help("help-orte-notifier-command.txt",
|
||||||
|
"system call fail", true, orte_process_info.nodename,
|
||||||
|
"write", opal_strerror(rc), rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Format a printf-style message and hand it to the child so that the
 * notifier command is run for it.
 *
 * severity: integer severity forwarded to send_command()
 * errcode:  integer error code forwarded to send_command()
 * msg:      printf-style format string, followed by its arguments
 *
 * Nothing is sent if msg is NULL or the message cannot be formatted.
 */
static void command_log(int severity, int errcode, const char *msg, ...)
{
    char *output = NULL;
    va_list arglist;
    int rc;

    /* If there was no message, there is nothing to output */
    if (NULL == msg) {
        return;
    }

    va_start(arglist, msg);
    rc = vasprintf(&output, msg, arglist);
    va_end(arglist);

    /* On failure vasprintf() returns -1 and the contents of output
       are not guaranteed, so the return value is what must be
       checked -- not just the pointer */
    if (rc < 0 || NULL == output) {
        return;
    }

    send_command(severity, errcode, output);
    free(output);
}
|
||||||
|
|
||||||
|
/*
 * Build a message from a help file topic and hand it to the child so
 * that the notifier command is run for it.
 *
 * severity: integer severity forwarded to send_command()
 * errcode:  integer error code forwarded to send_command()
 * filename: help file passed to opal_show_help_vstring()
 * topic:    topic within that file, followed by the arguments that
 *           are passed on to opal_show_help_vstring()
 *
 * Nothing is sent if opal_show_help_vstring() returns NULL.
 */
static void command_help(int severity, int errcode, const char *filename,
                         const char *topic, ...)
{
    char *rendered;
    va_list ap;

    va_start(ap, topic);
    rendered = opal_show_help_vstring(filename, topic, false, ap);
    va_end(ap);

    if (NULL == rendered) {
        return;
    }

    send_command(severity, errcode, rendered);
    /* The string returned by opal_show_help_vstring() is ours to
       release */
    free(rendered);
}
|
||||||
|
|
||||||
|
static void command_peer(int severity, int errcode,
|
||||||
|
orte_process_name_t *peer_proc, const char *msg, ...)
|
||||||
|
{
|
||||||
|
va_list arglist;
|
||||||
|
char buf[ORTE_NOTIFIER_MAX_BUF + 1];
|
||||||
|
char *peer_host = NULL, *peer_name = NULL;
|
||||||
|
char *pos = buf;
|
||||||
|
char *errstr = (char*)orte_err2str(errcode);
|
||||||
|
int len, space = ORTE_NOTIFIER_MAX_BUF;
|
||||||
|
|
||||||
|
if (peer_proc) {
|
||||||
|
peer_host = orte_ess.proc_get_hostname(peer_proc);
|
||||||
|
peer_name = ORTE_NAME_PRINT(peer_proc);
|
||||||
|
}
|
||||||
|
|
||||||
|
len = snprintf(pos, space,
|
||||||
|
"While communicating to proc %s on node %s,"
|
||||||
|
" proc %s on node %s encountered an error ",
|
||||||
|
peer_name ? peer_name : "UNKNOWN",
|
||||||
|
peer_host ? peer_host : "UNKNOWN",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
orte_process_info.nodename);
|
||||||
|
space -= len;
|
||||||
|
pos += len;
|
||||||
|
|
||||||
|
if (0 < space) {
|
||||||
|
if (errstr) {
|
||||||
|
len = snprintf(pos, space, "'%s':", errstr);
|
||||||
|
} else {
|
||||||
|
len = snprintf(pos, space, "(%d):", errcode);
|
||||||
|
}
|
||||||
|
space -= len;
|
||||||
|
pos += len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (0 < space) {
|
||||||
|
va_start(arglist, msg);
|
||||||
|
vsnprintf(pos, space, msg, arglist);
|
||||||
|
va_end(arglist);
|
||||||
|
}
|
||||||
|
|
||||||
|
buf[ORTE_NOTIFIER_MAX_BUF] = '\0';
|
||||||
|
send_command(severity, errcode, buf);
|
||||||
|
}
|
Загрузка…
x
Ссылка в новой задаче
Block a user