1
1

Add a new posix_spawn component to the ODLS framework.

Only selectable when specifically requested via "-mca odls pspawn"

Note that there are several concerns:
  * we aren't getting SIGCHLD calls when the procs terminate
  * we aren't seeing the IO pipes close on termination, though
    we are getting output forwarded to mpirun
  * I haven't found a way to bind the child process prior to exec.
    If we want to use this method, we probably need someone to
    implement a cgroup component for the orte/rtc framework

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-11-30 07:18:46 -08:00
родитель 0fcc996c41
Коммит b5bf0a7f1d
11 изменённых файлов: 888 добавлений и 2 удалений

1
.gitignore поставляемый
Просмотреть файл

@ -474,6 +474,7 @@ orte/test/system/orte_sensor
orte/test/system/event-threads orte/test/system/event-threads
orte/test/system/test-time orte/test/system/test-time
orte/test/system/psm_keygen orte/test/system/psm_keygen
orte/test/system/pspawn
orte/test/system/regex orte/test/system/regex
orte/test/system/orte_errors orte/test/system/orte_errors
orte/test/system/evthread-test orte/test/system/evthread-test

Просмотреть файл

@ -12,6 +12,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -85,7 +86,7 @@ int orte_odls_default_component_query(mca_base_module_t **module, int *priority)
* if we do. Hence, we only get here if we CAN build - in which * if we do. Hence, we only get here if we CAN build - in which
* case, we definitely should be considered for selection * case, we definitely should be considered for selection
*/ */
*priority = 1; /* let others override us - we are the default */ *priority = 10; /* let others override us - we are the default */
*module = (mca_base_module_t *) &orte_odls_default_module; *module = (mca_base_module_t *) &orte_odls_default_module;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

49
orte/mca/odls/pspawn/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,49 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help-message catalogue for this component; distributed with the tarball.
dist_ortedata_DATA = help-orte-odls-pspawn.txt
# Source files shared by the DSO and the static flavour of the component.
sources = \
odls_pspawn.h \
odls_pspawn_component.c \
odls_pspawn.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on the MCA_BUILD_orte_odls_pspawn_DSO conditional.
if MCA_BUILD_orte_odls_pspawn_DSO
component_noinst =
component_install = mca_odls_pspawn.la
else
component_noinst = libmca_odls_pspawn.la
component_install =
endif
# DSO flavour: installed into $(ortelibdir) and linked against the ORTE library.
mcacomponentdir = $(ortelibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_odls_pspawn_la_SOURCES = $(sources)
mca_odls_pspawn_la_LDFLAGS = -module -avoid-version
mca_odls_pspawn_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la
# Static flavour: built as a non-installed libtool library.
noinst_LTLIBRARIES = $(component_noinst)
libmca_odls_pspawn_la_SOURCES =$(sources)
libmca_odls_pspawn_la_LDFLAGS = -module -avoid-version

33
orte/mca/odls/pspawn/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,33 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2011 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_odls_pspawn_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Registers the component Makefile, then runs action-if-found when the
# posix_spawn function is available and action-if-not-found otherwise.
AC_DEFUN([MCA_orte_odls_pspawn_CONFIG],[
AC_CONFIG_FILES([orte/mca/odls/pspawn/Makefile])
AC_CHECK_FUNC([posix_spawn], [odls_pspawn_happy="yes"], [odls_pspawn_happy="no"])
AS_IF([test "$odls_pspawn_happy" = "yes"], [$1], [$2])
])dnl

Просмотреть файл

@ -0,0 +1,140 @@
# -*- text -*-
#
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is a US/English help file.
#
[execve error]
Open MPI tried to fork a new process via the "execve" system call but
failed. Open MPI checks many things before attempting to launch a
child process, but nothing is perfect. This error may be indicative
of another problem on the target host, or even something as silly as
having specified a directory for your application. Your job will now
abort.
Local host: %s
Working dir: %s
Application name: %s
Error: %s
#
[binding not supported]
Open MPI tried to bind a new process, but process binding is not
supported on the host where it was launched. The process was killed
without launching the target application. Your job will now abort.
Local host: %s
Application name: %s
#
[binding generic error]
Open MPI tried to bind a new process, but something went wrong. The
process was killed without launching the target application. Your job
will now abort.
Local host: %s
Application name: %s
Error message: %s
Location: %s:%d
#
[bound to everything]
Open MPI tried to bind a new process to a specific set of processors,
but ended up binding it to *all* processors. This means that the new
process is effectively unbound.
This is only a warning -- your job will continue. You can suppress
this warning in the future by setting the odls_warn_if_not_bound MCA
parameter to 0.
Local host: %s
Application name: %s
Location: %s:%d
#
[slot list and paffinity_alone]
Open MPI detected that both a slot list was specified and the MCA
parameter "paffinity_alone" was set to true. Only one of these can be
used at a time. Your job will now abort.
Local host: %s
Application name: %s
#
[iof setup failed]
Open MPI tried to launch a child process but the "IOF child setup"
failed. This should not happen. Your job will now abort.
Local host: %s
Application name: %s
#
[not bound]
WARNING: Open MPI tried to bind a process but failed. This is a
warning only; your job will continue.
Local host: %s
Application name: %s
Error message: %s
Location: %s:%d
#
[syscall fail]
A system call failed that should not have. In this particular case,
a warning or error message was not displayed that should have been.
Your job may behave unpredictably after this, or abort.
Local host: %s
Application name: %s
Function: %s
Location: %s:%d
#
[memory not bound]
WARNING: Open MPI tried to bind a process but failed. This is a
warning only; your job will continue, though performance may
be degraded.
Local host: %s
Application name: %s
Error message: %s
Location: %s:%d
#
[memory binding error]
Open MPI tried to bind memory for a new process but something went
wrong. The process was killed without launching the target
application. Your job will now abort.
Local host: %s
Application name: %s
Error message: %s
Location: %s:%d
#
[set limit]
Error message received from:
Local host: %s
Application name: %s
Location: %s:%d
Message:
%s
#
[incorrectly-bound]
WARNING: Open MPI incorrectly bound a process to the daemon's cores.
This is a warning only; your job will continue.
Local host: %s
Application name: %s
Location: %s:%d

465
orte/mca/odls/pspawn/odls_pspawn.c Обычный файл
Просмотреть файл

@ -0,0 +1,465 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
* Copyright (c) 2008-2017 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Rutgers, The State University of New Jersey.
* All rights reserved.
* Copyright (c) 2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
* There is a complicated sequence of events that occurs when the
* parent forks a child process that is intended to launch the target
* executable.
*
* Before the child process exec's the target executable, it might try
* to set the affinity of that new child process according to a
* complex series of rules. This binding may fail in a myriad of
* different ways. A lot of this code deals with reporting that error
* accurately to the end user. This is a complex task in itself
* because the child process is not "really" an ORTE process -- all
* error reporting must be proxied up to the parent who can use normal
* ORTE error reporting mechanisms.
*
* Here's a high-level description of what is occurring in this file:
*
* - parent opens a pipe
* - parent forks a child
* - parent blocks reading on the pipe: the pipe will either close
* (indicating that the child successfully exec'ed) or the child will
* write some proxied error data up the pipe
*
* - the child tries to set affinity and do other housekeeping in
* preparation of exec'ing the target executable
* - if the child fails anywhere along the way, it sends a message up
* the pipe to the parent indicating what happened -- including a
* rendered error message detailing the problem (i.e., human-readable).
* - it is important that the child renders the error message: there
* are so many errors that are possible that the child is really the
* only entity that has enough information to make an accurate error string
* to report back to the user.
* - the parent reads this message + rendered string in and uses ORTE
* reporting mechanisms to display it to the user
* - if the problem was only a warning, the child continues processing
* (potentially eventually exec'ing the target executable).
* - if the problem was an error, the child exits and the parent
* handles the death of the child as appropriate (i.e., this ODLS
* simply reports the error -- other things decide what to do).
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <string.h>
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <errno.h>
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#include <signal.h>
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#include <stdlib.h>
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif /* HAVE_SYS_STAT_H */
#include <stdarg.h>
#ifdef HAVE_SYS_SELECT_H
#include <sys/select.h>
#endif
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif
#include <ctype.h>
#ifdef HAVE_UTIL_H
#include <util.h>
#endif
#ifdef HAVE_PTY_H
#include <pty.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_TERMIOS_H
#include <termios.h>
# ifdef HAVE_TERMIO_H
# include <termio.h>
# endif
#endif
#ifdef HAVE_LIBUTIL_H
#include <libutil.h>
#endif
#include <spawn.h>
#include "opal/mca/hwloc/hwloc-internal.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/util/opal_environ.h"
#include "opal/util/show_help.h"
#include "opal/util/sys_limits.h"
#include "opal/util/fd.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/iof/base/iof_base_setup.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/rtc/rtc.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/odls/pspawn/odls_pspawn.h"
#include "orte/orted/pmix/pmix_server.h"
/*
 * Module functions (function pointers used in a struct).
 * They are forward-declared here because the module table below
 * refers to them before their definitions appear in this file.
 */
static int orte_odls_pspawn_launch_local_procs(opal_buffer_t *data);
static int orte_odls_pspawn_kill_local_procs(opal_pointer_array_t *procs);
static int orte_odls_pspawn_signal_local_procs(const orte_process_name_t *proc, int32_t signal);
static int orte_odls_pspawn_restart_proc(orte_proc_t *child);
/*
 * Module: the function table published by this component.
 * get_add_procs_data reuses the ODLS base default; every other entry
 * points at a pspawn-specific function defined in this file.
 */
orte_odls_base_module_t orte_odls_pspawn_module = {
.get_add_procs_data = orte_odls_base_default_get_add_procs_data,
.launch_local_procs = orte_odls_pspawn_launch_local_procs,
.kill_local_procs = orte_odls_pspawn_kill_local_procs,
.signal_local_procs = orte_odls_pspawn_signal_local_procs,
.restart_proc = orte_odls_pspawn_restart_proc
};
/**
 * Deliver a signal to a specified pid.
 *
 * When process groups are available the signal is sent to the whole
 * process group of the target so that any processes it started see the
 * signal as well. The group is only targeted when it differs from our
 * own: if the target still shares the caller's process group, signalling
 * that group would also hit the caller, so only the pid is signalled.
 *
 * @param pid     process to signal
 * @param signum  signal number to deliver
 * @return 0 on success or when the target no longer exists (ESRCH);
 *         otherwise the errno value reported by kill()
 */
static int odls_pspawn_kill_local(pid_t pid, int signum)
{
#if HAVE_SETPGID
    pid_t pgrp = getpgid(pid);

    if (-1 != pgrp && getpgrp() != pgrp) {
        /* target the lead process of the process
         * group so we ensure that the signal is
         * seen by all members of that group. This
         * ensures that the signal is seen by any
         * child processes our child may have
         * started
         */
        pid = -pgrp;
    }
#endif

    if (0 != kill(pid, signum)) {
        /* capture errno before logging, which may overwrite it */
        int err = errno;

        if (ESRCH != err) {
            OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
                                 "%s odls:pspawn:SENT KILL %d TO PID %d GOT ERRNO %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid, err));
            return err;
        }
    }

    OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
                         "%s odls:pspawn:SENT KILL %d TO PID %d SUCCESS",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid));
    return 0;
}
/**
 * Kill the given local processes.
 *
 * Delegates to the ODLS base implementation, handing it
 * odls_pspawn_kill_local as the function used to deliver signals.
 * Declared static to match the prototype at the top of this file.
 *
 * @param procs  array of processes to kill, passed through to the base
 * @return ORTE_SUCCESS, or the (logged) error code from the base function
 */
static int orte_odls_pspawn_kill_local_procs(opal_pointer_array_t *procs)
{
    int rc = orte_odls_base_default_kill_local_procs(procs, odls_pspawn_kill_local);

    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    return ORTE_SUCCESS;
}
/* close all open file descriptors w/ exception of stdin/stdout/stderr
and the pipe up to the parent. */
static int close_open_file_descriptors(posix_spawn_file_actions_t *factions)
{
DIR *dir = opendir("/proc/self/fd");
if (NULL == dir) {
return ORTE_ERR_FILE_OPEN_FAILURE;
}
struct dirent *files;
while (NULL != (files = readdir(dir))) {
if (!isdigit(files->d_name[0])) {
continue;
}
int fd = strtol(files->d_name, NULL, 10);
if (errno == EINVAL || errno == ERANGE) {
closedir(dir);
return ORTE_ERR_TYPE_MISMATCH;
}
if (fd >=3) {
posix_spawn_file_actions_addclose(factions, fd);
}
}
closedir(dir);
return ORTE_SUCCESS;
}
/**
* posix_spawn the specified processes
*/
static int odls_pspawn_fork_local_proc(void *cdptr)
{
orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
pid_t pid;
orte_proc_t *child = cd->child;
posix_spawn_file_actions_t factions;
posix_spawnattr_t attrs;
sigset_t sigs;
int rc;
orte_iof_base_io_conf_t *opts = &cd->opts;
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
/* setup the attrs object */
rc = posix_spawnattr_init(&attrs);
if (0 != rc) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = 1;
return ORTE_ERROR;
}
/* set the signal mask in the child process */
sigprocmask(0, 0, &sigs);
sigprocmask(SIG_UNBLOCK, &sigs, 0);
posix_spawnattr_setsigmask(&attrs, &sigs);
/* setup to close all fd's other than stdin/out/err */
rc = posix_spawn_file_actions_init(&factions);
if (0 != rc) {
posix_spawnattr_destroy(&attrs);
child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = 1;
return ORTE_ERROR;
}
if (ORTE_SUCCESS != close_open_file_descriptors(&factions)) {
posix_spawn_file_actions_destroy(&factions);
posix_spawnattr_destroy(&attrs);
child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = 1;
return ORTE_ERROR;
}
/* close the parent end of the pipes in the child */
if (opts->connect_stdin) {
posix_spawn_file_actions_addclose(&factions, opts->p_stdin[1]);
}
posix_spawn_file_actions_addclose(&factions, opts->p_stdout[0]);
if( !orte_iof_base.redirect_app_stderr_to_stdout ) {
posix_spawn_file_actions_addclose(&factions, opts->p_stderr[0]);
}
/* dup the stdin/stdout/stderr descriptors */
if (opts->usepty) {
/* disable echo */
struct termios term_attrs;
if (tcgetattr(opts->p_stdout[1], &term_attrs) < 0) {
return ORTE_ERR_PIPE_SETUP_FAILURE;
}
term_attrs.c_lflag &= ~ (ECHO | ECHOE | ECHOK |
ECHOCTL | ECHOKE | ECHONL);
term_attrs.c_iflag &= ~ (ICRNL | INLCR | ISTRIP | INPCK | IXON);
term_attrs.c_oflag &= ~ (
#ifdef OCRNL
/* OS X 10.3 does not have this
value defined */
OCRNL |
#endif
ONLCR);
if (tcsetattr(opts->p_stdout[1], TCSANOW, &term_attrs) == -1) {
return ORTE_ERR_PIPE_SETUP_FAILURE;
}
posix_spawn_file_actions_adddup2(&factions, fileno(stdout), opts->p_stdout[1]);
if (orte_iof_base.redirect_app_stderr_to_stdout) {
posix_spawn_file_actions_adddup2(&factions, fileno(stderr), opts->p_stdout[1]);
}
} else {
if (opts->p_stdout[1] != fileno(stdout)) {
posix_spawn_file_actions_adddup2(&factions, fileno(stdout), opts->p_stdout[1]);
}
if (orte_iof_base.redirect_app_stderr_to_stdout) {
posix_spawn_file_actions_adddup2(&factions, fileno(stderr), opts->p_stdout[1]);
}
}
if (opts->connect_stdin) {
if (opts->p_stdin[0] != fileno(stdin)) {
posix_spawn_file_actions_adddup2(&factions, fileno(stdin), opts->p_stdin[0]);
}
}
if (opts->p_stderr[1] != fileno(stderr) && !orte_iof_base.redirect_app_stderr_to_stdout) {
posix_spawn_file_actions_adddup2(&factions, fileno(stderr), opts->p_stderr[1]);
}
/* Fork off the child */
rc = posix_spawn(&pid, cd->app->app, &factions, &attrs, cd->argv, cd->env);
posix_spawn_file_actions_destroy(&factions);
posix_spawnattr_destroy(&attrs);
/* as the parent, close the other ends of the pipes */
if (cd->opts.connect_stdin) {
close(cd->opts.p_stdin[0]);
}
close(cd->opts.p_stdout[1]);
if( !orte_iof_base.redirect_app_stderr_to_stdout ) {
close(cd->opts.p_stderr[1]);
}
if (rc < 0) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
return ORTE_ERR_SYS_LIMITS_CHILDREN;
}
cd->child->state = ORTE_PROC_STATE_RUNNING;
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
return ORTE_SUCCESS;
}
/**
 * Launch all processes allocated to the current node.
 *
 * Builds the list of local children from the launch message and then
 * activates the local launch for that job, with
 * odls_pspawn_fork_local_proc as the function that starts each process.
 * Declared static to match the prototype at the top of this file.
 *
 * @param data  launch message to unpack
 * @return ORTE_SUCCESS once the launch has been activated, or the error
 *         code from constructing the child list
 */
static int orte_odls_pspawn_launch_local_procs(opal_buffer_t *data)
{
    orte_jobid_t job;
    int rc;

    /* construct the list of children we are to launch */
    rc = orte_odls_base_default_construct_child_list(data, &job);
    if (ORTE_SUCCESS != rc) {
        OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
                             "%s odls:pspawn:launch:local failed to construct child list on error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
        return rc;
    }

    /* launch the local procs */
    ORTE_ACTIVATE_LOCAL_LAUNCH(job, odls_pspawn_fork_local_proc);
    return ORTE_SUCCESS;
}
/**
 * Deliver a signal to a process, or to its process group.
 *
 * Nothing is printed on failure: the error is mapped to an ORTE code
 * and the caller decides how to report it.
 *
 * @param pd      pid of the target process
 * @param signal  signal number to send
 * @return ORTE_SUCCESS if the signal was sent or the target is already
 *         gone; ORTE_ERR_BAD_PARAM for EINVAL; ORTE_ERR_PERM for EPERM;
 *         ORTE_ERROR for anything else
 */
static int send_signal(pid_t pd, int signal)
{
    pid_t target = pd;
    int err;

#if HAVE_SETPGID
    if (!orte_odls_globals.signal_direct_children_only) {
        /* address the process group so that descendants of our
         * child receive the signal too */
        target = -pd;
    }
#endif

    OPAL_OUTPUT_VERBOSE((1, orte_odls_base_framework.framework_output,
                         "%s sending signal %d to pid %ld",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         signal, (long)target));

    if (0 == kill(target, signal)) {
        return ORTE_SUCCESS;
    }

    err = errno;
    if (EINVAL == err) {
        return ORTE_ERR_BAD_PARAM;
    }
    if (ESRCH == err) {
        /* The target no longer exists, which can happen when a signal
         * is delivered while the job is shutting down. This is not a
         * real problem, so it is not reported as an error. */
        return ORTE_SUCCESS;
    }
    if (EPERM == err) {
        return ORTE_ERR_PERM;
    }
    return ORTE_ERROR;
}
/**
 * Signal one or more local processes.
 *
 * Hands the request to the ODLS base implementation together with
 * send_signal as the delivery function.
 *
 * @param proc    name of the target, passed through to the base
 * @param signal  signal number to deliver
 * @return ORTE_SUCCESS, or the (logged) error code from the base function
 */
static int orte_odls_pspawn_signal_local_procs(const orte_process_name_t *proc, int32_t signal)
{
    int rc = orte_odls_base_default_signal_local_procs(proc, signal, send_signal);

    if (ORTE_SUCCESS == rc) {
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(rc);
    return rc;
}
/**
 * Restart a local process.
 *
 * Delegates to the ODLS base implementation, which is given
 * odls_pspawn_fork_local_proc to start the process again.
 *
 * @param child  the process to restart
 * @return the result of the base restart function; a failure is also
 *         reported through verbose output
 */
static int orte_odls_pspawn_restart_proc(orte_proc_t *child)
{
    /* restart the local proc */
    int rc = orte_odls_base_default_restart_proc(child, odls_pspawn_fork_local_proc);

    if (ORTE_SUCCESS != rc) {
        OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
                             "%s odls:pspawn:restart_proc failed to launch on error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
    }

    return rc;
}

42
orte/mca/odls/pspawn/odls_pspawn.h Обычный файл
Просмотреть файл

@ -0,0 +1,42 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
 * @file
 *
 * Public declarations of the "pspawn" ODLS component: its module
 * (function table) and its component descriptor.
 */
#ifndef ORTE_ODLS_PSPAWN_H
#define ORTE_ODLS_PSPAWN_H
#include "orte_config.h"
#include "orte/mca/mca.h"
#include "orte/mca/odls/odls.h"
BEGIN_C_DECLS
/*
 * ODLS Pspawn module and component. Both objects are only declared
 * here; their definitions live in the component's .c files.
 */
extern orte_odls_base_module_t orte_odls_pspawn_module;
ORTE_MODULE_DECLSPEC extern orte_odls_base_component_t mca_odls_pspawn_component;
END_C_DECLS
#endif /* ORTE_ODLS_PSPAWN_H */

103
orte/mca/odls/pspawn/odls_pspawn_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,103 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <ctype.h>
#include "orte/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/odls/pspawn/odls_pspawn.h"
/*
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it.
 * The three callbacks are forward-declared because the struct refers
 * to them before they are defined below.
 */
static int component_open(void);
static int component_close(void);
static int component_query(mca_base_module_t **module, int *priority);
orte_odls_base_component_t mca_odls_pspawn_component = {
/* First, the mca_component_t struct containing meta information
about the component itself */
.version = {
ORTE_ODLS_BASE_VERSION_2_0_0,
/* Component name and version */
.mca_component_name = "pspawn",
MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION),
/* Component open, close and query functions */
.mca_open_component = component_open,
.mca_close_component = component_close,
.mca_query_component = component_query,
},
.base_data = {
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
};
/* Component open callback: there is nothing to set up, so it always
 * returns ORTE_SUCCESS. */
static int component_open(void)
{
return ORTE_SUCCESS;
}
/**
 * Component query callback: offer the pspawn module for selection.
 *
 * @param module    receives a pointer to orte_odls_pspawn_module
 * @param priority  receives this component's selection priority (1)
 * @return ORTE_SUCCESS always
 */
static int component_query(mca_base_module_t **module, int *priority)
{
    /* No daemon check is needed here: the base open/select logic
     * already protects us against operation when we are NOT in a
     * daemon.
     *
     * Likewise, configure.m4 only builds this component when
     * "posix_spawn" support exists, so reaching this point means we
     * CAN run. We nevertheless only want to be chosen when we are
     * specifically requested, hence the low priority that lets
     * others override us.
     */
    *module = (mca_base_module_t *) &orte_odls_pspawn_module;
    *priority = 1;

    return ORTE_SUCCESS;
}
/* Component close callback: there is nothing to tear down, so it
 * always returns ORTE_SUCCESS. */
static int component_close(void)
{
return ORTE_SUCCESS;
}

7
orte/mca/odls/pspawn/owner.txt Обычный файл
Просмотреть файл

@ -0,0 +1,7 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: INTEL
status: maintenance

Просмотреть файл

@ -1,7 +1,7 @@
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits \ PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits \
orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix opal_interface orte_spin segfault \ orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix opal_interface orte_spin segfault \
orte_exit test-time event-threads psm_keygen regex orte_errors evpri-test opal-evpri-test evpri-test2 \ orte_exit test-time event-threads psm_keygen regex orte_errors evpri-test opal-evpri-test evpri-test2 \
mapper reducer opal_hotel orte_dfs ulfm pmixtool threads mapper reducer opal_hotel orte_dfs ulfm pmixtool threads pspawn
all: $(PROGS) all: $(PROGS)

45
orte/test/system/pspawn.c Обычный файл
Просмотреть файл

@ -0,0 +1,45 @@
/* -*- C -*-
*
* $HEADER$
*
* Minimal posix_spawn test: spawns "hostname" as a child process
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <spawn.h>
#include "opal/util/argv.h"
/**
 * Spawn /usr/bin/hostname through posix_spawn() with stdin closed in
 * the child.
 *
 * @return 0 if the child was spawned; exits with 1 if the spawn objects
 *         cannot be initialized, returns 1 if the spawn itself fails
 */
int main(int argc, char* argv[])
{
    /* POSIX requires programs to declare environ themselves */
    extern char **environ;
    int rc;
    int status = 0;
    char **pargv = NULL;
    pid_t pid;
    posix_spawn_file_actions_t factions;
    posix_spawnattr_t attrs;

    /* The posix_spawn family returns the error number directly and is
     * not specified to set errno, so report the return value. */
    rc = posix_spawnattr_init(&attrs);
    if (0 != rc) {
        fprintf(stderr, "ERROR INIT ATTRS: %d\n", rc);
        exit(1);
    }
    rc = posix_spawn_file_actions_init(&factions);
    if (0 != rc) {
        fprintf(stderr, "ERROR INIT FACTIONS: %d\n", rc);
        posix_spawnattr_destroy(&attrs);
        exit(1);
    }
    posix_spawn_file_actions_addclose(&factions, fileno(stdin));

    opal_argv_append_nosize(&pargv, "hostname");

    /* Hand over the file actions and attributes prepared above, and
     * pass our own environment rather than a NULL envp, which is not
     * portable. */
    rc = posix_spawn(&pid, "/usr/bin/hostname", &factions, &attrs, pargv, environ);
    if (0 != rc) {
        fprintf(stderr, "ERROR SPAWN: %d\n", rc);
        status = 1;
    }

    posix_spawn_file_actions_destroy(&factions);
    posix_spawnattr_destroy(&attrs);
    opal_argv_free(pargv);

    if (0 == status) {
        /* give the child a moment to print its output */
        sleep(1);
    }
    return status;
}