Remove two stale modules
This commit was SVN r24794.
Этот коммит содержится в:
родитель
b95ede99d5
Коммит
9491fbb60c
@ -1,46 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Help text shipped together with the component.
dist_pkgdata_DATA = help-plm-rshd.txt

# Files that make up the rshd plm component.
sources = \
        plm_rshd.h \
        plm_rshd_component.c \
        plm_rshd_module.c

# The library is produced in this directory under one of two names:
# mca_<type>_<name>.la for DSO builds, libmca_<type>_<name>.la for
# static builds.  Exactly one of the two variables below is non-empty.

if MCA_BUILD_orte_plm_rshd_DSO
component_install = mca_plm_rshd.la
component_noinst =
else
component_install =
component_noinst = libmca_plm_rshd.la
endif

# DSO flavor, placed in $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_plm_rshd_la_SOURCES = $(sources)
mca_plm_rshd_la_LDFLAGS = -module -avoid-version

# Static flavor, listed under noinst_.
noinst_LTLIBRARIES = $(component_noinst)
libmca_plm_rshd_la_SOURCES = $(sources)
libmca_plm_rshd_la_LDFLAGS = -module -avoid-version
|
@ -1,27 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
dnl Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_orte_plm_rshd_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_plm_rshd_CONFIG],[
    # Have configure generate the Makefile for this component.
    AC_CONFIG_FILES([orte/mca/plm/rshd/Makefile])

    # The check is simply whether fork exists: the first macro argument
    # runs when it is found and the second one when it is not.
    AC_CHECK_FUNC([fork], [$1], [$2])
])dnl
|
@ -1,77 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English help file for Open RTE's rshd PLM launcher.
|
||||
#
|
||||
[no-local-orted]
|
||||
The rshd PLM component was not able to find the executable "orted" in
|
||||
your PATH or in the directory where Open MPI/OpenRTE was initially installed,
|
||||
and therefore cannot continue.
|
||||
|
||||
For reference, your current PATH is:
|
||||
|
||||
%s
|
||||
|
||||
We also looked for orted in the following directory:
|
||||
|
||||
%s
|
||||
|
||||
[multiple-prefixes]
|
||||
Specified multiple application contexts using different
|
||||
settings for --prefix. Care should be taken that corresponding
|
||||
processes are mapped to different nodes. Having multiple prefixes
|
||||
per node is not allowed.
|
||||
|
||||
The previously set prefix was
|
||||
%s
|
||||
|
||||
the prefix to be set overriding:
|
||||
%s
|
||||
|
||||
[concurrency-less-than-zero]
|
||||
The value of the MCA parameter "pls_rsh_num_concurrent" is less than
|
||||
or equal to zero (%d). This parameter is used to determine how many
|
||||
remote agents (typically rsh or ssh) to invoke concurrently while
|
||||
launching parallel jobs.
|
||||
|
||||
This value has automatically been reset to 1; processing will continue.
|
||||
|
||||
[deadlock-params]
|
||||
The rsh launcher has been given a number of %d concurrent daemons to
|
||||
launch and is in a debug-daemons option. However, the total number of
|
||||
daemons to launch (%d) is greater than this value. This is a scenario that
|
||||
will cause the system to deadlock.
|
||||
|
||||
To avoid deadlock, either increase the number of concurrent daemons, or
|
||||
remove the debug-daemons flag.
|
||||
|
||||
[unknown-user]
|
||||
The user (%d) is unknown to the system (i.e. there is no corresponding
|
||||
entry in the password file). Please contact your system administrator
|
||||
for a fix.
|
||||
#
|
||||
[cannot-resolve-shell-with-prefix]
|
||||
The rsh launcher has been given a prefix to use, but could not determine
|
||||
the type of remote shell being used on the remote node. This is a fatal
|
||||
error as we cannot determine how to construct the cmd line to set your
|
||||
remote LD_LIBRARY_PATH and PATH environmental variables.
|
||||
|
||||
The prefixes we were given are:
|
||||
|
||||
opal_prefix: %s
|
||||
prefix_dir: %s
|
@ -1,80 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file:
|
||||
* Part of the rshd launcher. See plm_rshd.h for an overview of how it works.
|
||||
*/
|
||||
|
||||
#ifndef ORTE_PLM_RSHD_EXPORT_H
|
||||
#define ORTE_PLM_RSHD_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/mca/mca.h"
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_plm_rshd_component_open(void);
|
||||
int orte_plm_rshd_component_close(void);
|
||||
int orte_plm_rshd_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Startup / Shutdown
|
||||
*/
|
||||
int orte_plm_rshd_finalize(void);
|
||||
|
||||
/*
|
||||
* Interface
|
||||
*/
|
||||
int orte_plm_rshd_init(void);
|
||||
int orte_plm_rshd_launch(orte_job_t *jdata);
|
||||
int orte_plm_rshd_terminate_job(orte_jobid_t);
|
||||
int orte_plm_rshd_terminate_orteds(void);
|
||||
int orte_plm_rshd_signal_job(orte_jobid_t, int32_t);
|
||||
|
||||
/**
|
||||
* PLS Component
|
||||
*/
|
||||
struct orte_plm_rshd_component_t {
|
||||
orte_plm_base_component_t super;
|
||||
bool force_rsh;
|
||||
opal_list_t children;
|
||||
int num_children;
|
||||
int num_concurrent;
|
||||
opal_mutex_t lock;
|
||||
opal_condition_t cond;
|
||||
};
|
||||
typedef struct orte_plm_rshd_component_t orte_plm_rshd_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_plm_rshd_component_t mca_plm_rshd_component;
|
||||
extern orte_plm_base_module_t orte_plm_rshd_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* ORTE_PLM_RSHD_EXPORT_H */
|
@ -1,151 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <ctype.h>
|
||||
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/plm/base/plm_base_rsh_support.h"
|
||||
#include "orte/mca/plm/rshd/plm_rshd.h"
|
||||
|
||||
|
||||
/*
|
||||
* Public string showing the plm ompi_rshd component version number
|
||||
*/
|
||||
const char *mca_plm_rshd_component_version_string =
|
||||
"Open MPI rshd plm MCA component version " ORTE_VERSION;
|
||||
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointers to our public functions in it
|
||||
*/
|
||||
|
||||
orte_plm_rshd_component_t mca_plm_rshd_component = {
|
||||
{
|
||||
/* First, the mca_component_t struct containing meta information
|
||||
about the component itself */
|
||||
|
||||
{
|
||||
ORTE_PLM_BASE_VERSION_2_0_0,
|
||||
|
||||
/* Component name and version */
|
||||
"rshd",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
orte_plm_rshd_component_open,
|
||||
orte_plm_rshd_component_close,
|
||||
orte_plm_rshd_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
int orte_plm_rshd_component_open(void)
|
||||
{
|
||||
int tmp;
|
||||
mca_base_component_t *c = &mca_plm_rshd_component.super.base_version;
|
||||
|
||||
/* initialize globals */
|
||||
OBJ_CONSTRUCT(&mca_plm_rshd_component.lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_plm_rshd_component.cond, opal_condition_t);
|
||||
mca_plm_rshd_component.num_children = 0;
|
||||
OBJ_CONSTRUCT(&mca_plm_rshd_component.children, opal_list_t);
|
||||
|
||||
/* lookup parameters */
|
||||
mca_base_param_reg_int(c, "num_concurrent",
|
||||
"How many plm_rsh_agent instances to invoke concurrently (must be > 0)",
|
||||
false, false, 128, &tmp);
|
||||
if (tmp <= 0) {
|
||||
orte_show_help("help-plm-rsh.txt", "concurrency-less-than-zero",
|
||||
true, tmp);
|
||||
tmp = 1;
|
||||
}
|
||||
mca_plm_rshd_component.num_concurrent = tmp;
|
||||
|
||||
mca_base_param_reg_int(c, "force_rsh",
|
||||
"Force the launcher to always use rsh",
|
||||
false, false, false, &tmp);
|
||||
mca_plm_rshd_component.force_rsh = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Decide whether this component can be selected.
 *
 * @param module    set to the rshd module when the launch agent lookup
 *                  succeeds, NULL otherwise
 * @param priority  set to 0 on success; left untouched on failure
 *
 * @return ORTE_SUCCESS when available, ORTE_ERROR when not
 */
int orte_plm_rshd_component_query(mca_base_module_t **module, int *priority)
{
    int lookup_rc = orte_plm_base_rsh_launch_agent_lookup(NULL, NULL);

    if (ORTE_SUCCESS == lookup_rc) {
        /* usable: offer the module, but only at low priority */
        *priority = 0;
        *module = (mca_base_module_t *) &orte_plm_rshd_module;
        return ORTE_SUCCESS;
    }

    /* not an error as such - the component simply cannot be selected */
    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:rshd: unable to be used: cannot find \"%s\" in PATH",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_rsh_agent));
    *module = NULL;
    return ORTE_ERROR;
}
|
||||
|
||||
|
||||
int orte_plm_rshd_component_close(void)
|
||||
{
|
||||
/* cleanup state */
|
||||
OBJ_DESTRUCT(&mca_plm_rshd_component.lock);
|
||||
OBJ_DESTRUCT(&mca_plm_rshd_component.cond);
|
||||
OBJ_DESTRUCT(&mca_plm_rshd_component.children);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,434 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#ifdef HAVE_STRINGS_H
|
||||
#include <strings.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_SELECT_H
|
||||
#include <sys/select.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif
|
||||
#include <fcntl.h>
|
||||
#include <signal.h>
|
||||
#ifdef HAVE_PWD_H
|
||||
#include <pwd.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/installdirs/installdirs.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/bit_ops.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/ess/base/base.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmaps/rmaps.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/plm/base/plm_base_rsh_support.h"
|
||||
#include "orte/mca/plm/rshd/plm_rshd.h"
|
||||
|
||||
static void ssh_child(char *cmd, char **argv) __opal_attribute_noreturn__;
|
||||
|
||||
orte_plm_base_module_t orte_plm_rshd_module = {
|
||||
orte_plm_rshd_init,
|
||||
orte_plm_base_set_hnp_name,
|
||||
orte_plm_rshd_launch,
|
||||
NULL,
|
||||
orte_plm_rshd_terminate_job,
|
||||
orte_plm_rshd_terminate_orteds,
|
||||
NULL,
|
||||
orte_plm_rshd_signal_job,
|
||||
orte_plm_rshd_finalize
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static void set_handler_default(int sig);
|
||||
|
||||
/**
|
||||
* Init the module
|
||||
*/
|
||||
int orte_plm_rshd_init(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* since I was selected, setup the rsh launch agent support */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup(NULL, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Callback on daemon exit.
|
||||
*/
|
||||
|
||||
static void wait_cb(pid_t pid, int status, void* cbdata)
|
||||
{
|
||||
orte_proc_t *proc = (orte_proc_t*)cbdata;
|
||||
orte_job_t *jdata;
|
||||
|
||||
/* get the associated job object */
|
||||
jdata = orte_get_job_data_object(proc->name.jobid);
|
||||
|
||||
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { /* if abnormal exit */
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s proc %d failed with status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int)proc->name.vpid, WEXITSTATUS(status)));
|
||||
}
|
||||
/* note that this daemon failed */
|
||||
orte_errmgr.update_state(proc->name.jobid, ORTE_JOB_STATE_FAILED_TO_START,
|
||||
NULL, ORTE_PROC_STATE_FAILED_TO_START, 0, status);
|
||||
|
||||
/* release any waiting threads */
|
||||
OPAL_THREAD_LOCK(&mca_plm_rshd_component.lock);
|
||||
|
||||
/* decrement our #children */
|
||||
mca_plm_rshd_component.num_children--;
|
||||
|
||||
/* see if we can allow launching to continue */
|
||||
if (mca_plm_rshd_component.num_children <=
|
||||
mca_plm_rshd_component.num_concurrent ||
|
||||
mca_plm_rshd_component.num_children == 0) {
|
||||
opal_condition_signal(&mca_plm_rshd_component.cond);
|
||||
}
|
||||
|
||||
OPAL_THREAD_UNLOCK(&mca_plm_rshd_component.lock);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/* actually ssh the child */
|
||||
static void ssh_child(char *cmd, char **argv)
|
||||
{
|
||||
char** env;
|
||||
char* var;
|
||||
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
||||
int fdin;
|
||||
sigset_t sigs;
|
||||
|
||||
/* setup environment */
|
||||
env = opal_argv_copy(orte_launch_environ);
|
||||
|
||||
/* Don't let ssh slurp all of our stdin! */
|
||||
fdin = open("/dev/null", O_RDWR);
|
||||
dup2(fdin, 0);
|
||||
close(fdin);
|
||||
|
||||
/* close all file descriptors w/ exception of stdin/stdout/stderr */
|
||||
for(fd=3; fd<fdmax; fd++)
|
||||
close(fd);
|
||||
|
||||
/* Set signal handlers back to the default. Do this close
|
||||
to the execve() because the event library may (and likely
|
||||
will) reset them. If we don't do this, the event
|
||||
library may have left some set that, at least on some
|
||||
OS's, don't get reset via fork() or exec(). Hence, the
|
||||
orted could be unkillable (for example). */
|
||||
|
||||
set_handler_default(SIGTERM);
|
||||
set_handler_default(SIGINT);
|
||||
set_handler_default(SIGHUP);
|
||||
set_handler_default(SIGPIPE);
|
||||
set_handler_default(SIGCHLD);
|
||||
|
||||
/* Unblock all signals, for many of the same reasons that
|
||||
we set the default handlers, above. This is noticable
|
||||
on Linux where the event library blocks SIGTERM, but we
|
||||
don't want that blocked by the orted (or, more
|
||||
specifically, we don't want it to be blocked by the
|
||||
orted and then inherited by the ORTE processes that it
|
||||
forks, making them unkillable by SIGTERM). */
|
||||
sigprocmask(0, 0, &sigs);
|
||||
sigprocmask(SIG_UNBLOCK, &sigs, 0);
|
||||
|
||||
/* exec the cmd */
|
||||
var = opal_argv_join(argv, ' ');
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:rshd: executing: (%s) [%s]",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
cmd, (NULL == var) ? "NULL" : var));
|
||||
if (NULL != var) free(var);
|
||||
|
||||
execve(cmd, argv, env);
|
||||
opal_output(0, "plm:rshd: execv of %s failed with errno=%s(%d)\n",
|
||||
cmd, strerror(errno), errno);
|
||||
exit(-1);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Directly launch each specified process.
|
||||
*/
|
||||
|
||||
/* When working in this function, ALWAYS jump to "cleanup" if
|
||||
* you encounter an error so that orterun will be woken up and
|
||||
* the job can cleanly terminate
|
||||
*/
|
||||
int orte_plm_rshd_launch(orte_job_t *jdata)
|
||||
{
|
||||
char **argv = NULL;
|
||||
char *cmd, *param;
|
||||
int rc, i;
|
||||
bool failed_launch = true;
|
||||
orte_app_context_t *app;
|
||||
orte_node_t *node;
|
||||
orte_proc_t *proc;
|
||||
orte_jobid_t failed_job = ORTE_JOBID_INVALID;
|
||||
orte_job_state_t job_state = ORTE_JOB_STATE_NEVER_LAUNCHED;
|
||||
pid_t pid;
|
||||
|
||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||
/* if this is a request to launch a local slave,
|
||||
* then we will not be launching an orted - we will
|
||||
* directly ssh the slave process itself. No mapping
|
||||
* is performed to support this - the caller must
|
||||
* provide all the info required to launch the job,
|
||||
* including the target hosts
|
||||
*/
|
||||
return orte_plm_base_local_slave_launch(jdata);
|
||||
}
|
||||
|
||||
/* setup the job */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:rshd: launching job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
|
||||
/* default to declaring the job launch as having failed */
|
||||
failed_job = jdata->jobid;
|
||||
|
||||
/* launch each proc */
|
||||
for (i=0; i < jdata->procs->size; i++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
/* only launch this proc if it isn't already running */
|
||||
if (ORTE_PROC_STATE_LAUNCHED <= proc->state) {
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:rshd: launching proc %s on node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), proc->nodename));
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
|
||||
continue;
|
||||
}
|
||||
node = (orte_node_t*)proc->node;
|
||||
/* setup the launch */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_slave_launch(proc->nodename, app,
|
||||
"orte-bootproxy.sh",
|
||||
&argv, &cmd))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* add the bootproxy cmd line options */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_append_bootproxy_args(app, &argv,
|
||||
proc->name.jobid, proc->name.vpid,
|
||||
jdata->map->num_nodes, jdata->num_procs,
|
||||
proc->node_rank, proc->local_rank,
|
||||
node->num_procs, jdata->total_slots_alloc,
|
||||
false))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* final cmd */
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
opal_output(0, "%s plm:rshd: final cmd:\n\t%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == param) ? "NULL" : param);
|
||||
if (NULL != param) free(param);
|
||||
}
|
||||
|
||||
/* fork a child to exec the rsh/ssh session */
|
||||
pid = fork();
|
||||
if (pid < 0) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
|
||||
rc = ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* child */
|
||||
if (pid == 0) {
|
||||
/* do the ssh launch - this will exit if it fails */
|
||||
ssh_child(cmd, argv);
|
||||
}
|
||||
/* father */
|
||||
/* declare the child launched */
|
||||
proc->state = ORTE_PROC_STATE_LAUNCHED;
|
||||
/* track number launched */
|
||||
OPAL_THREAD_LOCK(&mca_plm_rshd_component.lock);
|
||||
if (mca_plm_rshd_component.num_children++ >=
|
||||
mca_plm_rshd_component.num_concurrent) {
|
||||
opal_condition_wait(&mca_plm_rshd_component.cond, &mca_plm_rshd_component.lock);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&mca_plm_rshd_component.lock);
|
||||
|
||||
/* cleanup */
|
||||
opal_argv_free(argv);
|
||||
argv = NULL;
|
||||
free(cmd);
|
||||
cmd = NULL;
|
||||
|
||||
/* setup callback on sigchild - wait until setup above is complete
|
||||
* as the callback can occur in the call to orte_wait_cb
|
||||
*/
|
||||
orte_wait_cb(pid, wait_cb, (void*)proc);
|
||||
}
|
||||
|
||||
/* flag the launch as successful */
|
||||
failed_launch = false;
|
||||
if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) {
|
||||
jdata->state = ORTE_JOB_STATE_LAUNCHED;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
if (NULL != argv) {
|
||||
opal_argv_free(argv);
|
||||
}
|
||||
if (NULL != cmd) {
|
||||
free(cmd);
|
||||
}
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
orte_errmgr.update_state(failed_job, job_state,
|
||||
NULL, ORTE_PROC_STATE_UNDEF,
|
||||
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Terminate all processes for a given job
|
||||
*/
|
||||
int orte_plm_rshd_terminate_job(orte_jobid_t jobid)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
/**
|
||||
* No orteds to terminate
|
||||
*/
|
||||
int orte_plm_rshd_terminate_orteds(void)
|
||||
{
|
||||
orte_quit();
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Signal a job.  This launcher has no way to do that, so the request
 * is accepted and nothing is done.
 *
 * @param jobid   job to signal (ignored)
 * @param signal  signal number (ignored)
 * @return ORTE_SUCCESS (always)
 */
int orte_plm_rshd_signal_job(orte_jobid_t jobid, int32_t signal)
{
    (void)jobid;
    (void)signal;
    return ORTE_SUCCESS;
}
|
||||
|
||||
int orte_plm_rshd_finalize(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* cleanup any pending recvs */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Restore the default disposition of one signal.
 *
 * @param sig  signal number whose handler is reset to SIG_DFL
 *
 * The result of sigaction() is not checked.
 */
static void set_handler_default(int sig)
{
    struct sigaction dfl;

    /* default action, empty mask, no flags */
    sigemptyset(&dfl.sa_mask);
    dfl.sa_flags = 0;
    dfl.sa_handler = SIG_DFL;

    sigaction(sig, &dfl, NULL);
}
|
||||
|
||||
|
@ -1 +0,0 @@
|
||||
rhc
|
@ -1,53 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Preprocessor flags substituted by configure for this component.
AM_CPPFLAGS = $(plm_tmd_CPPFLAGS)

# Help text shipped together with the component.
dist_pkgdata_DATA = help-plm-tmd.txt

# Files that make up the tmd plm component.
sources = \
        plm_tmd.h \
        plm_tmd_component.c \
        plm_tmd_module.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).  Exactly one of "component" / "lib" is non-empty.

if MCA_BUILD_orte_plm_tmd_DSO
lib =
lib_sources =
component = mca_plm_tmd.la
component_sources = $(sources)
else
lib = libmca_plm_tmd.la
lib_sources = $(sources)
component =
component_sources =
endif

# DSO flavor.  It needs the same libraries as the static flavor;
# without _LIBADD the plugin would not be linked against
# $(plm_tmd_LIBS) at all.
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component)
mca_plm_tmd_la_SOURCES = $(component_sources)
mca_plm_tmd_la_LDFLAGS = -module -avoid-version $(plm_tmd_LDFLAGS)
mca_plm_tmd_la_LIBADD = $(plm_tmd_LIBS)

# Static flavor.
noinst_LTLIBRARIES = $(lib)
libmca_plm_tmd_la_SOURCES = $(lib_sources)
libmca_plm_tmd_la_LDFLAGS = -module -avoid-version $(plm_tmd_LDFLAGS)
libmca_plm_tmd_la_LIBADD = $(plm_tmd_LIBS)
|
@ -1,40 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_orte_plm_tmd_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_plm_tmd_CONFIG],[
    # Have configure generate the Makefile for this component.
    AC_CONFIG_FILES([orte/mca/plm/tmd/Makefile])

    # Run the TM check with the plm_tmd prefix; its two actions record
    # the outcome in plm_tmd_good.
    ORTE_CHECK_TM([plm_tmd], [plm_tmd_good=1], [plm_tmd_good=0])

    # When the check worked, copy the link flags into the wrapper
    # variables and run the action-if-found argument; otherwise run the
    # action-if-not-found argument.
    AS_IF([test "$plm_tmd_good" = "1"],
          [plm_tmd_WRAPPER_EXTRA_LDFLAGS="$plm_tmd_LDFLAGS"
           plm_tmd_WRAPPER_EXTRA_LIBS="$plm_tmd_LIBS"
           $1],
          [$2])

    # Substitute the build flags that the component Makefile uses.
    AC_SUBST([plm_tmd_CPPFLAGS])
    AC_SUBST([plm_tmd_LDFLAGS])
    AC_SUBST([plm_tmd_LIBS])
])dnl
|
@ -1,52 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
[tmd-bad-launchid]
|
||||
The TM (PBS / Torque) process starter cannot spawn the specified
|
||||
application on a remote node due to an invalid launch_id.
|
||||
|
||||
This is most likely due to use of the "--hostfile" option to the
|
||||
command line with one or more hosts in that file not having
|
||||
been allocated to this job.
|
||||
|
||||
Removing "--hostfile" from the command line will likely allow the
|
||||
application to be launched.
|
||||
#
|
||||
[multiple-prefixes]
|
||||
Multiple different --prefix options were specified to mpirun for the
|
||||
same node. This is a fatal error for the TM (PBS / Torque) process
|
||||
starter in Open MPI.
|
||||
|
||||
The first two prefix values supplied for node %s were:
|
||||
%s
|
||||
and %s
|
||||
#
|
||||
[tmd-spawn-failed]
|
||||
The TM (PBS / Torque) process starter failed to spawn a daemon (orted)
|
||||
on a remote node.
|
||||
|
||||
Command line: %s
|
||||
Node name: %s
|
||||
Launch id: %d
|
||||
|
||||
If you do not understand this error message, please try the following:
|
||||
|
||||
1. Ensure that the executable "orted" is in your PATH
|
||||
2. Use the --prefix option to indicate where we can
|
||||
find that executable
|
||||
3. Talk to your local system administrator
|
@ -1,43 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef ORTE_PLM_TMD_EXPORT_H
|
||||
#define ORTE_PLM_TMD_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct orte_plm_tmd_component_t {
|
||||
orte_plm_base_component_t super;
|
||||
bool want_path_check;
|
||||
char **checked_paths;
|
||||
};
|
||||
typedef struct orte_plm_tmd_component_t orte_plm_tmd_component_t;
|
||||
|
||||
/* Globally exported variables */
|
||||
ORTE_DECLSPEC extern orte_plm_tmd_component_t mca_plm_tmd_component;
|
||||
extern orte_plm_base_module_t orte_plm_tmd_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* ORTE_PLM_TMD_EXPORT_H */
|
@ -1,128 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/argv.h"
|
||||
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "plm_tmd.h"
|
||||
|
||||
|
||||
/*
|
||||
* Public string showing the plm ompi_tm component version number
|
||||
*/
|
||||
const char *mca_plm_tmd_component_version_string =
|
||||
"Open MPI tmd plm MCA component version " ORTE_VERSION;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Local function
|
||||
*/
|
||||
static int plm_tmd_open(void);
|
||||
static int plm_tmd_close(void);
|
||||
static int orte_plm_tmd_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointers to our public functions in it
|
||||
*/
|
||||
|
||||
orte_plm_tmd_component_t mca_plm_tmd_component = {
|
||||
{
|
||||
/* First, the mca_component_t struct containing meta information
|
||||
about the component itself */
|
||||
|
||||
{
|
||||
ORTE_PLM_BASE_VERSION_2_0_0,
|
||||
|
||||
/* Component name and version */
|
||||
"tmd",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
plm_tmd_open,
|
||||
plm_tmd_close,
|
||||
orte_plm_tmd_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
static int plm_tmd_open(void)
|
||||
{
|
||||
int tmp;
|
||||
mca_base_component_t *comp = &mca_plm_tmd_component.super.base_version;
|
||||
|
||||
mca_base_param_reg_int(comp, "want_path_check",
|
||||
"Whether the launching process should check for the plm_tmd_orted executable in the PATH before launching (the TM API does not give an indication of failure; this is a somewhat-lame workaround; non-zero values enable this check)",
|
||||
false, false, (int) true, &tmp);
|
||||
mca_plm_tmd_component.want_path_check = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
mca_plm_tmd_component.checked_paths = NULL;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int plm_tmd_close(void)
|
||||
{
|
||||
if (NULL != mca_plm_tmd_component.checked_paths) {
|
||||
opal_argv_free(mca_plm_tmd_component.checked_paths);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Component query: offer the tmd module only when both PBS_ENVIRONMENT
 * and PBS_JOBID are present in the environment.
 *
 * On success *module points at orte_plm_tmd_module, *priority is 2 and
 * ORTE_SUCCESS is returned.  Otherwise *module is set to NULL,
 * *priority is left untouched and ORTE_ERROR is returned.
 */
static int orte_plm_tmd_component_query(mca_base_module_t **module, int *priority)
{
    /* not inside a TM job: decline */
    if (NULL == getenv("PBS_ENVIRONMENT") ||
        NULL == getenv("PBS_JOBID")) {
        *module = NULL;
        return ORTE_ERROR;
    }

    *priority = 2;
    *module = (mca_base_module_t *) &orte_plm_tmd_module;
    return ORTE_SUCCESS;
}
|
@ -1,834 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <signal.h>
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif
|
||||
#ifdef HAVE_SCHED_H
|
||||
#include <sched.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <tm.h>
|
||||
|
||||
#include "opal/mca/installdirs/installdirs.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmaps/rmaps.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "plm_tmd.h"
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int plm_tmd_init(void);
|
||||
static int plm_tmd_launch_job(orte_job_t *jdata);
|
||||
static int plm_tmd_terminate_job(orte_jobid_t jobid);
|
||||
static int plm_tmd_terminate_orteds(void);
|
||||
static int plm_tmd_signal_job(orte_jobid_t jobid, int32_t signal);
|
||||
static int plm_tmd_finalize(void);
|
||||
|
||||
static int plm_tmd_connect(void);
|
||||
static void failed_start(int fd, short event, void *arg);
|
||||
static int obit_submit(int tid);
|
||||
|
||||
/*
|
||||
* Local "global" variables
|
||||
*/
|
||||
static opal_event_t *ev=NULL;
|
||||
static bool connected;
|
||||
static tm_event_t *events_spawn = NULL;
|
||||
static tm_event_t *events_obit = NULL;
|
||||
static tm_task_id *tm_task_ids = NULL;
|
||||
static int *evs = NULL;
|
||||
static bool time_is_up;
|
||||
|
||||
/*
|
||||
* Global variable
|
||||
*/
|
||||
orte_plm_base_module_t orte_plm_tmd_module = {
|
||||
plm_tmd_init,
|
||||
orte_plm_base_set_hnp_name,
|
||||
plm_tmd_launch_job,
|
||||
NULL,
|
||||
plm_tmd_terminate_job,
|
||||
plm_tmd_terminate_orteds,
|
||||
NULL,
|
||||
plm_tmd_signal_job,
|
||||
plm_tmd_finalize
|
||||
};
|
||||
|
||||
/* catch timeout to allow cmds to progress */
|
||||
static void timer_cb(int fd, short event, void *cbdata)
|
||||
{
|
||||
opal_event_t *ev = (opal_event_t*)cbdata;
|
||||
|
||||
/* free event */
|
||||
if (NULL != ev) {
|
||||
free(ev);
|
||||
}
|
||||
/* declare time is up */
|
||||
time_is_up = true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Init the module
|
||||
*/
|
||||
static int plm_tmd_init(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/* When working in this function, ALWAYS jump to "cleanup" if
|
||||
* you encounter an error so that orterun will be woken up and
|
||||
* the job can cleanly terminate
|
||||
*/
|
||||
static int plm_tmd_launch_job(orte_job_t *jdata)
|
||||
{
|
||||
orte_job_t *jdatorted;
|
||||
orte_job_map_t *map = NULL;
|
||||
orte_app_context_t **apps;
|
||||
orte_node_t **nodes;
|
||||
int proc_vpid_index;
|
||||
char *param;
|
||||
char **env = NULL;
|
||||
char *var;
|
||||
char **argv = NULL;
|
||||
int argc = 0;
|
||||
int rc;
|
||||
orte_std_cntr_t launched = 0, i;
|
||||
char *bin_base = NULL, *lib_base = NULL;
|
||||
int local_err;
|
||||
bool failed_launch = true;
|
||||
mode_t current_umask;
|
||||
orte_jobid_t failed_job;
|
||||
orte_job_state_t job_state = ORTE_JOB_STATE_NEVER_LAUNCHED;
|
||||
int offset;
|
||||
tm_event_t eventpolled;
|
||||
orte_std_cntr_t num_daemons;
|
||||
opal_event_t *timerev;
|
||||
int j;
|
||||
|
||||
/* default to declaring the daemons as failed */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
connected = false;
|
||||
|
||||
/* setup the job */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm: launching job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
|
||||
/* Get the map for this job */
|
||||
if (NULL == (map = orte_rmaps.get_job_map(jdata->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto cleanup;
|
||||
}
|
||||
apps = (orte_app_context_t**)jdata->apps->addr;
|
||||
nodes = (orte_node_t**)map->nodes->addr;
|
||||
|
||||
if (0 == map->num_new_daemons) {
|
||||
/* have all the daemons we need - launch app */
|
||||
goto launch_apps;
|
||||
}
|
||||
|
||||
/* lookup the daemon job object - must do this -after- the job is
|
||||
* setup so the number of required daemons has been updated
|
||||
*/
|
||||
if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto cleanup;
|
||||
}
|
||||
num_daemons = jdatorted->num_procs - 1; /* do not include myself as I am already here! */
|
||||
if (0 >= num_daemons) {
|
||||
/* this won't work */
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Allocate a bunch of TM events to use */
|
||||
if (NULL == events_spawn) {
|
||||
/* spawn events for first launch */
|
||||
events_spawn = (tm_event_t*)malloc(num_daemons * sizeof(tm_event_t));
|
||||
if (NULL == events_spawn) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
} else {
|
||||
/* comm_spawn launch */
|
||||
events_spawn = (tm_event_t*)realloc(events_spawn, sizeof(tm_event_t) * num_daemons);
|
||||
if (NULL == events_spawn) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
}
|
||||
if (NULL == events_obit) {
|
||||
/* obit events for first launch */
|
||||
events_obit = (tm_event_t*)malloc(num_daemons * sizeof(tm_event_t));
|
||||
if (NULL == events_obit) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
} else {
|
||||
/* comm_spawn launch */
|
||||
events_obit = (tm_event_t*)realloc(events_obit, sizeof(tm_event_t) * num_daemons);
|
||||
if (NULL == events_obit) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
}
|
||||
if (NULL == evs) {
|
||||
/* evs for first launch */
|
||||
evs = (int*)malloc(num_daemons * sizeof(tm_event_t));
|
||||
if (NULL == evs) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
} else {
|
||||
/* comm_spawn launch */
|
||||
evs = (int*)realloc(evs, sizeof(int) * num_daemons);
|
||||
if (NULL == evs) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* allocate task ids for the orteds */
|
||||
if (NULL == tm_task_ids) {
|
||||
/* first launch */
|
||||
tm_task_ids = (tm_task_id*)malloc(num_daemons * sizeof(tm_task_id));
|
||||
if (NULL == tm_task_ids) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
} else {
|
||||
/* comm_spawn launch */
|
||||
tm_task_ids = (tm_task_id*)realloc(tm_task_ids, sizeof(tm_task_id) * num_daemons);
|
||||
if (NULL == tm_task_ids) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* compute the offset into the event/task arrays */
|
||||
offset = num_daemons - map->num_new_daemons;
|
||||
|
||||
/* initialize them */
|
||||
for (i=0; i < map->num_new_daemons; i++) {
|
||||
*(tm_task_ids + offset + i) = TM_NULL_TASK;
|
||||
*(events_spawn + offset + i) = TM_NULL_EVENT;
|
||||
*(events_obit + offset + i) = TM_NULL_EVENT;
|
||||
*(evs + offset + i) = 0;
|
||||
}
|
||||
|
||||
/* add the daemon command (as specified by user) */
|
||||
orte_plm_base_setup_orted_cmd(&argc, &argv);
|
||||
|
||||
/* Add basic orted command line options */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
|
||||
&proc_vpid_index,
|
||||
true, NULL);
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm: final top-level argv:\n\t%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == param) ? "NULL" : param));
|
||||
if (NULL != param) free(param);
|
||||
}
|
||||
|
||||
rc = plm_tmd_connect();
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
goto cleanup;
|
||||
}
|
||||
connected = true;
|
||||
|
||||
/* Figure out the basenames for the libdir and bindir. There is a
|
||||
lengthy comment about this in plm_rsh_module.c explaining all
|
||||
the rationale for how / why we're doing this. */
|
||||
lib_base = opal_basename(opal_install_dirs.libdir);
|
||||
bin_base = opal_basename(opal_install_dirs.bindir);
|
||||
|
||||
/* setup environment */
|
||||
env = opal_argv_copy(orte_launch_environ);
|
||||
|
||||
/* add our umask -- see big note in orted.c */
|
||||
current_umask = umask(0);
|
||||
umask(current_umask);
|
||||
asprintf(&var, "0%o", current_umask);
|
||||
opal_setenv("ORTE_DAEMON_UMASK_VALUE", var, true, &env);
|
||||
free(var);
|
||||
|
||||
/* If we have a prefix, then modify the PATH and
|
||||
LD_LIBRARY_PATH environment variables. We only allow
|
||||
a single prefix to be specified. Since there will
|
||||
always be at least one app_context, we take it from
|
||||
there
|
||||
*/
|
||||
if (NULL != apps[0]->prefix_dir) {
|
||||
char *newenv;
|
||||
|
||||
for (i = 0; NULL != env && NULL != env[i]; ++i) {
|
||||
/* Reset PATH */
|
||||
if (0 == strncmp("PATH=", env[i], 5)) {
|
||||
asprintf(&newenv, "%s/%s:%s",
|
||||
apps[0]->prefix_dir, bin_base, env[i] + 5);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm: resetting PATH: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
newenv));
|
||||
opal_setenv("PATH", newenv, true, &env);
|
||||
free(newenv);
|
||||
}
|
||||
|
||||
/* Reset LD_LIBRARY_PATH */
|
||||
else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) {
|
||||
asprintf(&newenv, "%s/%s:%s",
|
||||
apps[0]->prefix_dir, lib_base, env[i] + 16);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm: resetting LD_LIBRARY_PATH: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
newenv));
|
||||
opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
|
||||
free(newenv);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* set the job state to indicate we attempted to launch */
|
||||
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
|
||||
/* Iterate through each of the nodes and spin
|
||||
* up a daemon.
|
||||
*/
|
||||
for (i = 0; i < map->num_nodes; i++) {
|
||||
orte_node_t* node = nodes[i];
|
||||
char* vpid_string;
|
||||
|
||||
/* if this daemon already exists, don't launch it! */
|
||||
if (node->daemon_launched) {
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm: launching on node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
node->name));
|
||||
|
||||
/* setup process name */
|
||||
rc = orte_util_convert_vpid_to_string(&vpid_string, nodes[i]->daemon->name.vpid);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
opal_output(0, "plm:tm: unable to get daemon vpid as string");
|
||||
goto cleanup;
|
||||
}
|
||||
free(argv[proc_vpid_index]);
|
||||
argv[proc_vpid_index] = strdup(vpid_string);
|
||||
free(vpid_string);
|
||||
|
||||
/* exec the daemon */
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm: executing:\n\t%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == param) ? "NULL" : param));
|
||||
if (NULL != param) free(param);
|
||||
}
|
||||
|
||||
rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + offset + launched, events_spawn + offset + launched);
|
||||
if (TM_SUCCESS != rc) {
|
||||
orte_show_help("help-plm-tm.txt", "tm-spawn-failed",
|
||||
true, argv[0], node->name, node->launch_id);
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
launched++;
|
||||
|
||||
/* Allow some progress to occur */
|
||||
opal_event_loop(opal_event_base, OPAL_EVLOOP_NONBLOCK);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm:launch: finished spawning orteds",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* setup a timer to give the cmd a chance to be sent */
|
||||
time_is_up = false;
|
||||
ORTE_DETECT_TIMEOUT(&timerev, launched,
|
||||
100, -1, timer_cb);
|
||||
|
||||
ORTE_PROGRESSED_WAIT(time_is_up, 0, 1);
|
||||
|
||||
/* TM poll for all the spawns */
|
||||
while (0 < launched) {
|
||||
rc = tm_poll(TM_NULL_EVENT, &eventpolled, (int)false, &local_err);
|
||||
if (TM_SUCCESS != rc) {
|
||||
opal_output(0, "plm:tm: event poll for spawned daemon failed, return status = %d", rc);
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
/* if we get back the NULL event, then just continue */
|
||||
if (eventpolled == TM_NULL_EVENT) {
|
||||
continue;
|
||||
}
|
||||
/* look for the spawned event */
|
||||
for (j=0; j < map->num_new_daemons; j++) {
|
||||
if (eventpolled == *(events_spawn + offset + j)) {
|
||||
/* got the event - check returned code */
|
||||
if (local_err) {
|
||||
/* this orted failed to launch! */
|
||||
orte_show_help("help-plm-tm.txt", "tm-spawn-failed",
|
||||
true, argv[0], nodes[j]->name, nodes[j]->launch_id);
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
/* register the corresponding obit so we can detect when this
|
||||
* orted terminates
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = obit_submit(offset+j))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
/* all done with this event */
|
||||
goto MOVEON;
|
||||
}
|
||||
}
|
||||
/* if we get here, then we failed to find the event */
|
||||
opal_output(0, "TM FAILED TO FIND SPAWN EVENT WHEN LAUNCHING");
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
|
||||
MOVEON:
|
||||
launched--;
|
||||
}
|
||||
|
||||
/* set a timer to tell us if one or more daemon's fails to start - use the
|
||||
* millisec/daemon timeout provided by the user to compute time
|
||||
*/
|
||||
if (0 < orte_startup_timeout) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm: setting startup timer for %d milliseconds",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_startup_timeout));
|
||||
ORTE_DETECT_TIMEOUT(&ev, map->num_new_daemons,
|
||||
orte_startup_timeout*1000,
|
||||
-1, failed_start);
|
||||
}
|
||||
|
||||
/* wait for daemons to callback */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm: daemon launch failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* if issued, cancel the failed-to-start timer */
|
||||
if (NULL != ev) {
|
||||
opal_event_del(ev);
|
||||
}
|
||||
|
||||
launch_apps:
|
||||
/* since the daemons have launched, any failures now will be for the
|
||||
* application job
|
||||
*/
|
||||
failed_job = jdata->jobid;
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm: launch of apps failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* if we get here, then everything launched okay - record that fact */
|
||||
failed_launch = false;
|
||||
|
||||
|
||||
cleanup:
|
||||
if (NULL != argv) {
|
||||
opal_argv_free(argv);
|
||||
}
|
||||
if (NULL != env) {
|
||||
opal_argv_free(env);
|
||||
}
|
||||
|
||||
if (NULL != lib_base) {
|
||||
free(lib_base);
|
||||
}
|
||||
if (NULL != bin_base) {
|
||||
free(bin_base);
|
||||
}
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
orte_errmgr.update_state(failed_job, job_state,
|
||||
NULL, ORTE_PROC_STATE_UNDEF,
|
||||
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm:launch: finished",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Terminating a single job is not supported by this module; the
 * request is rejected with ORTE_ERR_NOT_IMPLEMENTED.
 */
static int plm_tmd_terminate_job(orte_jobid_t jobid)
{
    (void) jobid;   /* intentionally unused */

    return ORTE_ERR_NOT_IMPLEMENTED;
}
|
||||
|
||||
/* flag raised by quicktime_cb() once the short timer has expired */
static bool timer_fired;

/*
 * Callback of the short "give things a moment" timer: it only raises
 * timer_fired.  Unlike timer_cb() it does not release the event.
 */
static void quicktime_cb(int fd, short event, void *cbdata)
{
    timer_fired = true;
}
|
||||
|
||||
/**
|
||||
* Terminate the orteds for a given job
|
||||
*/
|
||||
int plm_tmd_terminate_orteds(void)
|
||||
{
|
||||
int rc;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t **daemons;
|
||||
tm_event_t eventpolled;
|
||||
orte_vpid_t j, alive;
|
||||
int local_err;
|
||||
opal_event_t *timerev=NULL;
|
||||
opal_event_t *quicktime=NULL;
|
||||
struct timeval quicktimeval;
|
||||
bool aborted;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm: terminating orteds",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* lookup the daemon job object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
}
|
||||
alive = jdata->num_procs - 1; /* do not include myself! */
|
||||
daemons = (orte_proc_t**)jdata->procs->addr;
|
||||
aborted = false;
|
||||
|
||||
/* tell them to die! */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* if there are more than just me... */
|
||||
if (0 < alive) {
|
||||
/* setup a max time for the daemons to die */
|
||||
time_is_up = false;
|
||||
ORTE_DETECT_TIMEOUT(&timerev, alive,
|
||||
1000000, 60000000, timer_cb);
|
||||
|
||||
/* give the cmds a chance to get out */
|
||||
quicktimeval.tv_sec = 0;
|
||||
quicktimeval.tv_usec = 100;
|
||||
timer_fired = false;
|
||||
ORTE_DETECT_TIMEOUT(&quicktime, alive, 1000, 10000, quicktime_cb);
|
||||
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
|
||||
|
||||
/* now begin polling to see if daemons have terminated */
|
||||
while (!time_is_up && 0 < alive) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
|
||||
"%s plm:tm: polling for daemon termination",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
rc = tm_poll(TM_NULL_EVENT, &eventpolled, (int)false, &local_err);
|
||||
if (TM_SUCCESS != rc) {
|
||||
errno = local_err;
|
||||
opal_output(0, "plm:tm: event poll for daemon termination failed, return status = %d", rc);
|
||||
continue; /* we will wait for timeout to tell us to quit */
|
||||
}
|
||||
/* if we get back the NULL event, then just continue */
|
||||
if (eventpolled == TM_NULL_EVENT) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
|
||||
"%s plm:tm: got null event",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* give system a little time to progress */
|
||||
timer_fired = false;
|
||||
opal_event_evtimer_add(quicktime, &quicktimeval);
|
||||
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
|
||||
continue;
|
||||
}
|
||||
/* look for the obit event */
|
||||
for (j=0; j < jdata->num_procs-1; j++) {
|
||||
if (eventpolled == *(events_obit + j)) {
|
||||
/* got the event - check returned code */
|
||||
if (local_err == TM_ESYSTEM) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
|
||||
"%s plm:tm: got TM_ESYSTEM on obit - resubmitting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
if (ORTE_SUCCESS != (rc = obit_submit(j))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto MOVEON;
|
||||
}
|
||||
/* give system a little time to progress */
|
||||
timer_fired = false;
|
||||
opal_event_evtimer_add(quicktime, &quicktimeval);
|
||||
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
|
||||
}
|
||||
if (0 != local_err) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
|
||||
"%s plm:tm: got error %d on obit for task %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), local_err, j));
|
||||
rc = ORTE_ERROR;
|
||||
goto MOVEON;
|
||||
}
|
||||
/* this daemon has terminated */
|
||||
*(tm_task_ids+j) = TM_NULL_TASK;
|
||||
*(events_obit+j) = TM_NULL_EVENT;
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
|
||||
"%s plm:tm: task %d exited with status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j, *(evs+j)));
|
||||
/* update the termination status for this daemon */
|
||||
daemons[j+1]->exit_code = *(evs+j);
|
||||
if (0 != daemons[j+1]->exit_code) {
|
||||
daemons[j+1]->state = ORTE_PROC_STATE_ABORTED;
|
||||
aborted = true;
|
||||
} else {
|
||||
daemons[j+1]->state = ORTE_PROC_STATE_TERMINATED;
|
||||
}
|
||||
jdata->num_terminated++;
|
||||
/* all done with this event */
|
||||
goto MOVEON;
|
||||
}
|
||||
}
|
||||
/* if we get here, then we failed to find the event */
|
||||
opal_output(0, "TM FAILED TO FIND OBIT EVENT");
|
||||
|
||||
MOVEON:
|
||||
alive--;
|
||||
}
|
||||
|
||||
/* release event if not already done */
|
||||
if (NULL != quicktime) {
|
||||
free(quicktime);
|
||||
}
|
||||
if (NULL != timerev) {
|
||||
opal_event_del(timerev);
|
||||
free(timerev);
|
||||
}
|
||||
} else {
|
||||
/* still need to give the cmds a chance to get out so I can process
|
||||
* them myself!
|
||||
*/
|
||||
timer_fired = false;
|
||||
ORTE_DETECT_TIMEOUT(&quicktime, 1, 1000, 10000, quicktime_cb);
|
||||
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
|
||||
}
|
||||
|
||||
/* declare the daemons done */
|
||||
if (aborted || 0 < alive) {
|
||||
jdata->state = ORTE_JOB_STATE_ABORTED;
|
||||
} else {
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
}
|
||||
orte_quit();
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
 * Signal a job: ask the daemons to deliver the given signal to their
 * local processes of that job.
 *
 * Returns the status of the request; a failure is logged first.
 */
static int plm_tmd_signal_job(orte_jobid_t jobid, int32_t signal)
{
    int rc = orte_plm_base_orted_signal_local_procs(jobid, signal);

    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}
|
||||
|
||||
|
||||
/*
|
||||
* Free stuff
|
||||
*/
|
||||
static int plm_tmd_finalize(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* cleanup any pending recvs */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
if (connected) {
|
||||
tm_finalize();
|
||||
}
|
||||
|
||||
/* cleanup data arrays */
|
||||
if (NULL != events_spawn) {
|
||||
free(events_spawn);
|
||||
}
|
||||
if (NULL != events_obit) {
|
||||
free(events_obit);
|
||||
}
|
||||
if (NULL != tm_task_ids) {
|
||||
free(tm_task_ids);
|
||||
}
|
||||
if (NULL != evs) {
|
||||
free(evs);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int plm_tmd_connect(void)
|
||||
{
|
||||
int ret;
|
||||
struct tm_roots tm_root;
|
||||
int count, progress;
|
||||
|
||||
/* try a couple times to connect - might get busy signals every
|
||||
now and then */
|
||||
for (count = 0 ; count < 10; ++count) {
|
||||
ret = tm_init(NULL, &tm_root);
|
||||
if (TM_SUCCESS == ret) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
for (progress = 0 ; progress < 10 ; ++progress) {
|
||||
opal_progress();
|
||||
#if HAVE_SCHED_YIELD
|
||||
sched_yield();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_ERR_RESOURCE_BUSY;
|
||||
}
|
||||
|
||||
|
||||
/* call this function if the timer fires indicating that one
|
||||
* or more daemons failed to start
|
||||
*/
|
||||
static void failed_start(int fd, short dummy, void *arg)
|
||||
{
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm:failed_start",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* if we are aborting, ignore this */
|
||||
if (orte_abnormal_term_ordered) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm:failed_start - abnormal term in progress",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
return;
|
||||
}
|
||||
|
||||
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_FAILED_TO_START,
|
||||
NULL, ORTE_PROC_STATE_UNDEF, 0, ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
|
||||
static int obit_submit(int tid)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (TM_SUCCESS != (rc = tm_obit(*(tm_task_ids+tid), evs+tid, events_obit+tid))) {
|
||||
opal_output(0, "failed to register termination notice for task %d", tid);
|
||||
rc = ORTE_ERROR;
|
||||
return rc;
|
||||
}
|
||||
if (*(events_obit+tid) == TM_NULL_EVENT) {
|
||||
opal_output(0, "task %d is already dead", tid);
|
||||
} else if (*(events_obit+tid) == TM_ERROR_EVENT) {
|
||||
opal_output(0, "Error on obit return - got error event for task %d", tid);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
Загрузка…
x
Ссылка в новой задаче
Block a user