Sandbox for next-gen launch
This commit was SVN r18715.
Этот коммит содержится в:
родитель
3f95b906c5
Коммит
3e61a3f92e
0
orte/mca/plm/tmd/.ompi_ignore
Обычный файл
0
orte/mca/plm/tmd/.ompi_ignore
Обычный файл
1
orte/mca/plm/tmd/.ompi_unignore
Обычный файл
1
orte/mca/plm/tmd/.ompi_unignore
Обычный файл
@ -0,0 +1 @@
|
||||
rhc
|
52
orte/mca/plm/tmd/Makefile.am
Обычный файл
52
orte/mca/plm/tmd/Makefile.am
Обычный файл
@ -0,0 +1,52 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
AM_CPPFLAGS = $(plm_tmd_CPPFLAGS)

dist_pkgdata_DATA = help-plm-tmd.txt

sources = \
        plm_tmd.h \
        plm_tmd_component.c \
        plm_tmd_module.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

if OMPI_BUILD_plm_tmd_DSO
lib =
lib_sources =
component = mca_plm_tmd.la
component_sources = $(sources)
else
lib = libmca_plm_tmd.la
lib_sources = $(sources)
component =
component_sources =
endif

mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component)
mca_plm_tmd_la_SOURCES = $(component_sources)
mca_plm_tmd_la_LDFLAGS = -module -avoid-version $(plm_tmd_LDFLAGS)
# The DSO build must link against the TM libraries too, exactly like the
# static convenience library below; otherwise the tm_* symbols used by
# plm_tmd_module.c are left unresolved in the loadable module.
mca_plm_tmd_la_LIBADD = $(plm_tmd_LIBS)

noinst_LTLIBRARIES = $(lib)
libmca_plm_tmd_la_SOURCES = $(lib_sources)
libmca_plm_tmd_la_LDFLAGS = -module -avoid-version $(plm_tmd_LDFLAGS)
libmca_plm_tmd_la_LIBADD = $(plm_tmd_LIBS)
|
37
orte/mca/plm/tmd/configure.m4
Обычный файл
37
orte/mca/plm/tmd/configure.m4
Обычный файл
@ -0,0 +1,37 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_plm_tmd_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_plm_tmd_CONFIG],[
|
||||
OMPI_CHECK_TM([plm_tmd], [plm_tmd_good=1], [plm_tmd_good=0])
|
||||
|
||||
# If the check worked, set the wrapper flags.
|
||||
# Evaluate succeed / fail
|
||||
AS_IF([test "$plm_tmd_good" = "1"],
|
||||
[plm_tmd_WRAPPER_EXTRA_LDFLAGS="$plm_tmd_LDFLAGS"
|
||||
plm_tmd_WRAPPER_EXTRA_LIBS="$plm_tmd_LIBS"
|
||||
$1],
|
||||
[$2])
|
||||
|
||||
# set build flags to use in makefile
|
||||
AC_SUBST([plm_tmd_CPPFLAGS])
|
||||
AC_SUBST([plm_tmd_LDFLAGS])
|
||||
AC_SUBST([plm_tmd_LIBS])
|
||||
])dnl
|
22
orte/mca/plm/tmd/configure.params
Обычный файл
22
orte/mca/plm/tmd/configure.params
Обычный файл
@ -0,0 +1,22 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
PARAM_CONFIG_FILES="Makefile"
|
52
orte/mca/plm/tmd/help-plm-tmd.txt
Обычный файл
52
orte/mca/plm/tmd/help-plm-tmd.txt
Обычный файл
@ -0,0 +1,52 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
[tmd-bad-launchid]
|
||||
The TM (PBS / Torque) process starter cannot spawn the specified
|
||||
application on a remote node due to an invalid launch_id.
|
||||
|
||||
This is most likely due to use of the "--hostfile" option to the
|
||||
command line with one or more hosts in that file not having
|
||||
been allocated to this job.
|
||||
|
||||
Removing "--hostfile" from the command line will likely allow the
|
||||
application to be launched.
|
||||
#
|
||||
[multiple-prefixes]
|
||||
Multiple different --prefix options were specified to mpirun for the
|
||||
same node. This is a fatal error for the TM (PBS / Torque) process
|
||||
starter in Open MPI.
|
||||
|
||||
The first two prefix values supplied for node %s were:
|
||||
%s
|
||||
and %s
|
||||
#
|
||||
[tmd-spawn-failed]
|
||||
The TM (PBS / Torque) process starter failed to spawn a daemon (orted)
|
||||
on a remote node.
|
||||
|
||||
Command line: %s
|
||||
Node name: %s
|
||||
Launch id: %d
|
||||
|
||||
If you do not understand this error message, please try the following:
|
||||
|
||||
1. Ensure that the executable "orted" is in your PATH
|
||||
2. Use the --prefix option to indicate where we can
|
||||
find that executable
|
||||
3. Talk to your local system administrator
|
44
orte/mca/plm/tmd/plm_tmd.h
Обычный файл
44
orte/mca/plm/tmd/plm_tmd.h
Обычный файл
@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef ORTE_PLM_TMD_EXPORT_H
|
||||
#define ORTE_PLM_TMD_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct orte_plm_tmd_component_t {
|
||||
orte_plm_base_component_t super;
|
||||
bool want_path_check;
|
||||
char *orted;
|
||||
char **checked_paths;
|
||||
};
|
||||
typedef struct orte_plm_tmd_component_t orte_plm_tmd_component_t;
|
||||
|
||||
/* Globally exported variables */
|
||||
ORTE_DECLSPEC extern orte_plm_tmd_component_t mca_plm_tmd_component;
|
||||
extern orte_plm_base_module_t orte_plm_tmd_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* ORTE_PLM_TMD_EXPORT_H */
|
139
orte/mca/plm/tmd/plm_tmd_component.c
Обычный файл
139
orte/mca/plm/tmd/plm_tmd_component.c
Обычный файл
@ -0,0 +1,139 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/argv.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "plm_tmd.h"
|
||||
|
||||
|
||||
/*
|
||||
 * Public string showing the tmd plm component version number
|
||||
*/
|
||||
const char *mca_plm_tmd_component_version_string =
|
||||
"Open MPI tmd plm MCA component version " ORTE_VERSION;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Local function
|
||||
*/
|
||||
static int plm_tmd_open(void);
|
||||
static int plm_tmd_close(void);
|
||||
static int orte_plm_tmd_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointers to our public functions in it
|
||||
*/
|
||||
|
||||
orte_plm_tmd_component_t mca_plm_tmd_component = {
|
||||
{
|
||||
/* First, the mca_component_t struct containing meta information
|
||||
about the component itself */
|
||||
|
||||
{
|
||||
/* Indicate that we are a plm v1.0.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
ORTE_PLM_BASE_VERSION_1_0_0,
|
||||
|
||||
/* Component name and version */
|
||||
"tmd",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
plm_tmd_open,
|
||||
plm_tmd_close,
|
||||
orte_plm_tmd_component_query
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
static int plm_tmd_open(void)
|
||||
{
|
||||
int tmp;
|
||||
mca_base_component_t *comp = &mca_plm_tmd_component.super.base_version;
|
||||
|
||||
mca_base_param_reg_string(comp, "orted",
|
||||
"Command to use to start proxy orted",
|
||||
false, false, "orted",
|
||||
&mca_plm_tmd_component.orted);
|
||||
mca_base_param_reg_int(comp, "want_path_check",
|
||||
"Whether the launching process should check for the plm_tmd_orted executable in the PATH before launching (the TM API does not give an idication of failure; this is a somewhat-lame workaround; non-zero values enable this check)",
|
||||
false, false, (int) true, &tmp);
|
||||
mca_plm_tmd_component.want_path_check = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
mca_plm_tmd_component.checked_paths = NULL;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int plm_tmd_close(void)
|
||||
{
|
||||
if (NULL != mca_plm_tmd_component.checked_paths) {
|
||||
opal_argv_free(mca_plm_tmd_component.checked_paths);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Component query: offer the tmd module only when running under a TM job.
 *
 * The job is detected by the presence of both the PBS_ENVIRONMENT and
 * PBS_JOBID environment variables.
 *
 * @param module   [out] set to the tmd module on success, NULL otherwise
 * @param priority [out] set to 1 on success; left untouched otherwise
 *
 * @return ORTE_SUCCESS when both variables are set, ORTE_ERROR otherwise.
 */
static int orte_plm_tmd_component_query(mca_base_module_t **module, int *priority)
{
    /* Not inside a TM job unless both variables are present */
    if (NULL == getenv("PBS_ENVIRONMENT") ||
        NULL == getenv("PBS_JOBID")) {
        *module = NULL;
        return ORTE_ERROR;
    }

    *priority = 1;
    *module = (mca_base_module_t *) &orte_plm_tmd_module;
    return ORTE_SUCCESS;
}
|
554
orte/mca/plm/tmd/plm_tmd_module.c
Обычный файл
554
orte/mca/plm/tmd/plm_tmd_module.c
Обычный файл
@ -0,0 +1,554 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#if HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <signal.h>
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif
|
||||
#ifdef HAVE_SCHED_H
|
||||
#include <sched.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include <errno.h>
|
||||
#include <tm.h>
|
||||
|
||||
#include "opal/mca/installdirs/installdirs.h"
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/event/event.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_wakeup.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmaps/rmaps.h"
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "plm_tmd.h"
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int plm_tmd_init(void);
|
||||
static int plm_tmd_launch_job(orte_job_t *jdata);
|
||||
static int plm_tmd_terminate_job(orte_jobid_t jobid);
|
||||
static int plm_tmd_terminate_orteds(void);
|
||||
static int plm_tmd_signal_job(orte_jobid_t jobid, int32_t signal);
|
||||
static int plm_tmd_finalize(void);
|
||||
|
||||
static int plm_tmd_connect(void);
|
||||
static int plm_tmd_disconnect(void);
|
||||
static void failed_start(int fd, short event, void *arg);
|
||||
|
||||
/*
|
||||
* Local "global" variables
|
||||
*/
|
||||
static opal_event_t *ev=NULL;
|
||||
|
||||
/*
|
||||
* Global variable
|
||||
*/
|
||||
orte_plm_base_module_t orte_plm_tmd_module = {
|
||||
plm_tmd_init,
|
||||
orte_plm_base_set_hnp_name,
|
||||
plm_tmd_launch_job,
|
||||
NULL,
|
||||
plm_tmd_terminate_job,
|
||||
plm_tmd_terminate_orteds,
|
||||
plm_tmd_signal_job,
|
||||
plm_tmd_finalize
|
||||
};
|
||||
|
||||
/**
|
||||
* Init the module
|
||||
*/
|
||||
static int plm_tmd_init(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/* When working in this function, ALWAYS jump to "cleanup" if
|
||||
* you encounter an error so that orterun will be woken up and
|
||||
* the job can cleanly terminate
|
||||
*/
|
||||
static int plm_tmd_launch_job(orte_job_t *jdata)
|
||||
{
|
||||
orte_job_map_t *map = NULL;
|
||||
orte_app_context_t **apps;
|
||||
orte_node_t **nodes;
|
||||
int node_name_index;
|
||||
int proc_vpid_index;
|
||||
char *param;
|
||||
char **env = NULL;
|
||||
char *var;
|
||||
char **argv = NULL;
|
||||
int argc;
|
||||
int rc;
|
||||
bool connected = false;
|
||||
orte_std_cntr_t launched = 0, i;
|
||||
char *bin_base = NULL, *lib_base = NULL;
|
||||
tm_event_t *tm_events = NULL;
|
||||
tm_task_id *tm_task_ids = NULL;
|
||||
int local_err;
|
||||
tm_event_t event;
|
||||
bool failed_launch = true;
|
||||
mode_t current_umask;
|
||||
orte_jobid_t failed_job;
|
||||
|
||||
/* default to declaring the daemons as failed */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
/* create a jobid for this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tmd: launching job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
|
||||
/* setup the job */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Get the map for this job */
|
||||
if (NULL == (map = orte_rmaps.get_job_map(jdata->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto cleanup;
|
||||
}
|
||||
apps = (orte_app_context_t**)jdata->apps->addr;
|
||||
nodes = (orte_node_t**)map->nodes->addr;
|
||||
|
||||
if (0 == map->num_new_daemons) {
|
||||
/* have all the daemons we need - launch app */
|
||||
goto launch_apps;
|
||||
}
|
||||
|
||||
/* Allocate a bunch of TM events to use for tm_spawn()ing */
|
||||
tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons);
|
||||
if (NULL == tm_events) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
tm_task_ids = malloc(sizeof(tm_task_id) * map->num_new_daemons);
|
||||
if (NULL == tm_task_ids) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* add the daemon command (as specified by user) */
|
||||
argv = opal_argv_split(mca_plm_tmd_component.orted, ' ');
|
||||
argc = opal_argv_count(argv);
|
||||
|
||||
/* Add basic orted command line options */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
|
||||
&proc_vpid_index,
|
||||
&node_name_index,
|
||||
true);
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tmd: final top-level argv:\n\t%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == param) ? "NULL" : param));
|
||||
if (NULL != param) free(param);
|
||||
}
|
||||
|
||||
rc = plm_tmd_connect();
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
goto cleanup;
|
||||
}
|
||||
connected = true;
|
||||
|
||||
/* Figure out the basenames for the libdir and bindir. There is a
|
||||
lengthy comment about this in plm_rsh_module.c explaining all
|
||||
the rationale for how / why we're doing this. */
|
||||
lib_base = opal_basename(opal_install_dirs.libdir);
|
||||
bin_base = opal_basename(opal_install_dirs.bindir);
|
||||
|
||||
/* setup environment */
|
||||
env = opal_argv_copy(orte_launch_environ);
|
||||
|
||||
/* add our umask -- see big note in orted.c */
|
||||
current_umask = umask(0);
|
||||
umask(current_umask);
|
||||
asprintf(&var, "0%o", current_umask);
|
||||
opal_setenv("ORTE_DAEMON_UMASK_VALUE", var, true, &env);
|
||||
free(var);
|
||||
|
||||
/* If we have a prefix, then modify the PATH and
|
||||
LD_LIBRARY_PATH environment variables. We only allow
|
||||
a single prefix to be specified. Since there will
|
||||
always be at least one app_context, we take it from
|
||||
there
|
||||
*/
|
||||
if (NULL != apps[0]->prefix_dir) {
|
||||
char *newenv;
|
||||
|
||||
for (i = 0; NULL != env && NULL != env[i]; ++i) {
|
||||
/* Reset PATH */
|
||||
if (0 == strncmp("PATH=", env[i], 5)) {
|
||||
asprintf(&newenv, "%s/%s:%s",
|
||||
apps[0]->prefix_dir, bin_base, env[i] + 5);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tmd: resetting PATH: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
newenv));
|
||||
opal_setenv("PATH", newenv, true, &env);
|
||||
free(newenv);
|
||||
}
|
||||
|
||||
/* Reset LD_LIBRARY_PATH */
|
||||
else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) {
|
||||
asprintf(&newenv, "%s/%s:%s",
|
||||
apps[0]->prefix_dir, lib_base, env[i] + 16);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tmd: resetting LD_LIBRARY_PATH: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
newenv));
|
||||
opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
|
||||
free(newenv);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* For this launch module, we encode all the required launch info
|
||||
* in the daemon's environment. This includes the nidmap for the
|
||||
* daemons, as well as the app_contexts and the map of ranks vs
|
||||
* nodes
|
||||
*/
|
||||
|
||||
/* encode the nidmap */
|
||||
|
||||
/* Iterate through each of the nodes and spin
|
||||
* up a daemon.
|
||||
*/
|
||||
for (i = 0; i < map->num_nodes; i++) {
|
||||
orte_node_t* node = nodes[i];
|
||||
char* vpid_string;
|
||||
|
||||
/* if this daemon already exists, don't launch it! */
|
||||
if (node->daemon_launched) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* setup node name */
|
||||
free(argv[node_name_index]);
|
||||
argv[node_name_index] = strdup(node->name);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tmd: launching on node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
node->name));
|
||||
|
||||
/* setup process name */
|
||||
rc = orte_util_convert_vpid_to_string(&vpid_string, nodes[i]->daemon->name.vpid);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
opal_output(0, "plm:tmd: unable to get daemon vpid as string");
|
||||
exit(-1);
|
||||
}
|
||||
free(argv[proc_vpid_index]);
|
||||
argv[proc_vpid_index] = strdup(vpid_string);
|
||||
free(vpid_string);
|
||||
|
||||
/* exec the daemon */
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tmd: executing:\n\t%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == param) ? "NULL" : param));
|
||||
if (NULL != param) free(param);
|
||||
}
|
||||
|
||||
rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched);
|
||||
if (TM_SUCCESS != rc) {
|
||||
orte_show_help("help-plm-tmd.txt", "tmd-spawn-failed",
|
||||
true, argv[0], node->name, node->launch_id);
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
launched++;
|
||||
|
||||
/* Allow some progress to occur */
|
||||
opal_event_loop(OPAL_EVLOOP_NONBLOCK);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tmd:launch: finished spawning orteds",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* TM poll for all the spawns */
|
||||
for (i = 0; i < launched; ++i) {
|
||||
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
|
||||
if (TM_SUCCESS != rc) {
|
||||
errno = local_err;
|
||||
opal_output(0, "plm:tmd: failed to poll for a spawned daemon, return status = %d", rc);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* set a timer to tell us if one or more daemon's fails to start - use the
|
||||
* millisec/daemon timeout provided by the user to compute time
|
||||
*/
|
||||
if (0 < orte_startup_timeout) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tmd: setting startup timer for %d milliseconds",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_startup_timeout));
|
||||
ORTE_DETECT_TIMEOUT(&ev, map->num_new_daemons,
|
||||
orte_startup_timeout*1000,
|
||||
-1, failed_start);
|
||||
}
|
||||
|
||||
/* wait for daemons to callback */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tmd: daemon launch failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* if issued, cancel the failed-to-start timer */
|
||||
if (NULL != ev) {
|
||||
opal_event_del(ev);
|
||||
}
|
||||
|
||||
launch_apps:
|
||||
/* since the daemons have launched, any failures now will be for the
|
||||
* application job
|
||||
*/
|
||||
failed_job = jdata->jobid;
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tmd: launch of apps failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* if we get here, then everything launched okay - record that fact */
|
||||
failed_launch = false;
|
||||
|
||||
|
||||
cleanup:
|
||||
if (NULL != argv) {
|
||||
opal_argv_free(argv);
|
||||
}
|
||||
if (NULL != env) {
|
||||
opal_argv_free(env);
|
||||
}
|
||||
|
||||
if (connected) {
|
||||
plm_tmd_disconnect();
|
||||
}
|
||||
if (NULL != tm_events) {
|
||||
free(tm_events);
|
||||
}
|
||||
if (NULL != tm_task_ids) {
|
||||
free(tm_task_ids);
|
||||
}
|
||||
|
||||
if (NULL != lib_base) {
|
||||
free(lib_base);
|
||||
}
|
||||
if (NULL != bin_base) {
|
||||
free(bin_base);
|
||||
}
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
}
|
||||
|
||||
/* setup a "heartbeat" timer to periodically check on
|
||||
* the state-of-health of the orteds, if requested AND
|
||||
* we actually launched some daemons!
|
||||
*/
|
||||
if (0 < map->num_new_daemons) {
|
||||
orte_plm_base_start_heart();
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tmd:launch: finished",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Terminate a job by ordering all of the daemons to kill their local
 * procs for it.
 *
 * @param jobid  the job whose processes are to be killed
 *
 * @return the status of orte_plm_base_orted_kill_local_procs(); a
 *         failure is logged before being returned.
 */
static int plm_tmd_terminate_job(orte_jobid_t jobid)
{
    int status = orte_plm_base_orted_kill_local_procs(jobid);

    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    return status;
}
|
||||
|
||||
|
||||
/**
|
||||
* Terminate the orteds for a given job
|
||||
*/
|
||||
int plm_tmd_terminate_orteds(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* now tell them to die! */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
 * Signal a job by ordering the daemons to pass the signal on to their
 * local procs.
 *
 * @param jobid   the job to signal
 * @param signal  the signal to deliver
 *
 * @return the status of orte_plm_base_orted_signal_local_procs(); a
 *         failure is logged before being returned.
 */
static int plm_tmd_signal_job(orte_jobid_t jobid, int32_t signal)
{
    int status = orte_plm_base_orted_signal_local_procs(jobid, signal);

    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    return status;
}
|
||||
|
||||
|
||||
/*
|
||||
* Free stuff
|
||||
*/
|
||||
static int plm_tmd_finalize(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* cleanup any pending recvs */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int plm_tmd_connect(void)
|
||||
{
|
||||
int ret;
|
||||
struct tm_roots tm_root;
|
||||
int count, progress;
|
||||
|
||||
/* try a couple times to connect - might get busy signals every
|
||||
now and then */
|
||||
for (count = 0 ; count < 10; ++count) {
|
||||
ret = tm_init(NULL, &tm_root);
|
||||
if (TM_SUCCESS == ret) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
for (progress = 0 ; progress < 10 ; ++progress) {
|
||||
opal_progress();
|
||||
#if HAVE_SCHED_YIELD
|
||||
sched_yield();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_ERR_RESOURCE_BUSY;
|
||||
}
|
||||
|
||||
|
||||
static int plm_tmd_disconnect(void)
|
||||
{
|
||||
tm_finalize();
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* call this function if the timer fires indicating that one
|
||||
* or more daemons failed to start
|
||||
*/
|
||||
static void failed_start(int fd, short dummy, void *arg)
|
||||
{
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tmd:failed_start",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* if we are aborting, ignore this */
|
||||
if (orte_abnormal_term_ordered) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tmd:failed_start - abnormal term in progress",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
return;
|
||||
}
|
||||
|
||||
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1,
|
||||
ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
}
|
Загрузка…
x
Ссылка в новой задаче
Block a user