Take a step back on the slurm and tm launchers. Problems were occurring in MTT runs that did not appear in non-MTT scenarios. Preserve the modified plm versions in new components that are ompi_ignored until the problems can be resolved.
This will allow better MTT coverage until the problem is better understood.
This commit was SVN r20083.
This commit is contained in:
parent
89792bbc72
Commit
ce4018efeb
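For context on the "ompi_ignored" mechanism referenced in the commit message: Open MPI's autogen step skips any component directory containing a .ompi_ignore marker file, unless a .ompi_unignore file naming the current developer is also present. A minimal sketch of how the new components are parked (the "rhc" entry matches the .ompi_unignore added in this commit; the exact matching rules applied by autogen are an assumption here):

    # hide the component from everyone's builds by default
    touch orte/mca/plm/slurmd/.ompi_ignore
    # re-enable it only for the listed developer(s), one name per line
    echo "rhc" > orte/mca/plm/slurmd/.ompi_unignore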
@@ -101,10 +101,9 @@ orte_plm_base_module_1_0_0_t orte_plm_slurm_module = {
/*
 * Local variables
 */
static pid_t primary_srun_pid = 0;
static bool primary_pid_set = false;
static pid_t srun_pid = 0;
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
static bool launching_daemons;
static bool failed_launch;


/**
@@ -148,8 +147,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
struct timeval launchstart, launchstop;
|
||||
int proc_vpid_index;
|
||||
orte_jobid_t failed_job;
|
||||
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||
bool failed_launch=false;
|
||||
|
||||
/* flag the daemons as failing by default */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
@@ -163,7 +160,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
}
|
||||
|
||||
/* indicate the state of the launch */
|
||||
launching_daemons = true;
|
||||
failed_launch = true;
|
||||
|
||||
/* create a jobid for this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
|
||||
@@ -337,9 +334,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
}
|
||||
}
|
||||
|
||||
/* set the job state to indicate we attempted to launch */
|
||||
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
|
||||
/* setup environment */
|
||||
env = opal_argv_copy(orte_launch_environ);
|
||||
|
||||
@@ -370,10 +364,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
}
|
||||
|
||||
launch_apps:
|
||||
/* get here if daemons launch okay, or no daemons need to be launched - any
|
||||
* failures now are from launching apps
|
||||
*/
|
||||
launching_daemons = false;
|
||||
/* get here if daemons launch okay - any failures now by apps */
|
||||
failed_job = active_job;
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
@@ -415,7 +406,7 @@ cleanup:
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
}
|
||||
|
||||
return rc;
|
||||
@@ -442,10 +433,15 @@ static int plm_slurm_terminate_orteds(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* tell them to die without sending a reply - we will rely on the
|
||||
* waitpid to tell us when they have exited!
|
||||
/* deregister the waitpid callback to ensure we don't make it look like
|
||||
* srun failed when it didn't. Since the srun may have already completed,
|
||||
* do NOT ERROR_LOG any return code to avoid confusing, duplicate error
|
||||
* messages
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_NO_REPLY_CMD))) {
|
||||
orte_wait_cb_cancel(srun_pid);
|
||||
|
||||
/* tell them to die! */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@@ -483,8 +479,6 @@ static int plm_slurm_finalize(void)
|
||||
|
||||
|
||||
static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
orte_job_t *jdata;
|
||||
|
||||
/* According to the SLURM folks, srun always returns the highest exit
|
||||
code of our remote processes. Thus, a non-zero exit status doesn't
|
||||
necessarily mean that srun failed - it could be that an orted returned
|
||||
@@ -505,41 +499,20 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
pid so nobody thinks this is real
|
||||
*/
|
||||
|
||||
/* if we are in the launch phase, then any termination is bad */
|
||||
if (launching_daemons) {
|
||||
/* report that one or more daemons failed to launch so we can exit */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurm: daemon failed during launch",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
} else {
|
||||
/* if this is after launch, then we need to abort only if the status
|
||||
* returned is non-zero - i.e., if the orteds exited with an error
|
||||
*/
|
||||
if (0 != status) {
|
||||
if (0 != status) {
|
||||
if (failed_launch) {
|
||||
/* report that the daemon has failed so we can exit
|
||||
*/
|
||||
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
|
||||
} else {
|
||||
/* an orted must have died unexpectedly after launch - report
|
||||
* that the daemon has failed so we exit
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurm: daemon failed while running",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_ABORTED);
|
||||
}
|
||||
/* otherwise, check to see if this is the primary pid */
|
||||
if (primary_srun_pid == pid) {
|
||||
/* in this case, we just want to fire the proper trigger so
|
||||
* mpirun can exit
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurm: primary daemons complete!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
/* need to set the #terminated value to avoid an incorrect error msg */
|
||||
jdata->num_terminated = jdata->num_procs;
|
||||
orte_trigger_event(&orteds_exit);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -547,7 +520,6 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
|
||||
char *prefix)
|
||||
{
|
||||
int fd;
|
||||
int srun_pid;
|
||||
char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
|
||||
|
||||
if (NULL == exec_argv) {
|
||||
@@ -651,14 +623,6 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
|
||||
/* setup the waitpid so we can find out if srun succeeds! */
|
||||
orte_wait_cb(srun_pid, srun_wait_cb, NULL);
|
||||
free(exec_argv);
|
||||
|
||||
/* if this is the primary launch - i.e., not a comm_spawn of a
|
||||
* child job - then save the pid
|
||||
*/
|
||||
if (!primary_pid_set) {
|
||||
primary_srun_pid = srun_pid;
|
||||
primary_pid_set = true;
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
0  orte/mca/plm/slurmd/.ompi_ignore  Normal file

1  orte/mca/plm/slurmd/.ompi_unignore  Normal file
@@ -0,0 +1 @@
rhc

45  orte/mca/plm/slurmd/Makefile.am  Normal file
@@ -0,0 +1,45 @@
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
sources = \
|
||||
plm_slurmd.h \
|
||||
plm_slurmd_component.c \
|
||||
plm_slurmd_module.c
|
||||
|
||||
dist_pkgdata_DATA = help-plm-slurmd.txt
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_plm_slurmd_DSO
|
||||
component_noinst =
|
||||
component_install = mca_plm_slurmd.la
|
||||
else
|
||||
component_noinst = libmca_plm_slurmd.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_plm_slurmd_la_SOURCES = $(sources)
|
||||
mca_plm_slurmd_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_plm_slurmd_la_SOURCES =$(sources)
|
||||
libmca_plm_slurmd_la_LDFLAGS = -module -avoid-version
|
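The OMPI_BUILD_plm_slurmd_DSO conditional above chooses between installing the component as a loadable module (mca_plm_slurmd.la) or building it as a convenience library (libmca_plm_slurmd.la) for static linking. Which branch is taken is decided by the top-level configure; a hedged sketch of forcing the DSO path (the --enable-mca-dso syntax follows Open MPI's usual configure convention and is shown only as an illustration):

    # request that just this component be built as a run-time loadable DSO
    ./configure --enable-mca-dso=plm-slurmd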
37  orte/mca/plm/slurmd/configure.m4  Normal file
@@ -0,0 +1,37 @@
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_plm_slurmd_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_plm_slurmd_CONFIG],[
|
||||
OMPI_CHECK_SLURM([plm_slurmd], [plm_slurmd_good=1], [plm_slurmd_good=0])
|
||||
|
||||
# if check worked, set wrapper flags if so.
|
||||
# Evaluate succeed / fail
|
||||
AS_IF([test "$plm_slurmd_good" = "1"],
|
||||
[plm_slurmd_WRAPPER_EXTRA_LDFLAGS="$plm_slurmd_LDFLAGS"
|
||||
plm_slurmd_WRAPPER_EXTRA_LIBS="$plm_slurmd_LIBS"
|
||||
$1],
|
||||
[$2])
|
||||
|
||||
# set build flags to use in makefile
|
||||
AC_SUBST([plm_slurmd_CPPFLAGS])
|
||||
AC_SUBST([plm_slurmd_LDFLAGS])
|
||||
AC_SUBST([plm_slurmd_LIBS])
|
||||
])dnl
|
22  orte/mca/plm/slurmd/configure.params  Normal file
@@ -0,0 +1,22 @@
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
PARAM_CONFIG_FILES="Makefile"
|
41  orte/mca/plm/slurmd/help-plm-slurmd.txt  Normal file
@@ -0,0 +1,41 @@
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
[multiple-prefixes]
|
||||
The SLURM process starter for Open MPI does not support multiple
|
||||
different --prefix options to mpirun. You can specify at most one
|
||||
unique value for the --prefix option (in any of the application
|
||||
contexts); it will be applied to all the application contexts of your
|
||||
parallel job.
|
||||
|
||||
Put simply, you must have Open MPI installed in the same location on
|
||||
all of your SLURM nodes.
|
||||
|
||||
Multiple different --prefix options were specified to mpirun. This is
|
||||
a fatal error for the SLURM process starter in Open MPI.
|
||||
|
||||
The first two prefix values supplied were:
|
||||
%s
|
||||
and %s
|
||||
#
|
||||
[no-hosts-in-list]
|
||||
The SLURM process starter for Open MPI didn't find any hosts in
|
||||
the map for this application. This can be caused by a lack of
|
||||
an allocation, or by an error in the Open MPI code. Please check
|
||||
to ensure you have a SLURM allocation. If you do, then please pass
|
||||
the error to the Open MPI user's mailing list for assistance.
|
44  orte/mca/plm/slurmd/plm_slurmd.h  Normal file
@@ -0,0 +1,44 @@
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef ORTE_PLM_SLURMD_EXPORT_H
|
||||
#define ORTE_PLM_SLURMD_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct orte_plm_slurmd_component_t {
|
||||
orte_plm_base_component_t super;
|
||||
char *custom_args;
|
||||
};
|
||||
typedef struct orte_plm_slurmd_component_t orte_plm_slurmd_component_t;
|
||||
|
||||
/*
|
||||
* Globally exported variable
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_plm_slurmd_component_t mca_plm_slurmd_component;
|
||||
ORTE_DECLSPEC extern orte_plm_base_module_t orte_plm_slurmd_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* ORTE_PLM_SLURMD_EXPORT_H */
|
130  orte/mca/plm/slurmd/plm_slurmd_component.c  Normal file
@@ -0,0 +1,130 @@
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "plm_slurmd.h"
|
||||
|
||||
|
||||
/*
|
||||
* Public string showing the plm ompi_slurmd component version number
|
||||
*/
|
||||
const char *mca_plm_slurmd_component_version_string =
|
||||
"Open MPI slurmd plm MCA component version " ORTE_VERSION;
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int plm_slurmd_open(void);
|
||||
static int plm_slurmd_close(void);
|
||||
static int orte_plm_slurmd_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointers to our public functions in it
|
||||
*/
|
||||
|
||||
orte_plm_slurmd_component_t mca_plm_slurmd_component = {
|
||||
|
||||
{
|
||||
/* First, the mca_component_t struct containing meta
|
||||
information about the component itself */
|
||||
|
||||
{
|
||||
ORTE_PLM_BASE_VERSION_2_0_0,
|
||||
|
||||
/* Component name and version */
|
||||
"slurmd",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
plm_slurmd_open,
|
||||
plm_slurmd_close,
|
||||
orte_plm_slurmd_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
}
|
||||
|
||||
/* Other orte_plm_slurmd_component_t items -- left uninitialized
|
||||
here; will be initialized in plm_slurmd_open() */
|
||||
};
|
||||
|
||||
|
||||
static int plm_slurmd_open(void)
|
||||
{
|
||||
mca_base_component_t *comp = &mca_plm_slurmd_component.super.base_version;
|
||||
|
||||
mca_base_param_reg_string(comp, "args",
|
||||
"Custom arguments to srun",
|
||||
false, false, NULL,
|
||||
&mca_plm_slurmd_component.custom_args);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int orte_plm_slurmd_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
/* Are we running under a SLURM job? */
|
||||
|
||||
if (NULL != getenv("SLURM_JOBID")) {
|
||||
*priority = 2;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurmd: available for selection",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
*module = (mca_base_module_t *)&orte_plm_slurmd_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Sadly, no */
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
|
||||
static int plm_slurmd_close(void)
|
||||
{
|
||||
if (NULL != mca_plm_slurmd_component.custom_args) {
|
||||
free(mca_plm_slurmd_component.custom_args);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
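Because the component above only offers itself at priority 2 when SLURM_JOBID is present, a build that also contains the stock slurm component would normally select the latter; exercising this experimental launcher requires choosing it explicitly. A hypothetical invocation (the plm_slurmd_args parameter name follows from the mca_base_param_reg_string call above; the srun flag and application name are examples only):

    # force the experimental slurmd launcher and pass extra arguments to srun
    mpirun --mca plm slurmd --mca plm_slurmd_args "--exclusive" -np 4 ./my_app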
665  orte/mca/plm/slurmd/plm_slurmd_module.c  Normal file
@@ -0,0 +1,665 @@
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <signal.h>
|
||||
#ifdef HAVE_STDLIB_H
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h>
|
||||
#endif
|
||||
#ifdef HAVE_FCNTL_H
|
||||
#include <fcntl.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/installdirs/installdirs.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmaps/rmaps.h"
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "plm_slurmd.h"
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int plm_slurmd_init(void);
|
||||
static int plm_slurmd_launch_job(orte_job_t *jdata);
|
||||
static int plm_slurmd_terminate_job(orte_jobid_t jobid);
|
||||
static int plm_slurmd_terminate_orteds(void);
|
||||
static int plm_slurmd_signal_job(orte_jobid_t jobid, int32_t signal);
|
||||
static int plm_slurmd_finalize(void);
|
||||
|
||||
static int plm_slurmd_start_proc(int argc, char **argv, char **env,
|
||||
char *prefix);
|
||||
|
||||
|
||||
/*
|
||||
* Global variable
|
||||
*/
|
||||
orte_plm_base_module_1_0_0_t orte_plm_slurmd_module = {
|
||||
plm_slurmd_init,
|
||||
orte_plm_base_set_hnp_name,
|
||||
plm_slurmd_launch_job,
|
||||
NULL,
|
||||
plm_slurmd_terminate_job,
|
||||
plm_slurmd_terminate_orteds,
|
||||
plm_slurmd_signal_job,
|
||||
plm_slurmd_finalize
|
||||
};
|
||||
|
||||
/*
|
||||
* Local variables
|
||||
*/
|
||||
static pid_t primary_srun_pid = 0;
|
||||
static bool primary_pid_set = false;
|
||||
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
|
||||
static bool launching_daemons;
|
||||
|
||||
|
||||
/**
|
||||
* Init the module
|
||||
*/
|
||||
static int plm_slurmd_init(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* When working in this function, ALWAYS jump to "cleanup" if
|
||||
* you encounter an error so that orterun will be woken up and
|
||||
* the job can cleanly terminate
|
||||
*/
|
||||
static int plm_slurmd_launch_job(orte_job_t *jdata)
|
||||
{
|
||||
orte_app_context_t **apps;
|
||||
orte_node_t **nodes;
|
||||
orte_std_cntr_t n;
|
||||
orte_job_map_t *map;
|
||||
char *jobid_string = NULL;
|
||||
char *param;
|
||||
char **argv = NULL;
|
||||
int argc;
|
||||
int rc;
|
||||
char *tmp;
|
||||
char** env = NULL;
|
||||
char* var;
|
||||
char *nodelist_flat;
|
||||
char **nodelist_argv;
|
||||
int nodelist_argc;
|
||||
char *name_string;
|
||||
char **custom_strings;
|
||||
int num_args, i;
|
||||
char *cur_prefix;
|
||||
struct timeval launchstart, launchstop;
|
||||
int proc_vpid_index;
|
||||
orte_jobid_t failed_job;
|
||||
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||
bool failed_launch=false;
|
||||
|
||||
/* flag the daemons as failing by default */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
if (orte_timing) {
|
||||
if (0 != gettimeofday(&launchstart, NULL)) {
|
||||
opal_output(0, "plm_slurmd: could not obtain job start time");
|
||||
launchstart.tv_sec = 0;
|
||||
launchstart.tv_usec = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* indicate the state of the launch */
|
||||
launching_daemons = true;
|
||||
|
||||
/* create a jobid for this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurmd: launching job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
|
||||
/* setup the job */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* set the active jobid */
|
||||
active_job = jdata->jobid;
|
||||
|
||||
/* Get the map for this job */
|
||||
if (NULL == (map = orte_rmaps.get_job_map(active_job))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto cleanup;
|
||||
}
|
||||
apps = (orte_app_context_t**)jdata->apps->addr;
|
||||
nodes = (orte_node_t**)map->nodes->addr;
|
||||
|
||||
if (0 == map->num_new_daemons) {
|
||||
/* no new daemons required - just launch apps */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurmd: no new daemons to launch",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
goto launch_apps;
|
||||
}
|
||||
|
||||
/* need integer value for command line parameter */
|
||||
asprintf(&jobid_string, "%lu", (unsigned long) jdata->jobid);
|
||||
|
||||
/*
|
||||
* start building argv array
|
||||
*/
|
||||
argv = NULL;
|
||||
argc = 0;
|
||||
|
||||
/*
|
||||
* SLURM srun OPTIONS
|
||||
*/
|
||||
|
||||
/* add the srun command */
|
||||
opal_argv_append(&argc, &argv, "srun");
|
||||
|
||||
/* Append user defined arguments to srun */
|
||||
if ( NULL != mca_plm_slurmd_component.custom_args ) {
|
||||
custom_strings = opal_argv_split(mca_plm_slurmd_component.custom_args, ' ');
|
||||
num_args = opal_argv_count(custom_strings);
|
||||
for (i = 0; i < num_args; ++i) {
|
||||
opal_argv_append(&argc, &argv, custom_strings[i]);
|
||||
}
|
||||
opal_argv_free(custom_strings);
|
||||
}
|
||||
|
||||
asprintf(&tmp, "--nodes=%lu", (unsigned long) map->num_new_daemons);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
|
||||
asprintf(&tmp, "--ntasks=%lu", (unsigned long) map->num_new_daemons);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
|
||||
/* alert us if any orteds die during startup */
|
||||
opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
|
||||
|
||||
/* create nodelist */
|
||||
nodelist_argv = NULL;
|
||||
nodelist_argc = 0;
|
||||
|
||||
for (n=0; n < map->num_nodes; n++ ) {
|
||||
/* if the daemon already exists on this node, then
|
||||
* don't include it
|
||||
*/
|
||||
if (nodes[n]->daemon_launched) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* otherwise, add it to the list of nodes upon which
|
||||
* we need to launch a daemon
|
||||
*/
|
||||
opal_argv_append(&nodelist_argc, &nodelist_argv, nodes[n]->name);
|
||||
}
|
||||
if (0 == opal_argv_count(nodelist_argv)) {
|
||||
orte_show_help("help-plm-slurmd.txt", "no-hosts-in-list", true);
|
||||
rc = ORTE_ERR_FAILED_TO_START;
|
||||
goto cleanup;
|
||||
}
|
||||
nodelist_flat = opal_argv_join(nodelist_argv, ',');
|
||||
opal_argv_free(nodelist_argv);
|
||||
asprintf(&tmp, "--nodelist=%s", nodelist_flat);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_plm_globals.output,
|
||||
"%s plm:slurmd: launching on nodes %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat));
|
||||
|
||||
/*
|
||||
* ORTED OPTIONS
|
||||
*/
|
||||
|
||||
/* add the daemon command (as specified by user) */
|
||||
orte_plm_base_setup_orted_cmd(&argc, &argv);
|
||||
|
||||
/* Add basic orted command line options, including debug flags */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"slurmd",
|
||||
&proc_vpid_index,
|
||||
false);
|
||||
|
||||
/* tell the new daemons the base of the name list so they can compute
|
||||
* their own name on the other end
|
||||
*/
|
||||
rc = orte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
opal_output(0, "plm_slurmd: unable to get daemon vpid as string");
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
free(argv[proc_vpid_index]);
|
||||
argv[proc_vpid_index] = strdup(name_string);
|
||||
free(name_string);
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurmd: final top-level argv:\n\t%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == param) ? "NULL" : param));
|
||||
if (NULL != param) free(param);
|
||||
}
|
||||
|
||||
/* Copy the prefix-directory specified in the
|
||||
corresponding app_context. If there are multiple,
|
||||
different prefix's in the app context, complain (i.e., only
|
||||
allow one --prefix option for the entire slurmd run -- we
|
||||
don't support different --prefix'es for different nodes in
|
||||
the SLURM plm) */
|
||||
cur_prefix = NULL;
|
||||
for (n=0; n < jdata->num_apps; n++) {
|
||||
char * app_prefix_dir = apps[n]->prefix_dir;
|
||||
/* Check for already set cur_prefix_dir -- if different,
|
||||
complain */
|
||||
if (NULL != app_prefix_dir) {
|
||||
if (NULL != cur_prefix &&
|
||||
0 != strcmp (cur_prefix, app_prefix_dir)) {
|
||||
orte_show_help("help-plm-slurmd.txt", "multiple-prefixes",
|
||||
true, cur_prefix, app_prefix_dir);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
|
||||
/* If not yet set, copy it; iff set, then it's the
|
||||
same anyway */
|
||||
if (NULL == cur_prefix) {
|
||||
cur_prefix = strdup(app_prefix_dir);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurmd: Set prefix:%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
cur_prefix));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* set the job state to indicate we attempted to launch */
|
||||
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
|
||||
/* setup environment */
|
||||
env = opal_argv_copy(orte_launch_environ);
|
||||
|
||||
/* add the nodelist */
|
||||
var = mca_base_param_environ_variable("orte", "slurmd", "nodelist");
|
||||
opal_setenv(var, nodelist_flat, true, &env);
|
||||
free(nodelist_flat);
|
||||
free(var);
|
||||
|
||||
/* exec the daemon(s) */
|
||||
if (ORTE_SUCCESS != (rc = plm_slurmd_start_proc(argc, argv, env, cur_prefix))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* do NOT wait for srun to complete. Srun only completes when the processes
|
||||
* it starts - in this case, the orteds - complete. Instead, we'll catch
|
||||
* any srun failures and deal with them elsewhere
|
||||
*/
|
||||
|
||||
/* wait for daemons to callback */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurmd: daemon launch failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
launch_apps:
|
||||
/* get here if daemons launch okay, or no daemons need to be launched - any
|
||||
* failures now are from launching apps
|
||||
*/
|
||||
launching_daemons = false;
|
||||
failed_job = active_job;
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurmd: launch of apps failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* declare the launch a success */
|
||||
failed_launch = false;
|
||||
|
||||
if (orte_timing) {
|
||||
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||
opal_output(0, "plm_slurmd: could not obtain stop time");
|
||||
} else {
|
||||
opal_output(0, "plm_slurmd: total job launch time is %ld usec",
|
||||
(launchstop.tv_sec - launchstart.tv_sec)*1000000 +
|
||||
(launchstop.tv_usec - launchstart.tv_usec));
|
||||
}
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
opal_output(0, "plm:slurmd: start_procs returned error %d", rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
if (NULL != argv) {
|
||||
opal_argv_free(argv);
|
||||
}
|
||||
if (NULL != env) {
|
||||
opal_argv_free(env);
|
||||
}
|
||||
|
||||
if(NULL != jobid_string) {
|
||||
free(jobid_string);
|
||||
}
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
static int plm_slurmd_terminate_job(orte_jobid_t jobid)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* order them to kill their local procs for this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Terminate the orteds for a given job
|
||||
*/
|
||||
static int plm_slurmd_terminate_orteds(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* tell them to die without sending a reply - we will rely on the
|
||||
* waitpid to tell us when they have exited!
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_NO_REPLY_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Signal all the processes in the child srun by sending the signal directly to it
|
||||
*/
|
||||
static int plm_slurmd_signal_job(orte_jobid_t jobid, int32_t signal)
|
||||
{
|
||||
int rc = ORTE_SUCCESS;
|
||||
|
||||
/* order them to pass this signal to their local procs */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_signal_local_procs(jobid, signal))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
static int plm_slurmd_finalize(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* cleanup any pending recvs */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
orte_job_t *jdata;
|
||||
|
||||
/* According to the SLURM folks, srun always returns the highest exit
|
||||
code of our remote processes. Thus, a non-zero exit status doesn't
|
||||
necessarily mean that srun failed - it could be that an orted returned
|
||||
a non-zero exit status. Of course, that means the orted failed(!), so
|
||||
the end result is the same - the job didn't start.
|
||||
|
||||
As a result, we really can't do much with the exit status itself - it
|
||||
could be something in errno (if srun itself failed), or it could be
|
||||
something returned by an orted, or it could be something returned by
|
||||
the OS (e.g., couldn't find the orted binary). Somebody is welcome
|
||||
to sort out all the options and pretty-print a better error message. For
|
||||
now, though, the only thing that really matters is that
|
||||
srun failed. Report the error and make sure that orterun
|
||||
wakes up - otherwise, do nothing!
|
||||
|
||||
Unfortunately, the pid returned here is the srun pid, not the pid of
|
||||
the proc that actually died! So, to avoid confusion, just use -1 as the
|
||||
pid so nobody thinks this is real
|
||||
*/
|
||||
|
||||
/* if we are in the launch phase, then any termination is bad */
|
||||
if (launching_daemons) {
|
||||
/* report that one or more daemons failed to launch so we can exit */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurmd: daemon failed during launch",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
} else {
|
||||
/* if this is after launch, then we need to abort only if the status
|
||||
* returned is non-zero - i.e., if the orteds exited with an error
|
||||
*/
|
||||
if (0 != status) {
|
||||
/* an orted must have died unexpectedly after launch - report
|
||||
* that the daemon has failed so we exit
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurmd: daemon failed while running",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_ABORTED);
|
||||
}
|
||||
/* otherwise, check to see if this is the primary pid */
|
||||
if (primary_srun_pid == pid) {
|
||||
/* in this case, we just want to fire the proper trigger so
|
||||
* mpirun can exit
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurmd: primary daemons complete!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
/* need to set the #terminated value to avoid an incorrect error msg */
|
||||
jdata->num_terminated = jdata->num_procs;
|
||||
orte_trigger_event(&orteds_exit);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static int plm_slurmd_start_proc(int argc, char **argv, char **env,
|
||||
char *prefix)
|
||||
{
|
||||
int fd;
|
||||
int srun_pid;
|
||||
char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
|
||||
|
||||
if (NULL == exec_argv) {
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
srun_pid = fork();
|
||||
if (-1 == srun_pid) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
|
||||
free(exec_argv);
|
||||
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||
}
|
||||
|
||||
if (0 == srun_pid) { /* child */
|
||||
char *bin_base = NULL, *lib_base = NULL;
|
||||
|
||||
/* Figure out the basenames for the libdir and bindir. There
|
||||
is a lengthy comment about this in plm_rsh_module.c
|
||||
explaining all the rationale for how / why we're doing
|
||||
this. */
|
||||
|
||||
lib_base = opal_basename(opal_install_dirs.libdir);
|
||||
bin_base = opal_basename(opal_install_dirs.bindir);
|
||||
|
||||
/* If we have a prefix, then modify the PATH and
|
||||
LD_LIBRARY_PATH environment variables. */
|
||||
if (NULL != prefix) {
|
||||
char *oldenv, *newenv;
|
||||
|
||||
/* Reset PATH */
|
||||
oldenv = getenv("PATH");
|
||||
if (NULL != oldenv) {
|
||||
asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);
|
||||
} else {
|
||||
asprintf(&newenv, "%s/%s", prefix, bin_base);
|
||||
}
|
||||
opal_setenv("PATH", newenv, true, &env);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurmd: reset PATH: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
newenv));
|
||||
free(newenv);
|
||||
|
||||
/* Reset LD_LIBRARY_PATH */
|
||||
oldenv = getenv("LD_LIBRARY_PATH");
|
||||
if (NULL != oldenv) {
|
||||
asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv);
|
||||
} else {
|
||||
asprintf(&newenv, "%s/%s", prefix, lib_base);
|
||||
}
|
||||
opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurmd: reset LD_LIBRARY_PATH: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
newenv));
|
||||
free(newenv);
|
||||
}
|
||||
|
||||
fd = open("/dev/null", O_CREAT|O_WRONLY|O_TRUNC, 0666);
|
||||
if(fd > 0) {
|
||||
dup2(fd, 0);
|
||||
}
|
||||
|
||||
/* When not in debug mode and --debug-daemons was not passed,
|
||||
* tie stdout/stderr to dev null so we don't see messages from orted
|
||||
* EXCEPT if the user has requested that we leave sessions attached
|
||||
*/
|
||||
if (0 >= opal_output_get_verbosity(orte_plm_globals.output) &&
|
||||
!orte_debug_daemons_flag && !orte_leave_session_attached) {
|
||||
if (fd >= 0) {
|
||||
if (fd != 1) {
|
||||
dup2(fd,1);
|
||||
}
|
||||
if (fd != 2) {
|
||||
dup2(fd,2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (fd > 2) {
|
||||
close(fd);
|
||||
}
|
||||
|
||||
/* get the srun process out of orterun's process group so that
|
||||
signals sent from the shell (like those resulting from
|
||||
cntl-c) don't get sent to srun */
|
||||
setpgid(0, 0);
|
||||
|
||||
execve(exec_argv, argv, env);
|
||||
|
||||
opal_output(0, "plm:slurmd:start_proc: exec failed");
|
||||
/* don't return - need to exit - returning would be bad -
|
||||
we're not in the calling process anymore */
|
||||
exit(1);
|
||||
} else { /* parent */
|
||||
/* just in case, make sure that the srun process is not in our
|
||||
process group any more. Stevens says always do this on both
|
||||
sides of the fork... */
|
||||
setpgid(srun_pid, srun_pid);
|
||||
|
||||
/* setup the waitpid so we can find out if srun succeeds! */
|
||||
orte_wait_cb(srun_pid, srun_wait_cb, NULL);
|
||||
free(exec_argv);
|
||||
|
||||
/* if this is the primary launch - i.e., not a comm_spawn of a
|
||||
* child job - then save the pid
|
||||
*/
|
||||
if (!primary_pid_set) {
|
||||
primary_srun_pid = srun_pid;
|
||||
primary_pid_set = true;
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@@ -85,19 +85,13 @@ static int plm_tm_signal_job(orte_jobid_t jobid, int32_t signal);
|
||||
static int plm_tm_finalize(void);
|
||||
|
||||
static int plm_tm_connect(void);
|
||||
static int plm_tm_disconnect(void);
|
||||
static void failed_start(int fd, short event, void *arg);
|
||||
static int obit_submit(int tid);
|
||||
|
||||
/*
|
||||
* Local "global" variables
|
||||
*/
|
||||
static opal_event_t *ev=NULL;
|
||||
static bool connected;
|
||||
static tm_event_t *events_spawn = NULL;
|
||||
static tm_event_t *events_obit = NULL;
|
||||
static tm_task_id *tm_task_ids = NULL;
|
||||
static int *evs = NULL;
|
||||
static bool time_is_up;
|
||||
|
||||
/*
|
||||
* Global variable
|
||||
@@ -113,20 +107,6 @@ orte_plm_base_module_t orte_plm_tm_module = {
|
||||
plm_tm_finalize
|
||||
};
|
||||
|
||||
/* catch timeout to allow cmds to progress */
|
||||
static void timer_cb(int fd, short event, void *cbdata)
|
||||
{
|
||||
opal_event_t *ev = (opal_event_t*)cbdata;
|
||||
|
||||
/* free event */
|
||||
if (NULL != ev) {
|
||||
free(ev);
|
||||
}
|
||||
/* declare time is up */
|
||||
time_is_up = true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Init the module
|
||||
*/
|
||||
@@ -147,7 +127,6 @@ static int plm_tm_init(void)
|
||||
*/
|
||||
static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
{
|
||||
orte_job_t *jdatorted;
|
||||
orte_job_map_t *map = NULL;
|
||||
orte_app_context_t **apps;
|
||||
orte_node_t **nodes;
|
||||
@@ -158,23 +137,20 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
char **argv = NULL;
|
||||
int argc = 0;
|
||||
int rc;
|
||||
bool connected = false;
|
||||
orte_std_cntr_t launched = 0, i;
|
||||
char *bin_base = NULL, *lib_base = NULL;
|
||||
tm_event_t *tm_events = NULL;
|
||||
tm_task_id *tm_task_ids = NULL;
|
||||
int local_err;
|
||||
tm_event_t event;
|
||||
bool failed_launch = true;
|
||||
mode_t current_umask;
|
||||
orte_jobid_t failed_job;
|
||||
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||
int offset;
|
||||
tm_event_t eventpolled;
|
||||
orte_std_cntr_t num_daemons;
|
||||
opal_event_t *timerev;
|
||||
int j;
|
||||
|
||||
/* default to declaring the daemons as failed */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
connected = false;
|
||||
|
||||
|
||||
/* create a jobid for this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@@ -206,107 +182,20 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
goto launch_apps;
|
||||
}
|
||||
|
||||
/* lookup the daemon job object - must do this -after- the job is
|
||||
* setup so the number of required daemons has been updated
|
||||
*/
|
||||
if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
/* Allocate a bunch of TM events to use for tm_spawn()ing */
|
||||
tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons);
|
||||
if (NULL == tm_events) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
num_daemons = jdatorted->num_procs - 1; /* do not include myself as I am already here! */
|
||||
if (0 >= num_daemons) {
|
||||
/* this won't work */
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Allocate a bunch of TM events to use */
|
||||
if (NULL == events_spawn) {
|
||||
/* spawn events for first launch */
|
||||
events_spawn = (tm_event_t*)malloc(num_daemons * sizeof(tm_event_t));
|
||||
if (NULL == events_spawn) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
} else {
|
||||
/* comm_spawn launch */
|
||||
events_spawn = (tm_event_t*)realloc(events_spawn, sizeof(tm_event_t) * num_daemons);
|
||||
if (NULL == events_spawn) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
}
|
||||
if (NULL == events_obit) {
|
||||
/* obit events for first launch */
|
||||
events_obit = (tm_event_t*)malloc(num_daemons * sizeof(tm_event_t));
|
||||
if (NULL == events_obit) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
} else {
|
||||
/* comm_spawn launch */
|
||||
events_obit = (tm_event_t*)realloc(events_obit, sizeof(tm_event_t) * num_daemons);
|
||||
if (NULL == events_obit) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
}
|
||||
if (NULL == evs) {
|
||||
/* evs for first launch */
|
||||
evs = (int*)malloc(num_daemons * sizeof(tm_event_t));
|
||||
if (NULL == evs) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
} else {
|
||||
/* comm_spawn launch */
|
||||
evs = (int*)realloc(evs, sizeof(int) * num_daemons);
|
||||
if (NULL == evs) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* allocate task ids for the orteds */
|
||||
tm_task_ids = malloc(sizeof(tm_task_id) * map->num_new_daemons);
|
||||
if (NULL == tm_task_ids) {
|
||||
/* first launch */
|
||||
tm_task_ids = (tm_task_id*)malloc(num_daemons * sizeof(tm_task_id));
|
||||
if (NULL == tm_task_ids) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
} else {
|
||||
/* comm_spawn launch */
|
||||
tm_task_ids = (tm_task_id*)realloc(tm_task_ids, sizeof(tm_task_id) * num_daemons);
|
||||
if (NULL == tm_task_ids) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* compute the offset into the event/task arrays */
|
||||
offset = num_daemons - map->num_new_daemons;
|
||||
|
||||
/* initialize them */
|
||||
for (i=0; i < map->num_new_daemons; i++) {
|
||||
*(tm_task_ids + offset + i) = TM_NULL_TASK;
|
||||
*(events_spawn + offset + i) = TM_NULL_EVENT;
|
||||
*(events_obit + offset + i) = TM_NULL_EVENT;
|
||||
*(evs + offset + i) = 0;
|
||||
}
|
||||
|
||||
/* add the daemon command (as specified by user) */
|
||||
orte_plm_base_setup_orted_cmd(&argc, &argv);
|
||||
|
||||
@@ -382,9 +271,6 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
}
|
||||
}
|
||||
|
||||
/* set the job state to indicate we attempted to launch */
|
||||
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
|
||||
/* Iterate through each of the nodes and spin
|
||||
* up a daemon.
|
||||
*/
|
||||
@@ -406,7 +292,7 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
rc = orte_util_convert_vpid_to_string(&vpid_string, nodes[i]->daemon->name.vpid);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
opal_output(0, "plm:tm: unable to get daemon vpid as string");
|
||||
goto cleanup;
|
||||
exit(-1);
|
||||
}
|
||||
free(argv[proc_vpid_index]);
|
||||
argv[proc_vpid_index] = strdup(vpid_string);
|
||||
@@ -422,7 +308,7 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
if (NULL != param) free(param);
|
||||
}
|
||||
|
||||
rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + offset + launched, events_spawn + offset + launched);
|
||||
rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched);
|
||||
if (TM_SUCCESS != rc) {
|
||||
orte_show_help("help-plm-tm.txt", "tm-spawn-failed",
|
||||
true, argv[0], node->name, node->launch_id);
|
||||
@@ -440,54 +326,14 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
"%s plm:tm:launch: finished spawning orteds",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* setup a timer to give the cmd a chance to be sent */
|
||||
time_is_up = false;
|
||||
ORTE_DETECT_TIMEOUT(&timerev, launched,
|
||||
100, -1, timer_cb);
|
||||
|
||||
ORTE_PROGRESSED_WAIT(time_is_up, 0, 1);
|
||||
|
||||
/* TM poll for all the spawns */
|
||||
while (0 < launched) {
|
||||
rc = tm_poll(TM_NULL_EVENT, &eventpolled, (int)false, &local_err);
|
||||
for (i = 0; i < launched; ++i) {
|
||||
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
|
||||
if (TM_SUCCESS != rc) {
|
||||
opal_output(0, "plm:tm: event poll for spawned daemon failed, return status = %d", rc);
|
||||
rc = ORTE_ERROR;
|
||||
errno = local_err;
|
||||
opal_output(0, "plm:tm: failed to poll for a spawned daemon, return status = %d", rc);
|
||||
goto cleanup;
|
||||
}
|
||||
/* if we get back the NULL event, then just continue */
|
||||
if (eventpolled == TM_NULL_EVENT) {
|
||||
continue;
|
||||
}
|
||||
/* look for the spawned event */
|
||||
for (j=0; j < map->num_new_daemons; j++) {
|
||||
if (eventpolled == *(events_spawn + offset + j)) {
|
||||
/* got the event - check returned code */
|
||||
if (local_err) {
|
||||
/* this orted failed to launch! */
|
||||
orte_show_help("help-plm-tm.txt", "tm-spawn-failed",
|
||||
true, argv[0], nodes[j]->name, nodes[j]->launch_id);
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
/* register the corresponding obit so we can detect when this
|
||||
* orted terminates
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = obit_submit(offset+j))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
/* all done with this event */
|
||||
goto MOVEON;
|
||||
}
|
||||
}
|
||||
/* if we get here, then we failed to find the event */
|
||||
opal_output(0, "TM FAILED TO FIND SPAWN EVENT WHEN LAUNCHING");
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
|
||||
MOVEON:
|
||||
launched--;
|
||||
}
|
||||
|
||||
/* set a timer to tell us if one or more daemon's fails to start - use the
|
||||
@@ -542,6 +388,16 @@ launch_apps:
|
||||
opal_argv_free(env);
|
||||
}
|
||||
|
||||
if (connected) {
|
||||
plm_tm_disconnect();
|
||||
}
|
||||
if (NULL != tm_events) {
|
||||
free(tm_events);
|
||||
}
|
||||
if (NULL != tm_task_ids) {
|
||||
free(tm_task_ids);
|
||||
}
|
||||
|
||||
if (NULL != lib_base) {
|
||||
free(lib_base);
|
||||
}
|
||||
@@ -551,7 +407,7 @@ launch_apps:
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
}
|
||||
|
||||
/* setup a "heartbeat" timer to periodically check on
|
||||
@@ -582,14 +438,6 @@ static int plm_tm_terminate_job(orte_jobid_t jobid)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* quick timeout loop */
|
||||
static bool timer_fired;
|
||||
|
||||
static void quicktime_cb(int fd, short event, void *cbdata)
|
||||
{
|
||||
/* declare it fired */
|
||||
timer_fired = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Terminate the orteds for a given job
|
||||
@@ -597,143 +445,12 @@ static void quicktime_cb(int fd, short event, void *cbdata)
|
||||
int plm_tm_terminate_orteds(void)
|
||||
{
|
||||
int rc;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t **daemons;
|
||||
tm_event_t eventpolled;
|
||||
orte_vpid_t j, alive;
|
||||
int local_err;
|
||||
opal_event_t *timerev=NULL;
|
||||
opal_event_t *quicktime=NULL;
|
||||
struct timeval quicktimeval;
|
||||
bool aborted;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm: terminating orteds",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* lookup the daemon job object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
}
|
||||
alive = jdata->num_procs - 1; /* do not include myself! */
|
||||
daemons = (orte_proc_t**)jdata->procs->addr;
|
||||
aborted = false;
|
||||
|
||||
/* tell them to die! */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_NO_REPLY_CMD))) {
|
||||
/* now tell them to die! */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* if there are more than just me... */
|
||||
    if (0 < alive) {
        /* setup a max time for the daemons to die */
        time_is_up = false;
        ORTE_DETECT_TIMEOUT(&timerev, alive,
                            1000000, 60000000, timer_cb);

        /* give the cmds a chance to get out */
        quicktimeval.tv_sec = 0;
        quicktimeval.tv_usec = 100;
        timer_fired = false;
        ORTE_DETECT_TIMEOUT(&quicktime, alive, 1000, 10000, quicktime_cb);
        ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);

        /* now begin polling to see if daemons have terminated */
        while (!time_is_up && 0 < alive) {
            OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
                                 "%s plm:tm: polling for daemon termination",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            rc = tm_poll(TM_NULL_EVENT, &eventpolled, (int)false, &local_err);
            if (TM_SUCCESS != rc) {
                errno = local_err;
                opal_output(0, "plm:tm: event poll for daemon termination failed, return status = %d", rc);
                continue; /* we will wait for timeout to tell us to quit */
            }
            /* if we get back the NULL event, then just continue */
            if (eventpolled == TM_NULL_EVENT) {
                OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
                                     "%s plm:tm: got null event",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                /* give system a little time to progress */
                timer_fired = false;
                opal_evtimer_add(quicktime, &quicktimeval);
                ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
                continue;
            }
            /* look for the obit event */
            for (j=0; j < jdata->num_procs-1; j++) {
                if (eventpolled == *(events_obit + j)) {
                    /* got the event - check returned code */
                    if (local_err == TM_ESYSTEM) {
                        OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
                                             "%s plm:tm: got TM_ESYSTEM on obit - resubmitting",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                        if (ORTE_SUCCESS != (rc = obit_submit(j))) {
                            ORTE_ERROR_LOG(rc);
                            goto MOVEON;
                        }
                        /* give system a little time to progress */
                        timer_fired = false;
                        opal_evtimer_add(quicktime, &quicktimeval);
                        ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
                    }
                    if (0 != local_err) {
                        OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
                                             "%s plm:tm: got error %d on obit for task %d",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), local_err, j));
                        rc = ORTE_ERROR;
                        goto MOVEON;
                    }
                    /* this daemon has terminated */
                    *(tm_task_ids+j) = TM_NULL_TASK;
                    *(events_obit+j) = TM_NULL_EVENT;
                    OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
                                         "%s plm:tm: task %d exited with status %d",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j, *(evs+j)));
                    /* update the termination status for this daemon */
                    daemons[j+1]->exit_code = *(evs+j);
                    if (0 != daemons[j+1]->exit_code) {
                        daemons[j+1]->state = ORTE_PROC_STATE_ABORTED;
                        aborted = true;
                    } else {
                        daemons[j+1]->state = ORTE_PROC_STATE_TERMINATED;
                    }
                    jdata->num_terminated++;
                    /* all done with this event */
                    goto MOVEON;
                }
            }
            /* if we get here, then we failed to find the event */
            opal_output(0, "TM FAILED TO FIND OBIT EVENT");

        MOVEON:
            alive--;
        }

        /* release event if not already done */
        if (NULL != quicktime) {
            free(quicktime);
        }
        if (NULL != timerev) {
            opal_event_del(timerev);
            free(timerev);
        }
    } else {
        /* still need to give the cmds a chance to get out so I can process
         * them myself!
         */
        timer_fired = false;
        ORTE_DETECT_TIMEOUT(&quicktime, 1, 1000, 10000, quicktime_cb);
        ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
    }

    /* declare the daemons done */
    if (aborted || 0 < alive) {
        jdata->state = ORTE_JOB_STATE_ABORTED;
    } else {
        jdata->state = ORTE_JOB_STATE_TERMINATED;
    }
    orte_trigger_event(&orteds_exit);
    return rc;
}

@ -762,24 +479,6 @@ static int plm_tm_finalize(void)
        ORTE_ERROR_LOG(rc);
    }

    if (connected) {
        tm_finalize();
    }

    /* cleanup data arrays */
    if (NULL != events_spawn) {
        free(events_spawn);
    }
    if (NULL != events_obit) {
        free(events_obit);
    }
    if (NULL != tm_task_ids) {
        free(tm_task_ids);
    }
    if (NULL != evs) {
        free(evs);
    }

    return ORTE_SUCCESS;
}

@ -810,6 +509,13 @@ static int plm_tm_connect(void)
}


static int plm_tm_disconnect(void)
{
    tm_finalize();

    return ORTE_SUCCESS;
}

/* call this function if the timer fires indicating that one
 * or more daemons failed to start
 */
@ -830,21 +536,3 @@ static void failed_start(int fd, short dummy, void *arg)
    orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1,
                                ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
}

static int obit_submit(int tid)
{
    int rc;

    if (TM_SUCCESS != (rc = tm_obit(*(tm_task_ids+tid), evs+tid, events_obit+tid))) {
        opal_output(0, "failed to register termination notice for task %d", tid);
        rc = ORTE_ERROR;
        return rc;
    }
    if (*(events_obit+tid) == TM_NULL_EVENT) {
        opal_output(0, "task %d is already dead", tid);
    } else if (*(events_obit+tid) == TM_ERROR_EVENT) {
        opal_output(0, "Error on obit return - got error event for task %d", tid);
    }

    return ORTE_SUCCESS;
}
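
For reference, the obit/poll handshake that the termination loop above relies on reduces to the following standalone sketch of the generic TM API pattern (illustrative only; NUM_TASKS and wait_for_all_obits are placeholder names, and the task ids are assumed to come from earlier tm_spawn() calls):

/* Illustrative sketch of the TM obit/poll pattern: register a termination
 * notice (obit) for each task, then poll until every obit event returns. */
#include <stdio.h>
#include <tm.h>

#define NUM_TASKS 4   /* placeholder */

static int wait_for_all_obits(tm_task_id tasks[NUM_TASKS])
{
    tm_event_t obits[NUM_TASKS], polled;
    int status[NUM_TASKS], tm_err, i, remaining = NUM_TASKS;

    /* register an obit for every task */
    for (i = 0; i < NUM_TASKS; i++) {
        if (TM_SUCCESS != tm_obit(tasks[i], &status[i], &obits[i])) {
            return -1;
        }
    }
    /* non-blocking poll until each registered obit event has come back */
    while (remaining > 0) {
        if (TM_SUCCESS != tm_poll(TM_NULL_EVENT, &polled, 0, &tm_err)) {
            return -1;
        }
        if (TM_NULL_EVENT == polled) {
            continue;   /* nothing ready yet - keep polling */
        }
        for (i = 0; i < NUM_TASKS; i++) {
            if (polled == obits[i]) {
                printf("task %d exited with status %d\n", i, status[i]);
                remaining--;
                break;
            }
        }
    }
    return 0;
}
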
@ -30,7 +30,6 @@ BEGIN_C_DECLS
struct orte_plm_tmd_component_t {
    orte_plm_base_component_t super;
    bool want_path_check;
    char *orted;
    char **checked_paths;
};
typedef struct orte_plm_tmd_component_t orte_plm_tmd_component_t;
@ -41,4 +40,4 @@ extern orte_plm_base_module_t orte_plm_tmd_module;

END_C_DECLS

#endif /* ORTE_PLM_TM_EXPORT_H */
#endif /* ORTE_PLM_TMD_EXPORT_H */

@ -92,12 +92,8 @@ static int plm_tmd_open(void)
    int tmp;
    mca_base_component_t *comp = &mca_plm_tmd_component.super.base_version;

    mca_base_param_reg_string(comp, "orted",
                              "Command to use to start proxy orted",
                              false, false, "orted",
                              &mca_plm_tmd_component.orted);
    mca_base_param_reg_int(comp, "want_path_check",
                           "Whether the launching process should check for the plm_tmd_orted executable in the PATH before launching (the TM API does not give an idication of failure; this is a somewhat-lame workaround; non-zero values enable this check)",
                           "Whether the launching process should check for the plm_tmd_orted executable in the PATH before launching (the TM API does not give an indication of failure; this is a somewhat-lame workaround; non-zero values enable this check)",
                           false, false, (int) true, &tmp);
    mca_plm_tmd_component.want_path_check = OPAL_INT_TO_BOOL(tmp);

@ -124,7 +120,7 @@ static int orte_plm_tmd_component_query(mca_base_module_t **module, int *priorit
    if (NULL != getenv("PBS_ENVIRONMENT") &&
        NULL != getenv("PBS_JOBID")) {

        *priority = 1;
        *priority = 2;
        *module = (mca_base_module_t *) &orte_plm_tmd_module;
        return ORTE_SUCCESS;
    }

@ -85,13 +85,19 @@ static int plm_tmd_signal_job(orte_jobid_t jobid, int32_t signal);
static int plm_tmd_finalize(void);

static int plm_tmd_connect(void);
static int plm_tmd_disconnect(void);
static void failed_start(int fd, short event, void *arg);
static int obit_submit(int tid);

/*
 * Local "global" variables
 */
static opal_event_t *ev=NULL;
static bool connected;
static tm_event_t *events_spawn = NULL;
static tm_event_t *events_obit = NULL;
static tm_task_id *tm_task_ids = NULL;
static int *evs = NULL;
static bool time_is_up;

/*
 * Global variable
@ -107,6 +113,20 @@ orte_plm_base_module_t orte_plm_tmd_module = {
    plm_tmd_finalize
};

/* catch timeout to allow cmds to progress */
static void timer_cb(int fd, short event, void *cbdata)
{
    opal_event_t *ev = (opal_event_t*)cbdata;

    /* free event */
    if (NULL != ev) {
        free(ev);
    }
    /* declare time is up */
    time_is_up = true;
}


/**
 * Init the module
 */
@ -127,6 +147,7 @@ static int plm_tmd_init(void)
 */
static int plm_tmd_launch_job(orte_job_t *jdata)
{
    orte_job_t *jdatorted;
    orte_job_map_t *map = NULL;
    orte_app_context_t **apps;
    orte_node_t **nodes;
@ -135,22 +156,25 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
    char **env = NULL;
    char *var;
    char **argv = NULL;
    int argc;
    int argc = 0;
    int rc;
    bool connected = false;
    orte_std_cntr_t launched = 0, i;
    char *bin_base = NULL, *lib_base = NULL;
    tm_event_t *tm_events = NULL;
    tm_task_id *tm_task_ids = NULL;
    int local_err;
    tm_event_t event;
    bool failed_launch = true;
    mode_t current_umask;
    orte_jobid_t failed_job;
    orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
    int offset;
    tm_event_t eventpolled;
    orte_std_cntr_t num_daemons;
    opal_event_t *timerev;
    int j;

    /* default to declaring the daemons as failed */
    failed_job = ORTE_PROC_MY_NAME->jobid;

    connected = false;

    /* create a jobid for this job */
    if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
        ORTE_ERROR_LOG(rc);
@ -158,7 +182,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:tmd: launching job %s",
                         "%s plm:tm: launching job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

@ -182,23 +206,109 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
        goto launch_apps;
    }

    /* Allocate a bunch of TM events to use for tm_spawn()ing */
    tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons);
    if (NULL == tm_events) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
    /* lookup the daemon job object - must do this -after- the job is
     * setup so the number of required daemons has been updated
     */
    if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }
    tm_task_ids = malloc(sizeof(tm_task_id) * map->num_new_daemons);
    if (NULL == tm_task_ids) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
    num_daemons = jdatorted->num_procs - 1; /* do not include myself as I am already here! */
    if (0 >= num_daemons) {
        /* this won't work */
        rc = ORTE_ERR_BAD_PARAM;
        goto cleanup;
    }

    /* Allocate a bunch of TM events to use */
    if (NULL == events_spawn) {
        /* spawn events for first launch */
        events_spawn = (tm_event_t*)malloc(num_daemons * sizeof(tm_event_t));
        if (NULL == events_spawn) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    } else {
        /* comm_spawn launch */
        events_spawn = (tm_event_t*)realloc(events_spawn, sizeof(tm_event_t) * num_daemons);
        if (NULL == events_spawn) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

    }
    if (NULL == events_obit) {
        /* obit events for first launch */
        events_obit = (tm_event_t*)malloc(num_daemons * sizeof(tm_event_t));
        if (NULL == events_obit) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    } else {
        /* comm_spawn launch */
        events_obit = (tm_event_t*)realloc(events_obit, sizeof(tm_event_t) * num_daemons);
        if (NULL == events_obit) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

    }
    if (NULL == evs) {
        /* evs for first launch */
        evs = (int*)malloc(num_daemons * sizeof(tm_event_t));
        if (NULL == evs) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    } else {
        /* comm_spawn launch */
        evs = (int*)realloc(evs, sizeof(int) * num_daemons);
        if (NULL == evs) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

    }

    /* allocate task ids for the orteds */
    if (NULL == tm_task_ids) {
        /* first launch */
        tm_task_ids = (tm_task_id*)malloc(num_daemons * sizeof(tm_task_id));
        if (NULL == tm_task_ids) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    } else {
        /* comm_spawn launch */
        tm_task_ids = (tm_task_id*)realloc(tm_task_ids, sizeof(tm_task_id) * num_daemons);
        if (NULL == tm_task_ids) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    }
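
One design note on the grow-on-comm_spawn pattern above: assigning realloc()'s result back to the same pointer loses the original block if the allocation fails. A safer variant keeps the old pointer until the new one is known to be valid, along the lines of this sketch (illustrative only; grow_int_array is a hypothetical helper, not part of this code):

#include <stdlib.h>

/* Grow an int array to new_count elements without losing the original
 * block when realloc() fails; the caller can still free *array on error. */
static int grow_int_array(int **array, size_t new_count)
{
    int *tmp = (int*)realloc(*array, new_count * sizeof(int));
    if (NULL == tmp) {
        return -1;   /* *array is still valid */
    }
    *array = tmp;
    return 0;
}
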

    /* compute the offset into the event/task arrays */
    offset = num_daemons - map->num_new_daemons;

    /* initialize them */
    for (i=0; i < map->num_new_daemons; i++) {
        *(tm_task_ids + offset + i) = TM_NULL_TASK;
        *(events_spawn + offset + i) = TM_NULL_EVENT;
        *(events_obit + offset + i) = TM_NULL_EVENT;
        *(evs + offset + i) = 0;
    }

    /* add the daemon command (as specified by user) */
    argv = opal_argv_split(mca_plm_tmd_component.orted, ' ');
    argc = opal_argv_count(argv);
    orte_plm_base_setup_orted_cmd(&argc, &argv);

    /* Add basic orted command line options */
    orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
@ -208,7 +318,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
    if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:tmd: final top-level argv:\n\t%s",
                             "%s plm:tm: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) free(param);
@ -251,7 +361,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
                asprintf(&newenv, "%s/%s:%s",
                         apps[0]->prefix_dir, bin_base, env[i] + 5);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                     "%s plm:tmd: resetting PATH: %s",
                                     "%s plm:tm: resetting PATH: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     newenv));
                opal_setenv("PATH", newenv, true, &env);
@ -263,7 +373,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
                asprintf(&newenv, "%s/%s:%s",
                         apps[0]->prefix_dir, lib_base, env[i] + 16);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                     "%s plm:tmd: resetting LD_LIBRARY_PATH: %s",
                                     "%s plm:tm: resetting LD_LIBRARY_PATH: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     newenv));
                opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
@ -272,13 +382,8 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
        }
    }

    /* For this launch module, we encode all the required launch info
     * in the daemon's environment. This includes the nidmap for the
     * daemons, as well as the app_contexts and the map of ranks vs
     * nodes
     */

    /* encode the nidmap */
    /* set the job state to indicate we attempted to launch */
    job_state = ORTE_JOB_STATE_FAILED_TO_START;

    /* Iterate through each of the nodes and spin
     * up a daemon.
@ -293,15 +398,15 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
        }

        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:tmd: launching on node %s",
                             "%s plm:tm: launching on node %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             node->name));

        /* setup process name */
        rc = orte_util_convert_vpid_to_string(&vpid_string, nodes[i]->daemon->name.vpid);
        if (ORTE_SUCCESS != rc) {
            opal_output(0, "plm:tmd: unable to get daemon vpid as string");
            exit(-1);
            opal_output(0, "plm:tm: unable to get daemon vpid as string");
            goto cleanup;
        }
        free(argv[proc_vpid_index]);
        argv[proc_vpid_index] = strdup(vpid_string);
@ -311,15 +416,15 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
        if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
            param = opal_argv_join(argv, ' ');
            OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                 "%s plm:tmd: executing:\n\t%s",
                                 "%s plm:tm: executing:\n\t%s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (NULL == param) ? "NULL" : param));
            if (NULL != param) free(param);
        }

        rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched);
        rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + offset + launched, events_spawn + offset + launched);
        if (TM_SUCCESS != rc) {
            orte_show_help("help-plm-tmd.txt", "tmd-spawn-failed",
            orte_show_help("help-plm-tm.txt", "tm-spawn-failed",
                           true, argv[0], node->name, node->launch_id);
            rc = ORTE_ERROR;
            goto cleanup;
@ -332,17 +437,57 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:tmd:launch: finished spawning orteds",
                         "%s plm:tm:launch: finished spawning orteds",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* setup a timer to give the cmd a chance to be sent */
    time_is_up = false;
    ORTE_DETECT_TIMEOUT(&timerev, launched,
                        100, -1, timer_cb);

    ORTE_PROGRESSED_WAIT(time_is_up, 0, 1);

    /* TM poll for all the spawns */
    for (i = 0; i < launched; ++i) {
        rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
    while (0 < launched) {
        rc = tm_poll(TM_NULL_EVENT, &eventpolled, (int)false, &local_err);
        if (TM_SUCCESS != rc) {
            errno = local_err;
            opal_output(0, "plm:tmd: failed to poll for a spawned daemon, return status = %d", rc);
            opal_output(0, "plm:tm: event poll for spawned daemon failed, return status = %d", rc);
            rc = ORTE_ERROR;
            goto cleanup;
        }
        /* if we get back the NULL event, then just continue */
        if (eventpolled == TM_NULL_EVENT) {
            continue;
        }
        /* look for the spawned event */
        for (j=0; j < map->num_new_daemons; j++) {
            if (eventpolled == *(events_spawn + offset + j)) {
                /* got the event - check returned code */
                if (local_err) {
                    /* this orted failed to launch! */
                    orte_show_help("help-plm-tm.txt", "tm-spawn-failed",
                                   true, argv[0], nodes[j]->name, nodes[j]->launch_id);
                    rc = ORTE_ERROR;
                    goto cleanup;
                }
                /* register the corresponding obit so we can detect when this
                 * orted terminates
                 */
                if (ORTE_SUCCESS != (rc = obit_submit(offset+j))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }
                /* all done with this event */
                goto MOVEON;
            }
        }
        /* if we get here, then we failed to find the event */
        opal_output(0, "TM FAILED TO FIND SPAWN EVENT WHEN LAUNCHING");
        rc = ORTE_ERROR;
        goto cleanup;

    MOVEON:
        launched--;
    }
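
The spawn-side counterpart of that polling loop, stripped of the ORTE bookkeeping, looks roughly like the sketch below (illustrative only; launch_one is a hypothetical helper, and argc/argv/env/where are assumed to be prepared by the caller):

/* Illustrative sketch: spawn one task on a TM node and block until its
 * spawn event comes back, so the task id is valid before an obit is
 * registered for it. */
#include <tm.h>

static int launch_one(int argc, char **argv, char **env,
                      tm_node_id where, tm_task_id *tid)
{
    tm_event_t spawn_ev, polled;
    int tm_err;

    if (TM_SUCCESS != tm_spawn(argc, argv, env, where, tid, &spawn_ev)) {
        return -1;
    }
    /* blocking poll until this particular spawn event is returned */
    do {
        if (TM_SUCCESS != tm_poll(TM_NULL_EVENT, &polled, 1, &tm_err)) {
            return -1;
        }
    } while (polled != spawn_ev);

    return (0 == tm_err) ? 0 : -1;
}
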

    /* set a timer to tell us if one or more daemon's fails to start - use the
@ -350,7 +495,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
     */
    if (0 < orte_startup_timeout) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:tmd: setting startup timer for %d milliseconds",
                             "%s plm:tm: setting startup timer for %d milliseconds",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             orte_startup_timeout));
        ORTE_DETECT_TIMEOUT(&ev, map->num_new_daemons,
@ -361,7 +506,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
    /* wait for daemons to callback */
    if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:tmd: daemon launch failed for job %s on error %s",
                             "%s plm:tm: daemon launch failed for job %s on error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
        goto cleanup;
@ -379,7 +524,7 @@ launch_apps:
    failed_job = jdata->jobid;
    if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:tmd: launch of apps failed for job %s on error %s",
                             "%s plm:tm: launch of apps failed for job %s on error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
        goto cleanup;
@ -397,16 +542,6 @@ launch_apps:
        opal_argv_free(env);
    }

    if (connected) {
        plm_tmd_disconnect();
    }
    if (NULL != tm_events) {
        free(tm_events);
    }
    if (NULL != tm_task_ids) {
        free(tm_task_ids);
    }

    if (NULL != lib_base) {
        free(lib_base);
    }
@ -416,7 +551,7 @@ launch_apps:

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
        orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
    }

    /* setup a "heartbeat" timer to periodically check on
@ -428,7 +563,7 @@ launch_apps:
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:tmd:launch: finished",
                         "%s plm:tm:launch: finished",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return rc;
@ -447,6 +582,14 @@ static int plm_tmd_terminate_job(orte_jobid_t jobid)
    return rc;
}

/* quick timeout loop */
static bool timer_fired;

static void quicktime_cb(int fd, short event, void *cbdata)
{
    /* declare it fired */
    timer_fired = true;
}

/**
 * Terminate the orteds for a given job
@ -454,12 +597,143 @@ static int plm_tmd_terminate_job(orte_jobid_t jobid)
int plm_tmd_terminate_orteds(void)
{
    int rc;
    orte_job_t *jdata;
    orte_proc_t **daemons;
    tm_event_t eventpolled;
    orte_vpid_t j, alive;
    int local_err;
    opal_event_t *timerev=NULL;
    opal_event_t *quicktime=NULL;
    struct timeval quicktimeval;
    bool aborted;

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:tm: terminating orteds",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* lookup the daemon job object */
    if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
    }
    alive = jdata->num_procs - 1; /* do not include myself! */
    daemons = (orte_proc_t**)jdata->procs->addr;
    aborted = false;

    /* now tell them to die! */
    if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) {
    /* tell them to die! */
    if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_NO_REPLY_CMD))) {
        ORTE_ERROR_LOG(rc);
    }

    /* if there are more than just me... */
    if (0 < alive) {
        /* setup a max time for the daemons to die */
        time_is_up = false;
        ORTE_DETECT_TIMEOUT(&timerev, alive,
                            1000000, 60000000, timer_cb);

        /* give the cmds a chance to get out */
        quicktimeval.tv_sec = 0;
        quicktimeval.tv_usec = 100;
        timer_fired = false;
        ORTE_DETECT_TIMEOUT(&quicktime, alive, 1000, 10000, quicktime_cb);
        ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);

        /* now begin polling to see if daemons have terminated */
        while (!time_is_up && 0 < alive) {
            OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
                                 "%s plm:tm: polling for daemon termination",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            rc = tm_poll(TM_NULL_EVENT, &eventpolled, (int)false, &local_err);
            if (TM_SUCCESS != rc) {
                errno = local_err;
                opal_output(0, "plm:tm: event poll for daemon termination failed, return status = %d", rc);
                continue; /* we will wait for timeout to tell us to quit */
            }
            /* if we get back the NULL event, then just continue */
            if (eventpolled == TM_NULL_EVENT) {
                OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
                                     "%s plm:tm: got null event",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                /* give system a little time to progress */
                timer_fired = false;
                opal_evtimer_add(quicktime, &quicktimeval);
                ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
                continue;
            }
            /* look for the obit event */
            for (j=0; j < jdata->num_procs-1; j++) {
                if (eventpolled == *(events_obit + j)) {
                    /* got the event - check returned code */
                    if (local_err == TM_ESYSTEM) {
                        OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
                                             "%s plm:tm: got TM_ESYSTEM on obit - resubmitting",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                        if (ORTE_SUCCESS != (rc = obit_submit(j))) {
                            ORTE_ERROR_LOG(rc);
                            goto MOVEON;
                        }
                        /* give system a little time to progress */
                        timer_fired = false;
                        opal_evtimer_add(quicktime, &quicktimeval);
                        ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
                    }
                    if (0 != local_err) {
                        OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
                                             "%s plm:tm: got error %d on obit for task %d",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), local_err, j));
                        rc = ORTE_ERROR;
                        goto MOVEON;
                    }
                    /* this daemon has terminated */
                    *(tm_task_ids+j) = TM_NULL_TASK;
                    *(events_obit+j) = TM_NULL_EVENT;
                    OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
                                         "%s plm:tm: task %d exited with status %d",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j, *(evs+j)));
                    /* update the termination status for this daemon */
                    daemons[j+1]->exit_code = *(evs+j);
                    if (0 != daemons[j+1]->exit_code) {
                        daemons[j+1]->state = ORTE_PROC_STATE_ABORTED;
                        aborted = true;
                    } else {
                        daemons[j+1]->state = ORTE_PROC_STATE_TERMINATED;
                    }
                    jdata->num_terminated++;
                    /* all done with this event */
                    goto MOVEON;
                }
            }
            /* if we get here, then we failed to find the event */
            opal_output(0, "TM FAILED TO FIND OBIT EVENT");

        MOVEON:
            alive--;
        }

        /* release event if not already done */
        if (NULL != quicktime) {
            free(quicktime);
        }
        if (NULL != timerev) {
            opal_event_del(timerev);
            free(timerev);
        }
    } else {
        /* still need to give the cmds a chance to get out so I can process
         * them myself!
         */
        timer_fired = false;
        ORTE_DETECT_TIMEOUT(&quicktime, 1, 1000, 10000, quicktime_cb);
        ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
    }

    /* declare the daemons done */
    if (aborted || 0 < alive) {
        jdata->state = ORTE_JOB_STATE_ABORTED;
    } else {
        jdata->state = ORTE_JOB_STATE_TERMINATED;
    }
    orte_trigger_event(&orteds_exit);
    return rc;
}

@ -488,6 +762,24 @@ static int plm_tmd_finalize(void)
        ORTE_ERROR_LOG(rc);
    }

    if (connected) {
        tm_finalize();
    }

    /* cleanup data arrays */
    if (NULL != events_spawn) {
        free(events_spawn);
    }
    if (NULL != events_obit) {
        free(events_obit);
    }
    if (NULL != tm_task_ids) {
        free(tm_task_ids);
    }
    if (NULL != evs) {
        free(evs);
    }

    return ORTE_SUCCESS;
}

@ -518,26 +810,19 @@ static int plm_tmd_connect(void)
}


static int plm_tmd_disconnect(void)
{
    tm_finalize();

    return ORTE_SUCCESS;
}

/* call this function if the timer fires indicating that one
 * or more daemons failed to start
 */
static void failed_start(int fd, short dummy, void *arg)
{
    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:tmd:failed_start",
                         "%s plm:tm:failed_start",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we are aborting, ignore this */
    if (orte_abnormal_term_ordered) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:tmd:failed_start - abnormal term in progress",
                             "%s plm:tm:failed_start - abnormal term in progress",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return;
    }
@ -545,3 +830,21 @@ static void failed_start(int fd, short dummy, void *arg)
    orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1,
                                ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
}

static int obit_submit(int tid)
{
    int rc;

    if (TM_SUCCESS != (rc = tm_obit(*(tm_task_ids+tid), evs+tid, events_obit+tid))) {
        opal_output(0, "failed to register termination notice for task %d", tid);
        rc = ORTE_ERROR;
        return rc;
    }
    if (*(events_obit+tid) == TM_NULL_EVENT) {
        opal_output(0, "task %d is already dead", tid);
    } else if (*(events_obit+tid) == TM_ERROR_EVENT) {
        opal_output(0, "Error on obit return - got error event for task %d", tid);
    }

    return ORTE_SUCCESS;
}