diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index 4a36333e41..b73d6c4f85 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -101,10 +101,9 @@ orte_plm_base_module_1_0_0_t orte_plm_slurm_module = { /* * Local variables */ -static pid_t primary_srun_pid = 0; -static bool primary_pid_set = false; +static pid_t srun_pid = 0; static orte_jobid_t active_job = ORTE_JOBID_INVALID; -static bool launching_daemons; +static bool failed_launch; /** @@ -148,8 +147,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata) struct timeval launchstart, launchstop; int proc_vpid_index; orte_jobid_t failed_job; - orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED; - bool failed_launch=false; /* flag the daemons as failing by default */ failed_job = ORTE_PROC_MY_NAME->jobid; @@ -163,7 +160,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata) } /* indicate the state of the launch */ - launching_daemons = true; + failed_launch = true; /* create a jobid for this job */ if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) { @@ -337,9 +334,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata) } } - /* set the job state to indicate we attempted to launch */ - job_state = ORTE_JOB_STATE_FAILED_TO_START; - /* setup environment */ env = opal_argv_copy(orte_launch_environ); @@ -370,10 +364,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata) } launch_apps: - /* get here if daemons launch okay, or no daemons need to be launched - any - * failures now are from launching apps - */ - launching_daemons = false; + /* get here if daemons launch okay - any failures now by apps */ failed_job = active_job; if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, @@ -415,7 +406,7 @@ cleanup: /* check for failed launch - if so, force terminate */ if (failed_launch) { - orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state); + orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START); } return rc; @@ -442,10 +433,15 @@ static int plm_slurm_terminate_orteds(void) { int rc; - /* tell them to die without sending a reply - we will rely on the - * waitpid to tell us when they have exited! + /* deregister the waitpid callback to ensure we don't make it look like + * srun failed when it didn't. Since the srun may have already completed, + * do NOT ERROR_LOG any return code to avoid confusing, duplicate error + * messages */ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_NO_REPLY_CMD))) { + orte_wait_cb_cancel(srun_pid); + + /* tell them to die! */ + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) { ORTE_ERROR_LOG(rc); } @@ -483,8 +479,6 @@ static int plm_slurm_finalize(void) static void srun_wait_cb(pid_t pid, int status, void* cbdata){ - orte_job_t *jdata; - /* According to the SLURM folks, srun always returns the highest exit code of our remote processes. 
Thus, a non-zero exit status doesn't necessarily mean that srun failed - it could be that an orted returned @@ -505,41 +499,20 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){ pid so nobody thinks this is real */ - /* if we are in the launch phase, then any termination is bad */ - if (launching_daemons) { - /* report that one or more daemons failed to launch so we can exit */ - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:slurm: daemon failed during launch", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START); - } else { - /* if this is after launch, then we need to abort only if the status - * returned is non-zero - i.e., if the orteds exited with an error - */ - if (0 != status) { + if (0 != status) { + if (failed_launch) { + /* report that the daemon has failed so we can exit + */ + orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START); + + } else { /* an orted must have died unexpectedly after launch - report * that the daemon has failed so we exit */ - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:slurm: daemon failed while running", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_ABORTED); } - /* otherwise, check to see if this is the primary pid */ - if (primary_srun_pid == pid) { - /* in this case, we just want to fire the proper trigger so - * mpirun can exit - */ - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:slurm: primary daemons complete!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - jdata->state = ORTE_JOB_STATE_TERMINATED; - /* need to set the #terminated value to avoid an incorrect error msg */ - jdata->num_terminated = jdata->num_procs; - orte_trigger_event(&orteds_exit); - } } + } @@ -547,7 +520,6 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env, char *prefix) { int fd; - int srun_pid; char *exec_argv = opal_path_findv(argv[0], 0, env, NULL); if (NULL == exec_argv) { @@ -651,14 +623,6 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env, /* setup the waitpid so we can find out if srun succeeds! */ orte_wait_cb(srun_pid, srun_wait_cb, NULL); free(exec_argv); - - /* if this is the primary launch - i.e., not a comm_spawn of a - * child job - then save the pid - */ - if (!primary_pid_set) { - primary_srun_pid = srun_pid; - primary_pid_set = true; - } } return ORTE_SUCCESS; diff --git a/orte/mca/plm/slurmd/.ompi_ignore b/orte/mca/plm/slurmd/.ompi_ignore new file mode 100644 index 0000000000..e69de29bb2 diff --git a/orte/mca/plm/slurmd/.ompi_unignore b/orte/mca/plm/slurmd/.ompi_unignore new file mode 100644 index 0000000000..97b20ffb20 --- /dev/null +++ b/orte/mca/plm/slurmd/.ompi_unignore @@ -0,0 +1 @@ +rhc diff --git a/orte/mca/plm/slurmd/Makefile.am b/orte/mca/plm/slurmd/Makefile.am new file mode 100644 index 0000000000..eaeeec6f69 --- /dev/null +++ b/orte/mca/plm/slurmd/Makefile.am @@ -0,0 +1,45 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. 
+# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ + plm_slurmd.h \ + plm_slurmd_component.c \ + plm_slurmd_module.c + +dist_pkgdata_DATA = help-plm-slurmd.txt + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if OMPI_BUILD_plm_slurmd_DSO +component_noinst = +component_install = mca_plm_slurmd.la +else +component_noinst = libmca_plm_slurmd.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_plm_slurmd_la_SOURCES = $(sources) +mca_plm_slurmd_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_plm_slurmd_la_SOURCES =$(sources) +libmca_plm_slurmd_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/plm/slurmd/configure.m4 b/orte/mca/plm/slurmd/configure.m4 new file mode 100644 index 0000000000..87b9dfc226 --- /dev/null +++ b/orte/mca/plm/slurmd/configure.m4 @@ -0,0 +1,37 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_plm_slurmd_CONFIG([action-if-found], [action-if-not-found]) +# ----------------------------------------------------------- +AC_DEFUN([MCA_plm_slurmd_CONFIG],[ + OMPI_CHECK_SLURM([plm_slurmd], [plm_slurmd_good=1], [plm_slurmd_good=0]) + + # if check worked, set wrapper flags if so. + # Evaluate succeed / fail + AS_IF([test "$plm_slurmd_good" = "1"], + [plm_slurmd_WRAPPER_EXTRA_LDFLAGS="$plm_slurmd_LDFLAGS" + plm_slurmd_WRAPPER_EXTRA_LIBS="$plm_slurmd_LIBS" + $1], + [$2]) + + # set build flags to use in makefile + AC_SUBST([plm_slurmd_CPPFLAGS]) + AC_SUBST([plm_slurmd_LDFLAGS]) + AC_SUBST([plm_slurmd_LIBS]) +])dnl diff --git a/orte/mca/plm/slurmd/configure.params b/orte/mca/plm/slurmd/configure.params new file mode 100644 index 0000000000..8fc44480a6 --- /dev/null +++ b/orte/mca/plm/slurmd/configure.params @@ -0,0 +1,22 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2007 Los Alamos National Security, LLC. All rights +# reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +PARAM_CONFIG_FILES="Makefile" diff --git a/orte/mca/plm/slurmd/help-plm-slurmd.txt b/orte/mca/plm/slurmd/help-plm-slurmd.txt new file mode 100644 index 0000000000..e755634506 --- /dev/null +++ b/orte/mca/plm/slurmd/help-plm-slurmd.txt @@ -0,0 +1,41 @@ +# -*- text -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +[multiple-prefixes] +The SLURM process starter for Open MPI does not support multiple +different --prefix options to mpirun. You can specify at most one +unique value for the --prefix option (in any of the application +contexts); it will be applied to all the application contexts of your +parallel job. + +Put simply, you must have Open MPI installed in the same location on +all of your SLURM nodes. + +Multiple different --prefix options were specified to mpirun. This is +a fatal error for the SLURM process starter in Open MPI. + +The first two prefix values supplied were: + %s +and %s +# +[no-hosts-in-list] +The SLURM process starter for Open MPI didn't find any hosts in +the map for this application. This can be caused by a lack of +an allocation, or by an error in the Open MPI code. Please check +to ensure you have a SLURM allocation. If you do, then please pass +the error to the Open MPI user's mailing list for assistance. diff --git a/orte/mca/plm/slurmd/plm_slurmd.h b/orte/mca/plm/slurmd/plm_slurmd.h new file mode 100644 index 0000000000..428e31b95f --- /dev/null +++ b/orte/mca/plm/slurmd/plm_slurmd.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef ORTE_PLM_SLURMD_EXPORT_H +#define ORTE_PLM_SLURMD_EXPORT_H + +#include "orte_config.h" + +#include "opal/mca/mca.h" +#include "orte/mca/plm/plm.h" + +BEGIN_C_DECLS + +struct orte_plm_slurmd_component_t { + orte_plm_base_component_t super; + char *custom_args; +}; +typedef struct orte_plm_slurmd_component_t orte_plm_slurmd_component_t; + +/* + * Globally exported variable + */ + +ORTE_MODULE_DECLSPEC extern orte_plm_slurmd_component_t mca_plm_slurmd_component; +ORTE_DECLSPEC extern orte_plm_base_module_t orte_plm_slurmd_module; + +END_C_DECLS + +#endif /* ORTE_PLM_SLURMD_EXPORT_H */ diff --git a/orte/mca/plm/slurmd/plm_slurmd_component.c b/orte/mca/plm/slurmd/plm_slurmd_component.c new file mode 100644 index 0000000000..eb08824f28 --- /dev/null +++ b/orte/mca/plm/slurmd/plm_slurmd_component.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * These symbols are in a file by themselves to provide nice linker + * semantics. Since linkers generally pull in symbols by object + * files, keeping these symbols as the only symbols in this file + * prevents utility programs such as "ompi_info" from having to import + * entire components just to query their version and parameters. 
+ */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include "orte/util/show_help.h" +#include "opal/mca/base/mca_base_param.h" + +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/plm/plm.h" +#include "orte/mca/plm/base/plm_private.h" +#include "plm_slurmd.h" + + +/* + * Public string showing the plm ompi_slurmd component version number + */ +const char *mca_plm_slurmd_component_version_string = + "Open MPI slurmd plm MCA component version " ORTE_VERSION; + + +/* + * Local functions + */ +static int plm_slurmd_open(void); +static int plm_slurmd_close(void); +static int orte_plm_slurmd_component_query(mca_base_module_t **module, int *priority); + + +/* + * Instantiate the public struct with all of our public information + * and pointers to our public functions in it + */ + +orte_plm_slurmd_component_t mca_plm_slurmd_component = { + + { + /* First, the mca_component_t struct containing meta + information about the component itself */ + + { + ORTE_PLM_BASE_VERSION_2_0_0, + + /* Component name and version */ + "slurmd", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + plm_slurmd_open, + plm_slurmd_close, + orte_plm_slurmd_component_query + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + } + } + + /* Other orte_plm_slurmd_component_t items -- left uninitialized + here; will be initialized in plm_slurmd_open() */ +}; + + +static int plm_slurmd_open(void) +{ + mca_base_component_t *comp = &mca_plm_slurmd_component.super.base_version; + + mca_base_param_reg_string(comp, "args", + "Custom arguments to srun", + false, false, NULL, + &mca_plm_slurmd_component.custom_args); + + return ORTE_SUCCESS; +} + +static int orte_plm_slurmd_component_query(mca_base_module_t **module, int *priority) +{ + /* Are we running under a SLURM job? */ + + if (NULL != getenv("SLURM_JOBID")) { + *priority = 2; + + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurmd: available for selection", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + *module = (mca_base_module_t *)&orte_plm_slurmd_module; + return ORTE_SUCCESS; + } + + /* Sadly, no */ + *module = NULL; + return ORTE_ERROR; +} + + +static int plm_slurmd_close(void) +{ + if (NULL != mca_plm_slurmd_component.custom_args) { + free(mca_plm_slurmd_component.custom_args); + } + + return ORTE_SUCCESS; +} diff --git a/orte/mca/plm/slurmd/plm_slurmd_module.c b/orte/mca/plm/slurmd/plm_slurmd_module.c new file mode 100644 index 0000000000..6a65139045 --- /dev/null +++ b/orte/mca/plm/slurmd/plm_slurmd_module.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * These symbols are in a file by themselves to provide nice linker + * semantics. 
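For readers unfamiliar with MCA component selection: the orte_plm_slurmd_component_query() function above amounts to a single environment test. A stand-alone illustration of that test (plain C, outside the MCA framework; the printed messages are illustrative only):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        /* SLURM sets SLURM_JOBID inside an allocation - the same test the
         * component query above performs before offering priority 2 */
        if (NULL != getenv("SLURM_JOBID")) {
            printf("SLURM allocation detected - slurmd PLM is selectable\n");
        } else {
            printf("no SLURM allocation - component declines selection\n");
        }
        return 0;
    }
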
Since linkers generally pull in symbols by object + * files, keeping these symbols as the only symbols in this file + * prevents utility programs such as "ompi_info" from having to import + * entire components just to query their version and parameters. + */ + +#include "orte_config.h" +#include "orte/constants.h" +#include "orte/types.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif +#include +#ifdef HAVE_STDLIB_H +#include +#endif +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#ifdef HAVE_SYS_TIME_H +#include +#endif +#ifdef HAVE_SYS_STAT_H +#include +#endif +#ifdef HAVE_FCNTL_H +#include +#endif + +#include "opal/mca/installdirs/installdirs.h" +#include "opal/util/argv.h" +#include "opal/util/opal_environ.h" +#include "opal/util/path.h" +#include "opal/util/basename.h" +#include "opal/mca/base/mca_base_param.h" + +#include "orte/util/show_help.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" +#include "orte/runtime/runtime.h" +#include "orte/runtime/orte_wait.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rmaps/rmaps.h" + +#include "orte/mca/plm/plm.h" +#include "orte/mca/plm/base/plm_private.h" +#include "plm_slurmd.h" + + +/* + * Local functions + */ +static int plm_slurmd_init(void); +static int plm_slurmd_launch_job(orte_job_t *jdata); +static int plm_slurmd_terminate_job(orte_jobid_t jobid); +static int plm_slurmd_terminate_orteds(void); +static int plm_slurmd_signal_job(orte_jobid_t jobid, int32_t signal); +static int plm_slurmd_finalize(void); + +static int plm_slurmd_start_proc(int argc, char **argv, char **env, + char *prefix); + + +/* + * Global variable + */ +orte_plm_base_module_1_0_0_t orte_plm_slurmd_module = { + plm_slurmd_init, + orte_plm_base_set_hnp_name, + plm_slurmd_launch_job, + NULL, + plm_slurmd_terminate_job, + plm_slurmd_terminate_orteds, + plm_slurmd_signal_job, + plm_slurmd_finalize +}; + +/* + * Local variables + */ +static pid_t primary_srun_pid = 0; +static bool primary_pid_set = false; +static orte_jobid_t active_job = ORTE_JOBID_INVALID; +static bool launching_daemons; + + +/** +* Init the module + */ +static int plm_slurmd_init(void) +{ + int rc; + + if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) { + ORTE_ERROR_LOG(rc); + } + return rc; +} + +/* When working in this function, ALWAYS jump to "cleanup" if + * you encounter an error so that orterun will be woken up and + * the job can cleanly terminate + */ +static int plm_slurmd_launch_job(orte_job_t *jdata) +{ + orte_app_context_t **apps; + orte_node_t **nodes; + orte_std_cntr_t n; + orte_job_map_t *map; + char *jobid_string = NULL; + char *param; + char **argv = NULL; + int argc; + int rc; + char *tmp; + char** env = NULL; + char* var; + char *nodelist_flat; + char **nodelist_argv; + int nodelist_argc; + char *name_string; + char **custom_strings; + int num_args, i; + char *cur_prefix; + struct timeval launchstart, launchstop; + int proc_vpid_index; + orte_jobid_t failed_job; + orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED; + bool failed_launch=false; + + /* flag the daemons as failing by default */ + failed_job = ORTE_PROC_MY_NAME->jobid; + + if (orte_timing) { + if (0 != gettimeofday(&launchstart, NULL)) { + opal_output(0, "plm_slurmd: could not obtain job start time"); + launchstart.tv_sec = 0; + launchstart.tv_usec = 0; + } + } + + /* indicate the state of the launch */ + launching_daemons = true; + + /* create a jobid for this job */ + if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) { + ORTE_ERROR_LOG(rc); + goto 
cleanup; + } + + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurmd: launching job %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); + + /* setup the job */ + if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + + /* set the active jobid */ + active_job = jdata->jobid; + + /* Get the map for this job */ + if (NULL == (map = orte_rmaps.get_job_map(active_job))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + rc = ORTE_ERR_NOT_FOUND; + goto cleanup; + } + apps = (orte_app_context_t**)jdata->apps->addr; + nodes = (orte_node_t**)map->nodes->addr; + + if (0 == map->num_new_daemons) { + /* no new daemons required - just launch apps */ + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurmd: no new daemons to launch", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + goto launch_apps; + } + + /* need integer value for command line parameter */ + asprintf(&jobid_string, "%lu", (unsigned long) jdata->jobid); + + /* + * start building argv array + */ + argv = NULL; + argc = 0; + + /* + * SLURM srun OPTIONS + */ + + /* add the srun command */ + opal_argv_append(&argc, &argv, "srun"); + + /* Append user defined arguments to srun */ + if ( NULL != mca_plm_slurmd_component.custom_args ) { + custom_strings = opal_argv_split(mca_plm_slurmd_component.custom_args, ' '); + num_args = opal_argv_count(custom_strings); + for (i = 0; i < num_args; ++i) { + opal_argv_append(&argc, &argv, custom_strings[i]); + } + opal_argv_free(custom_strings); + } + + asprintf(&tmp, "--nodes=%lu", (unsigned long) map->num_new_daemons); + opal_argv_append(&argc, &argv, tmp); + free(tmp); + + asprintf(&tmp, "--ntasks=%lu", (unsigned long) map->num_new_daemons); + opal_argv_append(&argc, &argv, tmp); + free(tmp); + + /* alert us if any orteds die during startup */ + opal_argv_append(&argc, &argv, "--kill-on-bad-exit"); + + /* create nodelist */ + nodelist_argv = NULL; + nodelist_argc = 0; + + for (n=0; n < map->num_nodes; n++ ) { + /* if the daemon already exists on this node, then + * don't include it + */ + if (nodes[n]->daemon_launched) { + continue; + } + + /* otherwise, add it to the list of nodes upon which + * we need to launch a daemon + */ + opal_argv_append(&nodelist_argc, &nodelist_argv, nodes[n]->name); + } + if (0 == opal_argv_count(nodelist_argv)) { + orte_show_help("help-plm-slurmd.txt", "no-hosts-in-list", true); + rc = ORTE_ERR_FAILED_TO_START; + goto cleanup; + } + nodelist_flat = opal_argv_join(nodelist_argv, ','); + opal_argv_free(nodelist_argv); + asprintf(&tmp, "--nodelist=%s", nodelist_flat); + opal_argv_append(&argc, &argv, tmp); + free(tmp); + + OPAL_OUTPUT_VERBOSE((2, orte_plm_globals.output, + "%s plm:slurmd: launching on nodes %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat)); + + /* + * ORTED OPTIONS + */ + + /* add the daemon command (as specified by user) */ + orte_plm_base_setup_orted_cmd(&argc, &argv); + + /* Add basic orted command line options, including debug flags */ + orte_plm_base_orted_append_basic_args(&argc, &argv, + "slurmd", + &proc_vpid_index, + false); + + /* tell the new daemons the base of the name list so they can compute + * their own name on the other end + */ + rc = orte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start); + if (ORTE_SUCCESS != rc) { + opal_output(0, "plm_slurmd: unable to get daemon vpid as string"); + goto cleanup; + } + + free(argv[proc_vpid_index]); + argv[proc_vpid_index] = strdup(name_string); + free(name_string); + + if (0 < 
opal_output_get_verbosity(orte_plm_globals.output)) { + param = opal_argv_join(argv, ' '); + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurmd: final top-level argv:\n\t%s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == param) ? "NULL" : param)); + if (NULL != param) free(param); + } + + /* Copy the prefix-directory specified in the + corresponding app_context. If there are multiple, + different prefix's in the app context, complain (i.e., only + allow one --prefix option for the entire slurmd run -- we + don't support different --prefix'es for different nodes in + the SLURM plm) */ + cur_prefix = NULL; + for (n=0; n < jdata->num_apps; n++) { + char * app_prefix_dir = apps[n]->prefix_dir; + /* Check for already set cur_prefix_dir -- if different, + complain */ + if (NULL != app_prefix_dir) { + if (NULL != cur_prefix && + 0 != strcmp (cur_prefix, app_prefix_dir)) { + orte_show_help("help-plm-slurmd.txt", "multiple-prefixes", + true, cur_prefix, app_prefix_dir); + return ORTE_ERR_FATAL; + } + + /* If not yet set, copy it; iff set, then it's the + same anyway */ + if (NULL == cur_prefix) { + cur_prefix = strdup(app_prefix_dir); + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurmd: Set prefix:%s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + cur_prefix)); + } + } + } + + /* set the job state to indicate we attempted to launch */ + job_state = ORTE_JOB_STATE_FAILED_TO_START; + + /* setup environment */ + env = opal_argv_copy(orte_launch_environ); + + /* add the nodelist */ + var = mca_base_param_environ_variable("orte", "slurmd", "nodelist"); + opal_setenv(var, nodelist_flat, true, &env); + free(nodelist_flat); + free(var); + + /* exec the daemon(s) */ + if (ORTE_SUCCESS != (rc = plm_slurmd_start_proc(argc, argv, env, cur_prefix))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + + /* do NOT wait for srun to complete. Srun only completes when the processes + * it starts - in this case, the orteds - complete. 
Instead, we'll catch + * any srun failures and deal with them elsewhere + */ + + /* wait for daemons to callback */ + if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) { + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurmd: daemon launch failed for job %s on error %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc))); + goto cleanup; + } + +launch_apps: + /* get here if daemons launch okay, or no daemons need to be launched - any + * failures now are from launching apps + */ + launching_daemons = false; + failed_job = active_job; + if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) { + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurmd: launch of apps failed for job %s on error %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc))); + goto cleanup; + } + + /* declare the launch a success */ + failed_launch = false; + + if (orte_timing) { + if (0 != gettimeofday(&launchstop, NULL)) { + opal_output(0, "plm_slurmd: could not obtain stop time"); + } else { + opal_output(0, "plm_slurmd: total job launch time is %ld usec", + (launchstop.tv_sec - launchstart.tv_sec)*1000000 + + (launchstop.tv_usec - launchstart.tv_usec)); + } + } + + if (ORTE_SUCCESS != rc) { + opal_output(0, "plm:slurmd: start_procs returned error %d", rc); + goto cleanup; + } + +cleanup: + if (NULL != argv) { + opal_argv_free(argv); + } + if (NULL != env) { + opal_argv_free(env); + } + + if(NULL != jobid_string) { + free(jobid_string); + } + + /* check for failed launch - if so, force terminate */ + if (failed_launch) { + orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state); + } + + return rc; +} + + +static int plm_slurmd_terminate_job(orte_jobid_t jobid) +{ + int rc; + + /* order them to kill their local procs for this job */ + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(jobid))) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + +/** +* Terminate the orteds for a given job + */ +static int plm_slurmd_terminate_orteds(void) +{ + int rc; + + /* tell them to die without sending a reply - we will rely on the + * waitpid to tell us when they have exited! + */ + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_NO_REPLY_CMD))) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + +/** + * Signal all the processes in the child srun by sending the signal directly to it + */ +static int plm_slurmd_signal_job(orte_jobid_t jobid, int32_t signal) +{ + int rc = ORTE_SUCCESS; + + /* order them to pass this signal to their local procs */ + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_signal_local_procs(jobid, signal))) { + ORTE_ERROR_LOG(rc); + } + + return rc; +} + + +static int plm_slurmd_finalize(void) +{ + int rc; + + /* cleanup any pending recvs */ + if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) { + ORTE_ERROR_LOG(rc); + } + + return ORTE_SUCCESS; +} + + +static void srun_wait_cb(pid_t pid, int status, void* cbdata){ + orte_job_t *jdata; + + /* According to the SLURM folks, srun always returns the highest exit + code of our remote processes. Thus, a non-zero exit status doesn't + necessarily mean that srun failed - it could be that an orted returned + a non-zero exit status. Of course, that means the orted failed(!), so + the end result is the same - the job didn't start. 
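The situation this comment describes can be reduced to plain POSIX calls. The following sketch is illustrative only (it is not ORTE code, and it launches a stand-in command rather than srun): after fork()/exec(), waitpid() hands the parent a single aggregate exit status, which is why the callback cannot distinguish a failed srun from a failed orted.

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
        pid_t pid = fork();
        if (pid < 0) {
            perror("fork");
            return 1;
        }
        if (0 == pid) {
            /* child: stand-in for "srun orted ..." - always exits non-zero */
            execlp("false", "false", (char *)NULL);
            _exit(127);   /* exec itself failed */
        }
        int status;
        if (waitpid(pid, &status, 0) < 0) {
            perror("waitpid");
            return 1;
        }
        if (WIFEXITED(status) && 0 != WEXITSTATUS(status)) {
            /* like srun, the status only says that something underneath
             * exited non-zero - not whether it was the launcher or a daemon */
            fprintf(stderr, "launcher exited with status %d\n", WEXITSTATUS(status));
        }
        return 0;
    }
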
+ + As a result, we really can't do much with the exit status itself - it + could be something in errno (if srun itself failed), or it could be + something returned by an orted, or it could be something returned by + the OS (e.g., couldn't find the orted binary). Somebody is welcome + to sort out all the options and pretty-print a better error message. For + now, though, the only thing that really matters is that + srun failed. Report the error and make sure that orterun + wakes up - otherwise, do nothing! + + Unfortunately, the pid returned here is the srun pid, not the pid of + the proc that actually died! So, to avoid confusion, just use -1 as the + pid so nobody thinks this is real + */ + + /* if we are in the launch phase, then any termination is bad */ + if (launching_daemons) { + /* report that one or more daemons failed to launch so we can exit */ + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurmd: daemon failed during launch", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START); + } else { + /* if this is after launch, then we need to abort only if the status + * returned is non-zero - i.e., if the orteds exited with an error + */ + if (0 != status) { + /* an orted must have died unexpectedly after launch - report + * that the daemon has failed so we exit + */ + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurmd: daemon failed while running", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_ABORTED); + } + /* otherwise, check to see if this is the primary pid */ + if (primary_srun_pid == pid) { + /* in this case, we just want to fire the proper trigger so + * mpirun can exit + */ + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurmd: primary daemons complete!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + jdata->state = ORTE_JOB_STATE_TERMINATED; + /* need to set the #terminated value to avoid an incorrect error msg */ + jdata->num_terminated = jdata->num_procs; + orte_trigger_event(&orteds_exit); + } + } +} + + +static int plm_slurmd_start_proc(int argc, char **argv, char **env, + char *prefix) +{ + int fd; + int srun_pid; + char *exec_argv = opal_path_findv(argv[0], 0, env, NULL); + + if (NULL == exec_argv) { + return ORTE_ERR_NOT_FOUND; + } + + srun_pid = fork(); + if (-1 == srun_pid) { + ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); + free(exec_argv); + return ORTE_ERR_SYS_LIMITS_CHILDREN; + } + + if (0 == srun_pid) { /* child */ + char *bin_base = NULL, *lib_base = NULL; + + /* Figure out the basenames for the libdir and bindir. There + is a lengthy comment about this in plm_rsh_module.c + explaining all the rationale for how / why we're doing + this. */ + + lib_base = opal_basename(opal_install_dirs.libdir); + bin_base = opal_basename(opal_install_dirs.bindir); + + /* If we have a prefix, then modify the PATH and + LD_LIBRARY_PATH environment variables. 
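The prefix handling that follows prepends "<prefix>/<bin_base>" to PATH (and likewise for LD_LIBRARY_PATH). A minimal stand-alone equivalent, assuming only POSIX getenv()/setenv() plus asprintf(); the helper name is illustrative, not part of the patch:

    #define _GNU_SOURCE   /* asprintf */
    #include <stdio.h>
    #include <stdlib.h>

    /* prepend "<prefix>/<bin_base>" to PATH - the same transformation the
     * child applies before exec'ing srun */
    static int prepend_bin_to_path(const char *prefix, const char *bin_base)
    {
        const char *oldenv = getenv("PATH");
        char *newenv = NULL;
        int rc;

        if (NULL != oldenv) {
            rc = asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);
        } else {
            rc = asprintf(&newenv, "%s/%s", prefix, bin_base);
        }
        if (rc < 0) {
            return -1;
        }
        rc = setenv("PATH", newenv, 1);   /* overwrite, as opal_setenv(..., true, ...) does */
        free(newenv);
        return rc;
    }

For example, prepend_bin_to_path("/opt/openmpi", "bin") turns PATH into "/opt/openmpi/bin:<old PATH>", so the orted resolved on the nodes comes from the requested installation - which is why the help text above requires the same install path on every SLURM node.
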
*/ + if (NULL != prefix) { + char *oldenv, *newenv; + + /* Reset PATH */ + oldenv = getenv("PATH"); + if (NULL != oldenv) { + asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv); + } else { + asprintf(&newenv, "%s/%s", prefix, bin_base); + } + opal_setenv("PATH", newenv, true, &env); + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurmd: reset PATH: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + newenv)); + free(newenv); + + /* Reset LD_LIBRARY_PATH */ + oldenv = getenv("LD_LIBRARY_PATH"); + if (NULL != oldenv) { + asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv); + } else { + asprintf(&newenv, "%s/%s", prefix, lib_base); + } + opal_setenv("LD_LIBRARY_PATH", newenv, true, &env); + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurmd: reset LD_LIBRARY_PATH: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + newenv)); + free(newenv); + } + + fd = open("/dev/null", O_CREAT|O_WRONLY|O_TRUNC, 0666); + if(fd > 0) { + dup2(fd, 0); + } + + /* When not in debug mode and --debug-daemons was not passed, + * tie stdout/stderr to dev null so we don't see messages from orted + * EXCEPT if the user has requested that we leave sessions attached + */ + if (0 >= opal_output_get_verbosity(orte_plm_globals.output) && + !orte_debug_daemons_flag && !orte_leave_session_attached) { + if (fd >= 0) { + if (fd != 1) { + dup2(fd,1); + } + if (fd != 2) { + dup2(fd,2); + } + } + } + + if (fd > 2) { + close(fd); + } + + /* get the srun process out of orterun's process group so that + signals sent from the shell (like those resulting from + cntl-c) don't get sent to srun */ + setpgid(0, 0); + + execve(exec_argv, argv, env); + + opal_output(0, "plm:slurmd:start_proc: exec failed"); + /* don't return - need to exit - returning would be bad - + we're not in the calling process anymore */ + exit(1); + } else { /* parent */ + /* just in case, make sure that the srun process is not in our + process group any more. Stevens says always do this on both + sides of the fork... */ + setpgid(srun_pid, srun_pid); + + /* setup the waitpid so we can find out if srun succeeds! 
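Stripped of the ORTE wrappers, the child-side housekeeping above (pointing stdio at /dev/null and leaving orterun's process group so a shell-generated Ctrl-C is not also delivered to srun) looks roughly like this; a sketch only, with an illustrative function name:

    #include <fcntl.h>
    #include <unistd.h>

    /* called in the child between fork() and exec() */
    static void quiet_and_detach(void)
    {
        int fd = open("/dev/null", O_RDWR);

        if (fd >= 0) {
            dup2(fd, STDIN_FILENO);
            dup2(fd, STDOUT_FILENO);
            dup2(fd, STDERR_FILENO);
            if (fd > STDERR_FILENO) {
                close(fd);
            }
        }
        /* new process group: keyboard signals aimed at the parent's group
         * no longer reach this child */
        setpgid(0, 0);
    }

The parent makes the matching setpgid(pid, pid) call, as the Stevens reference in the code notes, so the detachment holds regardless of which side of the fork runs first.
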
*/ + orte_wait_cb(srun_pid, srun_wait_cb, NULL); + free(exec_argv); + + /* if this is the primary launch - i.e., not a comm_spawn of a + * child job - then save the pid + */ + if (!primary_pid_set) { + primary_srun_pid = srun_pid; + primary_pid_set = true; + } + } + + return ORTE_SUCCESS; +} diff --git a/orte/mca/plm/tm/plm_tm_module.c b/orte/mca/plm/tm/plm_tm_module.c index 2067d41ff2..fd6f2164a4 100644 --- a/orte/mca/plm/tm/plm_tm_module.c +++ b/orte/mca/plm/tm/plm_tm_module.c @@ -85,19 +85,13 @@ static int plm_tm_signal_job(orte_jobid_t jobid, int32_t signal); static int plm_tm_finalize(void); static int plm_tm_connect(void); +static int plm_tm_disconnect(void); static void failed_start(int fd, short event, void *arg); -static int obit_submit(int tid); /* * Local "global" variables */ static opal_event_t *ev=NULL; -static bool connected; -static tm_event_t *events_spawn = NULL; -static tm_event_t *events_obit = NULL; -static tm_task_id *tm_task_ids = NULL; -static int *evs = NULL; -static bool time_is_up; /* * Global variable @@ -113,20 +107,6 @@ orte_plm_base_module_t orte_plm_tm_module = { plm_tm_finalize }; -/* catch timeout to allow cmds to progress */ -static void timer_cb(int fd, short event, void *cbdata) -{ - opal_event_t *ev = (opal_event_t*)cbdata; - - /* free event */ - if (NULL != ev) { - free(ev); - } - /* declare time is up */ - time_is_up = true; -} - - /** * Init the module */ @@ -147,7 +127,6 @@ static int plm_tm_init(void) */ static int plm_tm_launch_job(orte_job_t *jdata) { - orte_job_t *jdatorted; orte_job_map_t *map = NULL; orte_app_context_t **apps; orte_node_t **nodes; @@ -158,23 +137,20 @@ static int plm_tm_launch_job(orte_job_t *jdata) char **argv = NULL; int argc = 0; int rc; + bool connected = false; orte_std_cntr_t launched = 0, i; char *bin_base = NULL, *lib_base = NULL; + tm_event_t *tm_events = NULL; + tm_task_id *tm_task_ids = NULL; int local_err; + tm_event_t event; bool failed_launch = true; mode_t current_umask; orte_jobid_t failed_job; - orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED; - int offset; - tm_event_t eventpolled; - orte_std_cntr_t num_daemons; - opal_event_t *timerev; - int j; /* default to declaring the daemons as failed */ failed_job = ORTE_PROC_MY_NAME->jobid; - connected = false; - + /* create a jobid for this job */ if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) { ORTE_ERROR_LOG(rc); @@ -206,107 +182,20 @@ static int plm_tm_launch_job(orte_job_t *jdata) goto launch_apps; } - /* lookup the daemon job object - must do this -after- the job is - * setup so the number of required daemons has been updated - */ - if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; + /* Allocate a bunch of TM events to use for tm_spawn()ing */ + tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons); + if (NULL == tm_events) { + rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); goto cleanup; } - num_daemons = jdatorted->num_procs - 1; /* do not include myself as I am already here! 
*/ - if (0 >= num_daemons) { - /* this won't work */ - rc = ORTE_ERR_BAD_PARAM; - goto cleanup; - } - - /* Allocate a bunch of TM events to use */ - if (NULL == events_spawn) { - /* spawn events for first launch */ - events_spawn = (tm_event_t*)malloc(num_daemons * sizeof(tm_event_t)); - if (NULL == events_spawn) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } else { - /* comm_spawn launch */ - events_spawn = (tm_event_t*)realloc(events_spawn, sizeof(tm_event_t) * num_daemons); - if (NULL == events_spawn) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - } - if (NULL == events_obit) { - /* obit events for first launch */ - events_obit = (tm_event_t*)malloc(num_daemons * sizeof(tm_event_t)); - if (NULL == events_obit) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } else { - /* comm_spawn launch */ - events_obit = (tm_event_t*)realloc(events_obit, sizeof(tm_event_t) * num_daemons); - if (NULL == events_obit) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - } - if (NULL == evs) { - /* evs for first launch */ - evs = (int*)malloc(num_daemons * sizeof(tm_event_t)); - if (NULL == evs) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } else { - /* comm_spawn launch */ - evs = (int*)realloc(evs, sizeof(int) * num_daemons); - if (NULL == evs) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - } - - /* allocate task ids for the orteds */ + tm_task_ids = malloc(sizeof(tm_task_id) * map->num_new_daemons); if (NULL == tm_task_ids) { - /* first launch */ - tm_task_ids = (tm_task_id*)malloc(num_daemons * sizeof(tm_task_id)); - if (NULL == tm_task_ids) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } else { - /* comm_spawn launch */ - tm_task_ids = (tm_task_id*)realloc(tm_task_ids, sizeof(tm_task_id) * num_daemons); - if (NULL == tm_task_ids) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); - goto cleanup; - } + rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); + goto cleanup; } - /* compute the offset into the event/task arrays */ - offset = num_daemons - map->num_new_daemons; - - /* initialize them */ - for (i=0; i < map->num_new_daemons; i++) { - *(tm_task_ids + offset + i) = TM_NULL_TASK; - *(events_spawn + offset + i) = TM_NULL_EVENT; - *(events_obit + offset + i) = TM_NULL_EVENT; - *(evs + offset + i) = 0; - } - /* add the daemon command (as specified by user) */ orte_plm_base_setup_orted_cmd(&argc, &argv); @@ -382,9 +271,6 @@ static int plm_tm_launch_job(orte_job_t *jdata) } } - /* set the job state to indicate we attempted to launch */ - job_state = ORTE_JOB_STATE_FAILED_TO_START; - /* Iterate through each of the nodes and spin * up a daemon. 
*/ @@ -406,7 +292,7 @@ static int plm_tm_launch_job(orte_job_t *jdata) rc = orte_util_convert_vpid_to_string(&vpid_string, nodes[i]->daemon->name.vpid); if (ORTE_SUCCESS != rc) { opal_output(0, "plm:tm: unable to get daemon vpid as string"); - goto cleanup; + exit(-1); } free(argv[proc_vpid_index]); argv[proc_vpid_index] = strdup(vpid_string); @@ -422,7 +308,7 @@ static int plm_tm_launch_job(orte_job_t *jdata) if (NULL != param) free(param); } - rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + offset + launched, events_spawn + offset + launched); + rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched); if (TM_SUCCESS != rc) { orte_show_help("help-plm-tm.txt", "tm-spawn-failed", true, argv[0], node->name, node->launch_id); @@ -440,54 +326,14 @@ static int plm_tm_launch_job(orte_job_t *jdata) "%s plm:tm:launch: finished spawning orteds", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* setup a timer to give the cmd a chance to be sent */ - time_is_up = false; - ORTE_DETECT_TIMEOUT(&timerev, launched, - 100, -1, timer_cb); - - ORTE_PROGRESSED_WAIT(time_is_up, 0, 1); - /* TM poll for all the spawns */ - while (0 < launched) { - rc = tm_poll(TM_NULL_EVENT, &eventpolled, (int)false, &local_err); + for (i = 0; i < launched; ++i) { + rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err); if (TM_SUCCESS != rc) { - opal_output(0, "plm:tm: event poll for spawned daemon failed, return status = %d", rc); - rc = ORTE_ERROR; + errno = local_err; + opal_output(0, "plm:tm: failed to poll for a spawned daemon, return status = %d", rc); goto cleanup; } - /* if we get back the NULL event, then just continue */ - if (eventpolled == TM_NULL_EVENT) { - continue; - } - /* look for the spawned event */ - for (j=0; j < map->num_new_daemons; j++) { - if (eventpolled == *(events_spawn + offset + j)) { - /* got the event - check returned code */ - if (local_err) { - /* this orted failed to launch! 
*/ - orte_show_help("help-plm-tm.txt", "tm-spawn-failed", - true, argv[0], nodes[j]->name, nodes[j]->launch_id); - rc = ORTE_ERROR; - goto cleanup; - } - /* register the corresponding obit so we can detect when this - * orted terminates - */ - if (ORTE_SUCCESS != (rc = obit_submit(offset+j))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - /* all done with this event */ - goto MOVEON; - } - } - /* if we get here, then we failed to find the event */ - opal_output(0, "TM FAILED TO FIND SPAWN EVENT WHEN LAUNCHING"); - rc = ORTE_ERROR; - goto cleanup; - - MOVEON: - launched--; } /* set a timer to tell us if one or more daemon's fails to start - use the @@ -542,6 +388,16 @@ launch_apps: opal_argv_free(env); } + if (connected) { + plm_tm_disconnect(); + } + if (NULL != tm_events) { + free(tm_events); + } + if (NULL != tm_task_ids) { + free(tm_task_ids); + } + if (NULL != lib_base) { free(lib_base); } @@ -551,7 +407,7 @@ launch_apps: /* check for failed launch - if so, force terminate */ if (failed_launch) { - orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state); + orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START); } /* setup a "heartbeat" timer to periodically check on @@ -582,14 +438,6 @@ static int plm_tm_terminate_job(orte_jobid_t jobid) return rc; } -/* quick timeout loop */ -static bool timer_fired; - -static void quicktime_cb(int fd, short event, void *cbdata) -{ - /* declare it fired */ - timer_fired = true; -} /** * Terminate the orteds for a given job @@ -597,143 +445,12 @@ static void quicktime_cb(int fd, short event, void *cbdata) int plm_tm_terminate_orteds(void) { int rc; - orte_job_t *jdata; - orte_proc_t **daemons; - tm_event_t eventpolled; - orte_vpid_t j, alive; - int local_err; - opal_event_t *timerev=NULL; - opal_event_t *quicktime=NULL; - struct timeval quicktimeval; - bool aborted; - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tm: terminating orteds", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* lookup the daemon job object */ - if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - } - alive = jdata->num_procs - 1; /* do not include myself! */ - daemons = (orte_proc_t**)jdata->procs->addr; - aborted = false; - /* tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_NO_REPLY_CMD))) { + /* now tell them to die! */ + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) { ORTE_ERROR_LOG(rc); } - /* if there are more than just me... 
*/ - if (0 < alive) { - /* setup a max time for the daemons to die */ - time_is_up = false; - ORTE_DETECT_TIMEOUT(&timerev, alive, - 1000000, 60000000, timer_cb); - - /* give the cmds a chance to get out */ - quicktimeval.tv_sec = 0; - quicktimeval.tv_usec = 100; - timer_fired = false; - ORTE_DETECT_TIMEOUT(&quicktime, alive, 1000, 10000, quicktime_cb); - ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); - - /* now begin polling to see if daemons have terminated */ - while (!time_is_up && 0 < alive) { - OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output, - "%s plm:tm: polling for daemon termination", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - rc = tm_poll(TM_NULL_EVENT, &eventpolled, (int)false, &local_err); - if (TM_SUCCESS != rc) { - errno = local_err; - opal_output(0, "plm:tm: event poll for daemon termination failed, return status = %d", rc); - continue; /* we will wait for timeout to tell us to quit */ - } - /* if we get back the NULL event, then just continue */ - if (eventpolled == TM_NULL_EVENT) { - OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output, - "%s plm:tm: got null event", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* give system a little time to progress */ - timer_fired = false; - opal_evtimer_add(quicktime, &quicktimeval); - ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); - continue; - } - /* look for the obit event */ - for (j=0; j < jdata->num_procs-1; j++) { - if (eventpolled == *(events_obit + j)) { - /* got the event - check returned code */ - if (local_err == TM_ESYSTEM) { - OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output, - "%s plm:tm: got TM_ESYSTEM on obit - resubmitting", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (ORTE_SUCCESS != (rc = obit_submit(j))) { - ORTE_ERROR_LOG(rc); - goto MOVEON; - } - /* give system a little time to progress */ - timer_fired = false; - opal_evtimer_add(quicktime, &quicktimeval); - ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); - } - if (0 != local_err) { - OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output, - "%s plm:tm: got error %d on obit for task %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), local_err, j)); - rc = ORTE_ERROR; - goto MOVEON; - } - /* this daemon has terminated */ - *(tm_task_ids+j) = TM_NULL_TASK; - *(events_obit+j) = TM_NULL_EVENT; - OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output, - "%s plm:tm: task %d exited with status %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j, *(evs+j))); - /* update the termination status for this daemon */ - daemons[j+1]->exit_code = *(evs+j); - if (0 != daemons[j+1]->exit_code) { - daemons[j+1]->state = ORTE_PROC_STATE_ABORTED; - aborted = true; - } else { - daemons[j+1]->state = ORTE_PROC_STATE_TERMINATED; - } - jdata->num_terminated++; - /* all done with this event */ - goto MOVEON; - } - } - /* if we get here, then we failed to find the event */ - opal_output(0, "TM FAILED TO FIND OBIT EVENT"); - - MOVEON: - alive--; - } - - /* release event if not already done */ - if (NULL != quicktime) { - free(quicktime); - } - if (NULL != timerev) { - opal_event_del(timerev); - free(timerev); - } - } else { - /* still need to give the cmds a chance to get out so I can process - * them myself! 
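Both the loop being removed here and its replacement in the new tmd module below rely on the same TM obit/poll handshake. A condensed sketch for a single, already-spawned task (assumes tm_init() has already succeeded inside a PBS/Torque job; the function name is illustrative, and the real code polls non-blocking while interleaving timeouts):

    #include <stdio.h>
    #include <tm.h>

    /* register an obituary for one task and block until TM reports it */
    static int wait_for_orted(tm_task_id tid)
    {
        int obitval = 0, local_err = 0;
        tm_event_t obit_event = TM_NULL_EVENT, polled = TM_NULL_EVENT;

        if (TM_SUCCESS != tm_obit(tid, &obitval, &obit_event)) {
            fprintf(stderr, "failed to register obit\n");
            return -1;
        }
        do {
            if (TM_SUCCESS != tm_poll(TM_NULL_EVENT, &polled, 1, &local_err)) {
                fprintf(stderr, "tm_poll failed, tm errno %d\n", local_err);
                return -1;
            }
        } while (polled != obit_event);

        return obitval;   /* the task's exit status */
    }
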
- */ - timer_fired = false; - ORTE_DETECT_TIMEOUT(&quicktime, 1, 1000, 10000, quicktime_cb); - ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); - } - - /* declare the daemons done */ - if (aborted || 0 < alive) { - jdata->state = ORTE_JOB_STATE_ABORTED; - } else { - jdata->state = ORTE_JOB_STATE_TERMINATED; - } - orte_trigger_event(&orteds_exit); return rc; } @@ -762,24 +479,6 @@ static int plm_tm_finalize(void) ORTE_ERROR_LOG(rc); } - if (connected) { - tm_finalize(); - } - - /* cleanup data arrays */ - if (NULL != events_spawn) { - free(events_spawn); - } - if (NULL != events_obit) { - free(events_obit); - } - if (NULL != tm_task_ids) { - free(tm_task_ids); - } - if (NULL != evs) { - free(evs); - } - return ORTE_SUCCESS; } @@ -810,6 +509,13 @@ static int plm_tm_connect(void) } +static int plm_tm_disconnect(void) +{ + tm_finalize(); + + return ORTE_SUCCESS; +} + /* call this function if the timer fires indicating that one * or more daemons failed to start */ @@ -830,21 +536,3 @@ static void failed_start(int fd, short dummy, void *arg) orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START); } - -static int obit_submit(int tid) -{ - int rc; - - if (TM_SUCCESS != (rc = tm_obit(*(tm_task_ids+tid), evs+tid, events_obit+tid))) { - opal_output(0, "failed to register termination notice for task %d", tid); - rc = ORTE_ERROR; - return rc; - } - if (*(events_obit+tid) == TM_NULL_EVENT) { - opal_output(0, "task %d is already dead", tid); - } else if (*(events_obit+tid) == TM_ERROR_EVENT) { - opal_output(0, "Error on obit return - got error event for task %d", tid); - } - - return ORTE_SUCCESS; -} diff --git a/orte/mca/plm/tmd/plm_tmd.h b/orte/mca/plm/tmd/plm_tmd.h index 2a4cb7ecce..942c8f8e1e 100644 --- a/orte/mca/plm/tmd/plm_tmd.h +++ b/orte/mca/plm/tmd/plm_tmd.h @@ -30,7 +30,6 @@ BEGIN_C_DECLS struct orte_plm_tmd_component_t { orte_plm_base_component_t super; bool want_path_check; - char *orted; char **checked_paths; }; typedef struct orte_plm_tmd_component_t orte_plm_tmd_component_t; @@ -41,4 +40,4 @@ extern orte_plm_base_module_t orte_plm_tmd_module; END_C_DECLS -#endif /* ORTE_PLM_TM_EXPORT_H */ +#endif /* ORTE_PLM_TMD_EXPORT_H */ diff --git a/orte/mca/plm/tmd/plm_tmd_component.c b/orte/mca/plm/tmd/plm_tmd_component.c index b682d9f815..b38751c63e 100644 --- a/orte/mca/plm/tmd/plm_tmd_component.c +++ b/orte/mca/plm/tmd/plm_tmd_component.c @@ -92,12 +92,8 @@ static int plm_tmd_open(void) int tmp; mca_base_component_t *comp = &mca_plm_tmd_component.super.base_version; - mca_base_param_reg_string(comp, "orted", - "Command to use to start proxy orted", - false, false, "orted", - &mca_plm_tmd_component.orted); mca_base_param_reg_int(comp, "want_path_check", - "Whether the launching process should check for the plm_tmd_orted executable in the PATH before launching (the TM API does not give an idication of failure; this is a somewhat-lame workaround; non-zero values enable this check)", + "Whether the launching process should check for the plm_tmd_orted executable in the PATH before launching (the TM API does not give an indication of failure; this is a somewhat-lame workaround; non-zero values enable this check)", false, false, (int) true, &tmp); mca_plm_tmd_component.want_path_check = OPAL_INT_TO_BOOL(tmp); @@ -124,7 +120,7 @@ static int orte_plm_tmd_component_query(mca_base_module_t **module, int *priorit if (NULL != getenv("PBS_ENVIRONMENT") && NULL != getenv("PBS_JOBID")) { - *priority = 1; + *priority = 2; *module = (mca_base_module_t *) 
&orte_plm_tmd_module; return ORTE_SUCCESS; } diff --git a/orte/mca/plm/tmd/plm_tmd_module.c b/orte/mca/plm/tmd/plm_tmd_module.c index a559d23256..6a7acc36db 100644 --- a/orte/mca/plm/tmd/plm_tmd_module.c +++ b/orte/mca/plm/tmd/plm_tmd_module.c @@ -85,13 +85,19 @@ static int plm_tmd_signal_job(orte_jobid_t jobid, int32_t signal); static int plm_tmd_finalize(void); static int plm_tmd_connect(void); -static int plm_tmd_disconnect(void); static void failed_start(int fd, short event, void *arg); +static int obit_submit(int tid); /* * Local "global" variables */ static opal_event_t *ev=NULL; +static bool connected; +static tm_event_t *events_spawn = NULL; +static tm_event_t *events_obit = NULL; +static tm_task_id *tm_task_ids = NULL; +static int *evs = NULL; +static bool time_is_up; /* * Global variable @@ -107,6 +113,20 @@ orte_plm_base_module_t orte_plm_tmd_module = { plm_tmd_finalize }; +/* catch timeout to allow cmds to progress */ +static void timer_cb(int fd, short event, void *cbdata) +{ + opal_event_t *ev = (opal_event_t*)cbdata; + + /* free event */ + if (NULL != ev) { + free(ev); + } + /* declare time is up */ + time_is_up = true; +} + + /** * Init the module */ @@ -127,6 +147,7 @@ static int plm_tmd_init(void) */ static int plm_tmd_launch_job(orte_job_t *jdata) { + orte_job_t *jdatorted; orte_job_map_t *map = NULL; orte_app_context_t **apps; orte_node_t **nodes; @@ -135,22 +156,25 @@ static int plm_tmd_launch_job(orte_job_t *jdata) char **env = NULL; char *var; char **argv = NULL; - int argc; + int argc = 0; int rc; - bool connected = false; orte_std_cntr_t launched = 0, i; char *bin_base = NULL, *lib_base = NULL; - tm_event_t *tm_events = NULL; - tm_task_id *tm_task_ids = NULL; int local_err; - tm_event_t event; bool failed_launch = true; mode_t current_umask; orte_jobid_t failed_job; + orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED; + int offset; + tm_event_t eventpolled; + orte_std_cntr_t num_daemons; + opal_event_t *timerev; + int j; /* default to declaring the daemons as failed */ failed_job = ORTE_PROC_MY_NAME->jobid; - + connected = false; + /* create a jobid for this job */ if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) { ORTE_ERROR_LOG(rc); @@ -158,7 +182,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata) } OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tmd: launching job %s", + "%s plm:tm: launching job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); @@ -182,23 +206,109 @@ static int plm_tmd_launch_job(orte_job_t *jdata) goto launch_apps; } - /* Allocate a bunch of TM events to use for tm_spawn()ing */ - tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons); - if (NULL == tm_events) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); + /* lookup the daemon job object - must do this -after- the job is + * setup so the number of required daemons has been updated + */ + if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + rc = ORTE_ERR_NOT_FOUND; goto cleanup; } - tm_task_ids = malloc(sizeof(tm_task_id) * map->num_new_daemons); - if (NULL == tm_task_ids) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); + num_daemons = jdatorted->num_procs - 1; /* do not include myself as I am already here! 
*/ + if (0 >= num_daemons) { + /* this won't work */ + rc = ORTE_ERR_BAD_PARAM; goto cleanup; } + + /* Allocate a bunch of TM events to use */ + if (NULL == events_spawn) { + /* spawn events for first launch */ + events_spawn = (tm_event_t*)malloc(num_daemons * sizeof(tm_event_t)); + if (NULL == events_spawn) { + rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } else { + /* comm_spawn launch */ + events_spawn = (tm_event_t*)realloc(events_spawn, sizeof(tm_event_t) * num_daemons); + if (NULL == events_spawn) { + rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); + goto cleanup; + } + + } + if (NULL == events_obit) { + /* obit events for first launch */ + events_obit = (tm_event_t*)malloc(num_daemons * sizeof(tm_event_t)); + if (NULL == events_obit) { + rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } else { + /* comm_spawn launch */ + events_obit = (tm_event_t*)realloc(events_obit, sizeof(tm_event_t) * num_daemons); + if (NULL == events_obit) { + rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); + goto cleanup; + } + + } + if (NULL == evs) { + /* evs for first launch */ + evs = (int*)malloc(num_daemons * sizeof(tm_event_t)); + if (NULL == evs) { + rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } else { + /* comm_spawn launch */ + evs = (int*)realloc(evs, sizeof(int) * num_daemons); + if (NULL == evs) { + rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); + goto cleanup; + } + + } + /* allocate task ids for the orteds */ + if (NULL == tm_task_ids) { + /* first launch */ + tm_task_ids = (tm_task_id*)malloc(num_daemons * sizeof(tm_task_id)); + if (NULL == tm_task_ids) { + rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } else { + /* comm_spawn launch */ + tm_task_ids = (tm_task_id*)realloc(tm_task_ids, sizeof(tm_task_id) * num_daemons); + if (NULL == tm_task_ids) { + rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } + + /* compute the offset into the event/task arrays */ + offset = num_daemons - map->num_new_daemons; + + /* initialize them */ + for (i=0; i < map->num_new_daemons; i++) { + *(tm_task_ids + offset + i) = TM_NULL_TASK; + *(events_spawn + offset + i) = TM_NULL_EVENT; + *(events_obit + offset + i) = TM_NULL_EVENT; + *(evs + offset + i) = 0; + } + /* add the daemon command (as specified by user) */ - argv = opal_argv_split(mca_plm_tmd_component.orted, ' '); - argc = opal_argv_count(argv); + orte_plm_base_setup_orted_cmd(&argc, &argv); /* Add basic orted command line options */ orte_plm_base_orted_append_basic_args(&argc, &argv, "env", @@ -208,7 +318,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata) if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { param = opal_argv_join(argv, ' '); OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tmd: final top-level argv:\n\t%s", + "%s plm:tm: final top-level argv:\n\t%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == param) ? 
"NULL" : param)); if (NULL != param) free(param); @@ -251,7 +361,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata) asprintf(&newenv, "%s/%s:%s", apps[0]->prefix_dir, bin_base, env[i] + 5); OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tmd: resetting PATH: %s", + "%s plm:tm: resetting PATH: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), newenv)); opal_setenv("PATH", newenv, true, &env); @@ -263,7 +373,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata) asprintf(&newenv, "%s/%s:%s", apps[0]->prefix_dir, lib_base, env[i] + 16); OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tmd: resetting LD_LIBRARY_PATH: %s", + "%s plm:tm: resetting LD_LIBRARY_PATH: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), newenv)); opal_setenv("LD_LIBRARY_PATH", newenv, true, &env); @@ -272,13 +382,8 @@ static int plm_tmd_launch_job(orte_job_t *jdata) } } - /* For this launch module, we encode all the required launch info - * in the daemon's environment. This includes the nidmap for the - * daemons, as well as the app_contexts and the map of ranks vs - * nodes - */ - - /* encode the nidmap */ + /* set the job state to indicate we attempted to launch */ + job_state = ORTE_JOB_STATE_FAILED_TO_START; /* Iterate through each of the nodes and spin * up a daemon. @@ -293,15 +398,15 @@ static int plm_tmd_launch_job(orte_job_t *jdata) } OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tmd: launching on node %s", + "%s plm:tm: launching on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); /* setup process name */ rc = orte_util_convert_vpid_to_string(&vpid_string, nodes[i]->daemon->name.vpid); if (ORTE_SUCCESS != rc) { - opal_output(0, "plm:tmd: unable to get daemon vpid as string"); - exit(-1); + opal_output(0, "plm:tm: unable to get daemon vpid as string"); + goto cleanup; } free(argv[proc_vpid_index]); argv[proc_vpid_index] = strdup(vpid_string); @@ -311,15 +416,15 @@ static int plm_tmd_launch_job(orte_job_t *jdata) if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { param = opal_argv_join(argv, ' '); OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tmd: executing:\n\t%s", + "%s plm:tm: executing:\n\t%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == param) ? 
"NULL" : param)); if (NULL != param) free(param); } - rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched); + rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + offset + launched, events_spawn + offset + launched); if (TM_SUCCESS != rc) { - orte_show_help("help-plm-tmd.txt", "tmd-spawn-failed", + orte_show_help("help-plm-tm.txt", "tm-spawn-failed", true, argv[0], node->name, node->launch_id); rc = ORTE_ERROR; goto cleanup; @@ -332,17 +437,57 @@ static int plm_tmd_launch_job(orte_job_t *jdata) } OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tmd:launch: finished spawning orteds", + "%s plm:tm:launch: finished spawning orteds", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* setup a timer to give the cmd a chance to be sent */ + time_is_up = false; + ORTE_DETECT_TIMEOUT(&timerev, launched, + 100, -1, timer_cb); + + ORTE_PROGRESSED_WAIT(time_is_up, 0, 1); + /* TM poll for all the spawns */ - for (i = 0; i < launched; ++i) { - rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err); + while (0 < launched) { + rc = tm_poll(TM_NULL_EVENT, &eventpolled, (int)false, &local_err); if (TM_SUCCESS != rc) { - errno = local_err; - opal_output(0, "plm:tmd: failed to poll for a spawned daemon, return status = %d", rc); + opal_output(0, "plm:tm: event poll for spawned daemon failed, return status = %d", rc); + rc = ORTE_ERROR; goto cleanup; } + /* if we get back the NULL event, then just continue */ + if (eventpolled == TM_NULL_EVENT) { + continue; + } + /* look for the spawned event */ + for (j=0; j < map->num_new_daemons; j++) { + if (eventpolled == *(events_spawn + offset + j)) { + /* got the event - check returned code */ + if (local_err) { + /* this orted failed to launch! */ + orte_show_help("help-plm-tm.txt", "tm-spawn-failed", + true, argv[0], nodes[j]->name, nodes[j]->launch_id); + rc = ORTE_ERROR; + goto cleanup; + } + /* register the corresponding obit so we can detect when this + * orted terminates + */ + if (ORTE_SUCCESS != (rc = obit_submit(offset+j))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + /* all done with this event */ + goto MOVEON; + } + } + /* if we get here, then we failed to find the event */ + opal_output(0, "TM FAILED TO FIND SPAWN EVENT WHEN LAUNCHING"); + rc = ORTE_ERROR; + goto cleanup; + + MOVEON: + launched--; } /* set a timer to tell us if one or more daemon's fails to start - use the @@ -350,7 +495,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata) */ if (0 < orte_startup_timeout) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tmd: setting startup timer for %d milliseconds", + "%s plm:tm: setting startup timer for %d milliseconds", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_startup_timeout)); ORTE_DETECT_TIMEOUT(&ev, map->num_new_daemons, @@ -361,7 +506,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata) /* wait for daemons to callback */ if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tmd: daemon launch failed for job %s on error %s", + "%s plm:tm: daemon launch failed for job %s on error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc))); goto cleanup; @@ -379,7 +524,7 @@ launch_apps: failed_job = jdata->jobid; if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tmd: launch of apps failed for job %s on error %s", + "%s plm:tm: launch of apps failed for job %s on error %s", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc))); goto cleanup; @@ -397,16 +542,6 @@ launch_apps: opal_argv_free(env); } - if (connected) { - plm_tmd_disconnect(); - } - if (NULL != tm_events) { - free(tm_events); - } - if (NULL != tm_task_ids) { - free(tm_task_ids); - } - if (NULL != lib_base) { free(lib_base); } @@ -416,7 +551,7 @@ launch_apps: /* check for failed launch - if so, force terminate */ if (failed_launch) { - orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START); + orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state); } /* setup a "heartbeat" timer to periodically check on @@ -428,7 +563,7 @@ launch_apps: } OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tmd:launch: finished", + "%s plm:tm:launch: finished", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return rc; @@ -447,6 +582,14 @@ static int plm_tmd_terminate_job(orte_jobid_t jobid) return rc; } +/* quick timeout loop */ +static bool timer_fired; + +static void quicktime_cb(int fd, short event, void *cbdata) +{ + /* declare it fired */ + timer_fired = true; +} /** * Terminate the orteds for a given job @@ -454,12 +597,143 @@ static int plm_tmd_terminate_job(orte_jobid_t jobid) int plm_tmd_terminate_orteds(void) { int rc; + orte_job_t *jdata; + orte_proc_t **daemons; + tm_event_t eventpolled; + orte_vpid_t j, alive; + int local_err; + opal_event_t *timerev=NULL; + opal_event_t *quicktime=NULL; + struct timeval quicktimeval; + bool aborted; + + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:tm: terminating orteds", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + /* lookup the daemon job object */ + if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + } + alive = jdata->num_procs - 1; /* do not include myself! */ + daemons = (orte_proc_t**)jdata->procs->addr; + aborted = false; - /* now tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) { + /* tell them to die! */ + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_NO_REPLY_CMD))) { ORTE_ERROR_LOG(rc); } + /* if there are more than just me... 
*/ + if (0 < alive) { + /* setup a max time for the daemons to die */ + time_is_up = false; + ORTE_DETECT_TIMEOUT(&timerev, alive, + 1000000, 60000000, timer_cb); + + /* give the cmds a chance to get out */ + quicktimeval.tv_sec = 0; + quicktimeval.tv_usec = 100; + timer_fired = false; + ORTE_DETECT_TIMEOUT(&quicktime, alive, 1000, 10000, quicktime_cb); + ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); + + /* now begin polling to see if daemons have terminated */ + while (!time_is_up && 0 < alive) { + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output, + "%s plm:tm: polling for daemon termination", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + rc = tm_poll(TM_NULL_EVENT, &eventpolled, (int)false, &local_err); + if (TM_SUCCESS != rc) { + errno = local_err; + opal_output(0, "plm:tm: event poll for daemon termination failed, return status = %d", rc); + continue; /* we will wait for timeout to tell us to quit */ + } + /* if we get back the NULL event, then just continue */ + if (eventpolled == TM_NULL_EVENT) { + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output, + "%s plm:tm: got null event", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* give system a little time to progress */ + timer_fired = false; + opal_evtimer_add(quicktime, &quicktimeval); + ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); + continue; + } + /* look for the obit event */ + for (j=0; j < jdata->num_procs-1; j++) { + if (eventpolled == *(events_obit + j)) { + /* got the event - check returned code */ + if (local_err == TM_ESYSTEM) { + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output, + "%s plm:tm: got TM_ESYSTEM on obit - resubmitting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + if (ORTE_SUCCESS != (rc = obit_submit(j))) { + ORTE_ERROR_LOG(rc); + goto MOVEON; + } + /* give system a little time to progress */ + timer_fired = false; + opal_evtimer_add(quicktime, &quicktimeval); + ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); + } + if (0 != local_err) { + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output, + "%s plm:tm: got error %d on obit for task %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), local_err, j)); + rc = ORTE_ERROR; + goto MOVEON; + } + /* this daemon has terminated */ + *(tm_task_ids+j) = TM_NULL_TASK; + *(events_obit+j) = TM_NULL_EVENT; + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output, + "%s plm:tm: task %d exited with status %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j, *(evs+j))); + /* update the termination status for this daemon */ + daemons[j+1]->exit_code = *(evs+j); + if (0 != daemons[j+1]->exit_code) { + daemons[j+1]->state = ORTE_PROC_STATE_ABORTED; + aborted = true; + } else { + daemons[j+1]->state = ORTE_PROC_STATE_TERMINATED; + } + jdata->num_terminated++; + /* all done with this event */ + goto MOVEON; + } + } + /* if we get here, then we failed to find the event */ + opal_output(0, "TM FAILED TO FIND OBIT EVENT"); + + MOVEON: + alive--; + } + + /* release event if not already done */ + if (NULL != quicktime) { + free(quicktime); + } + if (NULL != timerev) { + opal_event_del(timerev); + free(timerev); + } + } else { + /* still need to give the cmds a chance to get out so I can process + * them myself! 
+ */ + timer_fired = false; + ORTE_DETECT_TIMEOUT(&quicktime, 1, 1000, 10000, quicktime_cb); + ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); + } + + /* declare the daemons done */ + if (aborted || 0 < alive) { + jdata->state = ORTE_JOB_STATE_ABORTED; + } else { + jdata->state = ORTE_JOB_STATE_TERMINATED; + } + orte_trigger_event(&orteds_exit); return rc; } @@ -488,6 +762,24 @@ static int plm_tmd_finalize(void) ORTE_ERROR_LOG(rc); } + if (connected) { + tm_finalize(); + } + + /* cleanup data arrays */ + if (NULL != events_spawn) { + free(events_spawn); + } + if (NULL != events_obit) { + free(events_obit); + } + if (NULL != tm_task_ids) { + free(tm_task_ids); + } + if (NULL != evs) { + free(evs); + } + return ORTE_SUCCESS; } @@ -518,26 +810,19 @@ static int plm_tmd_connect(void) } -static int plm_tmd_disconnect(void) -{ - tm_finalize(); - - return ORTE_SUCCESS; -} - /* call this function if the timer fires indicating that one * or more daemons failed to start */ static void failed_start(int fd, short dummy, void *arg) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tmd:failed_start", + "%s plm:tm:failed_start", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* if we are aborting, ignore this */ if (orte_abnormal_term_ordered) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tmd:failed_start - abnormal term in progress", + "%s plm:tm:failed_start - abnormal term in progress", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return; } @@ -545,3 +830,21 @@ static void failed_start(int fd, short dummy, void *arg) orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START); } + +static int obit_submit(int tid) +{ + int rc; + + if (TM_SUCCESS != (rc = tm_obit(*(tm_task_ids+tid), evs+tid, events_obit+tid))) { + opal_output(0, "failed to register termination notice for task %d", tid); + rc = ORTE_ERROR; + return rc; + } + if (*(events_obit+tid) == TM_NULL_EVENT) { + opal_output(0, "task %d is already dead", tid); + } else if (*(events_obit+tid) == TM_ERROR_EVENT) { + opal_output(0, "Error on obit return - got error event for task %d", tid); + } + + return ORTE_SUCCESS; +}
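
Note on the per-daemon bookkeeping introduced above: the launch path keeps four parallel arrays (spawn events, obit events, obit exit codes, and TM task ids) that are allocated on the first launch, grown on every comm_spawn, and indexed through an offset that points at the slots belonging to the newly launched daemons. The following is a minimal sketch of that grow-and-initialize pattern, assuming only the standard PBS/Torque tm.h types; grow_daemon_arrays and the array names are illustrative, not the module's actual symbols. Because realloc(NULL, n) behaves like malloc(n), one code path covers both the first launch and a later comm_spawn.

    #include <stdlib.h>
    #include "tm.h"   /* tm_event_t, tm_task_id, TM_NULL_EVENT, TM_NULL_TASK */

    static tm_event_t *spawn_events = NULL;   /* completion events from tm_spawn */
    static tm_event_t *obit_events  = NULL;   /* termination events from tm_obit */
    static int        *obit_codes   = NULL;   /* exit codes filled in by TM */
    static tm_task_id *task_ids     = NULL;   /* one TM task id per orted */

    static int grow_daemon_arrays(int num_daemons, int num_new)
    {
        int offset = num_daemons - num_new;   /* first slot for the new daemons */
        int i;

        /* realloc(NULL, n) == malloc(n); on failure the whole launch is
         * aborted, so the old blocks are not recovered here */
        spawn_events = (tm_event_t*)realloc(spawn_events, num_daemons * sizeof(tm_event_t));
        obit_events  = (tm_event_t*)realloc(obit_events,  num_daemons * sizeof(tm_event_t));
        obit_codes   = (int*)       realloc(obit_codes,   num_daemons * sizeof(int));
        task_ids     = (tm_task_id*)realloc(task_ids,     num_daemons * sizeof(tm_task_id));
        if (NULL == spawn_events || NULL == obit_events ||
            NULL == obit_codes   || NULL == task_ids) {
            return -1;   /* caller maps this to ORTE_ERR_OUT_OF_RESOURCE */
        }

        /* only the slots for the newly spawned daemons need resetting */
        for (i = 0; i < num_new; i++) {
            task_ids[offset + i]     = TM_NULL_TASK;
            spawn_events[offset + i] = TM_NULL_EVENT;
            obit_events[offset + i]  = TM_NULL_EVENT;
            obit_codes[offset + i]   = 0;
        }
        return 0;
    }

Keeping the arrays static and growing them in place is what lets a later comm_spawn reuse the obit machinery already registered for the first set of daemons.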
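
The spawn-confirmation loop added in plm_tmd_launch_job polls TM without blocking and matches each completed event against the stored spawn events. Below is a compressed sketch of that loop, assuming the usual tm_poll signature from tm.h (int tm_poll(tm_event_t poll_event, tm_event_t *result, int wait, int *tm_errno)); confirm_spawns is an illustrative name, and the real loop additionally yields to the ORTE event library between polls and treats an unrecognized event as fatal.

    #include <stdio.h>
    #include "tm.h"

    /* Confirm num_new tm_spawn() requests by polling non-blockingly.
     * spawn_events[offset .. offset+num_new-1] were filled in by tm_spawn(). */
    static int confirm_spawns(tm_event_t *spawn_events, int offset, int num_new)
    {
        int outstanding = num_new;

        while (outstanding > 0) {
            tm_event_t ev;
            int tm_errno, j, rc;

            rc = tm_poll(TM_NULL_EVENT, &ev, 0 /* do not block */, &tm_errno);
            if (TM_SUCCESS != rc) {
                fprintf(stderr, "tm_poll failed: %d\n", rc);
                return -1;
            }
            if (TM_NULL_EVENT == ev) {
                continue;                   /* nothing has completed yet */
            }
            for (j = 0; j < num_new; j++) {
                if (ev == spawn_events[offset + j]) {
                    if (0 != tm_errno) {
                        return -1;          /* this orted failed to start */
                    }
                    outstanding--;          /* daemon j confirmed */
                    break;
                }
            }
        }
        return 0;
    }
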
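
obit_submit() relies on tm_obit() filling in an exit code and completing an event once the task dies. A small sketch of those semantics, assuming tm_obit(tm_task_id, int*, tm_event_t*) as declared in tm.h; register_obit is an illustrative name.

    #include <stdio.h>
    #include "tm.h"

    /* Ask TM to report when one task (an orted) terminates.  TM writes the
     * task's exit status to *exit_code and completes *obit_ev when the task
     * dies; special event values flag the edge cases. */
    static int register_obit(tm_task_id tid, int *exit_code, tm_event_t *obit_ev)
    {
        if (TM_SUCCESS != tm_obit(tid, exit_code, obit_ev)) {
            return -1;                        /* could not register the obit */
        }
        if (TM_NULL_EVENT == *obit_ev) {
            /* task is already dead - *exit_code is valid immediately */
        } else if (TM_ERROR_EVENT == *obit_ev) {
            fprintf(stderr, "obit registration returned the error event\n");
        }
        return 0;
    }
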
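
plm_tmd_terminate_orteds() then polls for those obit events, re-registering an obit when tm_poll reports TM_ESYSTEM and counting daemons off as they exit. The sketch below trims that bookkeeping to its core, again with illustrative names and without the overall timeout and event-library yields of the real loop; the returned abort count is what the caller would use to choose between ORTE_JOB_STATE_TERMINATED and ORTE_JOB_STATE_ABORTED.

    #include "tm.h"

    /* Wait for num_daemons registered obits to complete.  obit_events,
     * obit_codes and task_ids are the parallel arrays from the launch;
     * a nonzero exit code marks the corresponding daemon as aborted. */
    static int wait_for_obits(tm_event_t *obit_events, int *obit_codes,
                              tm_task_id *task_ids, int num_daemons)
    {
        int alive = num_daemons;
        int aborted = 0;

        while (alive > 0) {
            tm_event_t ev;
            int tm_errno, j;

            if (TM_SUCCESS != tm_poll(TM_NULL_EVENT, &ev, 0, &tm_errno)) {
                continue;                 /* the real loop gives up on a timeout */
            }
            if (TM_NULL_EVENT == ev) {
                continue;                 /* nothing has terminated yet */
            }
            for (j = 0; j < num_daemons; j++) {
                if (ev != obit_events[j]) {
                    continue;
                }
                if (TM_ESYSTEM == tm_errno) {
                    /* transient TM error - re-register this obit and keep waiting */
                    tm_obit(task_ids[j], &obit_codes[j], &obit_events[j]);
                    break;
                }
                /* daemon j is gone; record whether it exited cleanly */
                obit_events[j] = TM_NULL_EVENT;
                task_ids[j]    = TM_NULL_TASK;
                if (0 != obit_codes[j]) {
                    aborted++;
                }
                alive--;
                break;
            }
        }
        return aborted;
    }
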
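
Several spots above arm a timer and then spin on a flag while letting the event library progress (ORTE_DETECT_TIMEOUT followed by ORTE_PROGRESSED_WAIT). Stripped of the ORTE macros, the underlying pattern is just a flag set from a timer callback; this stand-alone sketch uses a placeholder progress() in place of opal_progress(), and the real macros also bound the wait by an iteration count.

    #include <stdbool.h>

    static volatile bool timer_fired = false;

    /* invoked by the event library when the timeout expires */
    static void timer_cb(void)
    {
        timer_fired = true;
    }

    /* spin until the timer fires, pumping the progress engine so queued
     * sends (e.g. the orted exit command) actually get out */
    static void wait_for_timer(void (*progress)(void))
    {
        while (!timer_fired) {
            progress();
        }
    }
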