diff --git a/orte/mca/plm/submit/.ompi_ignore b/orte/mca/plm/submit/.ompi_ignore deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/orte/mca/plm/submit/Makefile.am b/orte/mca/plm/submit/Makefile.am deleted file mode 100644 index bf9ddf3922..0000000000 --- a/orte/mca/plm/submit/Makefile.am +++ /dev/null @@ -1,46 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -dist_pkgdata_DATA = help-plm-rsh.txt - -sources = \ - plm_rsh.h \ - plm_rsh_component.c \ - plm_rsh_module.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_orte_plm_rsh_DSO -component_noinst = -component_install = mca_plm_rsh.la -else -component_noinst = libmca_plm_rsh.la -component_install = -endif - -mcacomponentdir = $(pkglibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_plm_rsh_la_SOURCES = $(sources) -mca_plm_rsh_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_plm_rsh_la_SOURCES =$(sources) -libmca_plm_rsh_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/plm/submit/configure.m4 b/orte/mca/plm/submit/configure.m4 deleted file mode 100644 index 4a40323f7b..0000000000 --- a/orte/mca/plm/submit/configure.m4 +++ /dev/null @@ -1,24 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2006 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2011 Los Alamos National Security, LLC. -# All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_plm_submit_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_orte_plm_submit_CONFIG],[ - AC_CONFIG_FILES([orte/mca/plm/submit/Makefile]) - - AC_CHECK_FUNC([fork], [plm_submit_happy="yes"], [plm_submit_happy="no"]) - - AS_IF([test "$plm_submit_happy" = "yes" -a "$orte_without_full_support" = 0], [$1], [$2]) -])dnl diff --git a/orte/mca/plm/submit/help-pls-submit.txt b/orte/mca/plm/submit/help-pls-submit.txt deleted file mode 100644 index 6202434fcd..0000000000 --- a/orte/mca/plm/submit/help-pls-submit.txt +++ /dev/null @@ -1,66 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English general help file for Open RTE's orterun. -# -[no-local-orted] -The rsh PLS component was not able to find the executable "orted" in -your PATH or in the directory where Open MPI/OpenRTE was initially installed, -and therefore cannot continue. - -For reference, your current PATH is: - - %s - -We also looked for orted in the following directory: - - %s - -[multiple-prefixes] -Specified multiple application contexts using different -settings for --prefix. Care should be taken, that corresponding -processes are mapped to different nodes. Having multiple prefixes -per node is not allowed. - -The previously set prefix was - %s - -the prefix to be set overriding: - %s - -[concurrency-less-than-zero] -The value of the MCA parameter "pls_rsh_num_concurrent" is less than -or equal to zero (%d). This parameter is used to determine how many -remote agents (typically rsh or ssh) to invoke concurrently while -launching parallel jobs. - -This value has automatically be reset to 1; processing will continue. - -[deadlock-params] -The rsh launcher has been given a number of %d concurrent daemons to -launch and is in a debug-daemons option. However, the total number of -daemons to launch (%d) is greater than this value. This is a scenario that -will cause the system to deadlock. - -To avoid deadlock, either increase the number of concurrent daemons, or -remove the debug-daemons flag. - -[unknown-user] -The user (%d) is unknown to the system (i.e. there is no corresponding -entry in the password file). Please contact your system administrator -for a fix. diff --git a/orte/mca/plm/submit/pls_submit.h b/orte/mca/plm/submit/pls_submit.h deleted file mode 100644 index 60eb4c599f..0000000000 --- a/orte/mca/plm/submit/pls_submit.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2008 The Trustees of Indiana University. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file: - * Part of the submit launcher. See plm_submit.h for an overview of how it works. - */ - -#ifndef ORTE_PLM_SUBMIT_EXPORT_H -#define ORTE_PLM_SUBMIT_EXPORT_H - -#include "orte_config.h" - -#include "opal/mca/mca.h" -#include "orte/mca/plm/plm.h" -#include "opal/threads/condition.h" - -BEGIN_C_DECLS - -/* - * Module open / close - */ -int orte_plm_submit_component_open(void); -int orte_plm_submit_component_close(void); -int orte_plm_submit_component_query(mca_base_module_t **module, int *priority); - -/* - * Startup / Shutdown - */ -int orte_plm_submit_finalize(void); - -/* - * Interface - */ -int orte_plm_submit_launch(orte_job_t*); -int orte_plm_submit_terminate_orteds(void); -int orte_plm_submit_signal_job(orte_jobid_t, int32_t); - -/** - * PLM Component - */ -struct orte_plm_submit_component_t { - orte_plm_base_component_t super; - bool debug; - bool debug_daemons; - bool timing; - int delay; - int priority; - char *agent_param; - char** agent_argv; - int agent_argc; - char* agent_path; - char* orted; - orte_std_cntr_t num_children; - orte_std_cntr_t num_concurrent; - opal_mutex_t lock; - opal_condition_t cond; -}; -typedef struct orte_plm_submit_component_t orte_plm_submit_component_t; - -ORTE_MODULE_DECLSPEC extern orte_plm_submit_component_t mca_plm_submit_component; -extern orte_plm_base_module_t orte_plm_submit_module; - -END_C_DECLS - -#endif /* ORTE_PLM_SUBMIT_EXPORT_H */ diff --git a/orte/mca/plm/submit/pls_submit_component.c b/orte/mca/plm/submit/pls_submit_component.c deleted file mode 100644 index fc8fb4a743..0000000000 --- a/orte/mca/plm/submit/pls_submit_component.c +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2008 The Trustees of Indiana University. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * These symbols are in a file by themselves to provide nice linker - * semantics. Since linkers generally pull in symbols by object - * files, keeping these symbols as the only symbols in this file - * prevents utility programs such as "ompi_info" from having to import - * entire components just to query their version and parameters. - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif -#include - -#include "opal/util/opal_environ.h" -#include "opal/util/argv.h" -#include "opal/util/path.h" -#include "opal/util/basename.h" -#include "opal/mca/base/mca_base_param.h" - - -#include "orte/mca/plm/plm.h" -#include "orte/mca/plm/base/plm_private.h" -#include "orte/mca/plm/submit/plm_submit.h" - -/* - * Local function - */ -static char **search(const char* agent_list); - -/* - * Public string showing the plm ompi_submit component version number - */ -const char *mca_plm_submit_component_version_string = - "Open MPI submit plm MCA component version " ORTE_VERSION; - - -/* - * Instantiate the public struct with all of our public information - * and pointers to our public functions in it - */ - -orte_plm_submit_component_t mca_plm_submit_component = { - { - /* First, the mca_component_t struct containing meta information - about the component itself */ - - { - ORTE_PLM_BASE_VERSION_2_0_0, - - /* Component name and version */ - "submit", - ORTE_MAJOR_VERSION, - ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION, - - /* Component open and close functions */ - orte_plm_submit_component_open, - orte_plm_submit_component_close, - orte_plm_submit_component_query - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - } - } -}; - - - -int orte_plm_submit_component_open(void) -{ - int tmp, value; - mca_base_component_t *c = &mca_plm_submit_component.super.base_version; - - /* initialize globals */ - OBJ_CONSTRUCT(&mca_plm_submit_component.lock, opal_mutex_t); - OBJ_CONSTRUCT(&mca_plm_submit_component.cond, opal_condition_t); - mca_plm_submit_component.num_children = 0; - mca_plm_submit_component.agent_argv = NULL; - mca_plm_submit_component.agent_argc = 0; - mca_plm_submit_component.agent_path = NULL; - - /* lookup parameters */ - mca_base_param_reg_int(c, "debug", - "Whether or not to enable debugging output for the submit plm component (0 or 1)", - false, false, false, &tmp); - mca_plm_submit_component.debug = OPAL_INT_TO_BOOL(tmp); - mca_base_param_reg_int(c, "num_concurrent", - "How many plm_submit_agent instances to invoke concurrently (must be > 0)", - false, false, 128, &tmp); - if (tmp <= 0) { - orte_show_help("help-plm-submit.txt", "concurrency-less-than-zero", - true, tmp); - tmp = 1; - } - mca_plm_submit_component.num_concurrent = tmp; - - if (mca_plm_submit_component.debug == 0) { - mca_base_param_reg_int_name("orte", "debug", - "Whether or not to enable debugging output for all ORTE components (0 or 1)", - false, false, false, &tmp); - mca_plm_submit_component.debug = OPAL_INT_TO_BOOL(tmp); - } - mca_base_param_reg_int_name("orte", "debug_daemons", - "Whether or not to enable debugging of daemons (0 or 1)", - false, false, false, &tmp); - mca_plm_submit_component.debug_daemons = OPAL_INT_TO_BOOL(tmp); - - tmp = mca_base_param_reg_int_name("orte", "timing", - "Request that critical timing loops be measured", - false, false, 0, &value); - if (value != 0) { - mca_plm_submit_component.timing = true; - } else { - mca_plm_submit_component.timing = false; - } - - mca_base_param_reg_string(c, "orted", - "The command name that the submit plm component will invoke for the ORTE daemon", - false, false, "orted", - &mca_plm_submit_component.orted); - - mca_base_param_reg_int(c, "priority", - "Priority of the submit plm component", - false, false, 10, - &mca_plm_submit_component.priority); - mca_base_param_reg_int(c, "delay", - "Delay (in seconds) between invocations of the remote agent, but only used when the \"debug\" MCA parameter is true, or the top-level MCA debugging is enabled (otherwise this value is ignored)", - false, false, 1, - &mca_plm_submit_component.delay); - mca_base_param_reg_int(c, "assume_same_shell", - "If set to 1, assume that the shell on the remote node is the same as the shell on the local node. Otherwise, probe for what the remote shell.", - false, false, 1, &tmp); - mca_plm_submit_component.assume_same_shell = OPAL_INT_TO_BOOL(tmp); - - mca_base_param_reg_string(c, "agent", - "The command used to launch executables on remote nodes (typically either \"ssh\" or \"submit\")", - false, false, "ssh : submit", - &mca_plm_submit_component.agent_param); - - return ORTE_SUCCESS; -} - - -int orte_plm_submit_component_query(mca_base_module_t **module, int *priority) -{ - char *bname; - size_t i; - - /* Take the string that was given to us by the pla_submit_agent MCA - param and search for it */ - mca_plm_submit_component.agent_argv = - search(mca_plm_submit_component.agent_param); - mca_plm_submit_component.agent_argc = - opal_argv_count(mca_plm_submit_component.agent_argv); - mca_plm_submit_component.agent_path = NULL; - if (mca_plm_submit_component.agent_argc > 0) { - /* If the agent is ssh, and debug was not selected, then - automatically add "-x" */ - - bname = opal_basename(mca_plm_submit_component.agent_argv[0]); - if (NULL != bname && 0 == strcmp(bname, "ssh") && - mca_plm_submit_component.debug == 0) { - for (i = 1; NULL != mca_plm_submit_component.agent_argv[i]; ++i) { - if (0 == strcasecmp("-x", - mca_plm_submit_component.agent_argv[i])) { - break; - } - } - if (NULL == mca_plm_submit_component.agent_argv[i]) { - opal_argv_append(&mca_plm_submit_component.agent_argc, - &mca_plm_submit_component.agent_argv, "-x"); - } - } - if (NULL != bname) { - free(bname); - } - } - - /* If we didn't find the agent in the path, then don't use this - component */ - if (NULL == mca_plm_submit_component.agent_argv || - NULL == mca_plm_submit_component.agent_argv[0]) { - *module = NULL; - return ORTE_ERROR: - } - mca_plm_submit_component.agent_path = - opal_path_findv(mca_plm_submit_component.agent_argv[0], X_OK, - environ, NULL); - if (NULL == mca_plm_submit_component.agent_path) { - *module = NULL; - return ORTE_ERROR: - } - *priority = mca_plm_submit_component.priority; - *module = (mca_base_module_t *) &orte_plm_submit_module; - return ORTE_SUCCESS; -} - - -int orte_plm_submit_component_close(void) -{ - /* cleanup state */ - OBJ_DESTRUCT(&mca_plm_submit_component.lock); - OBJ_DESTRUCT(&mca_plm_submit_component.cond); - if (NULL != mca_plm_submit_component.orted) { - free(mca_plm_submit_component.orted); - } - if (NULL != mca_plm_submit_component.agent_param) { - free(mca_plm_submit_component.agent_param); - } - if (NULL != mca_plm_submit_component.agent_argv) { - opal_argv_free(mca_plm_submit_component.agent_argv); - } - if (NULL != mca_plm_submit_component.agent_path) { - free(mca_plm_submit_component.agent_path); - } - return ORTE_SUCCESS; -} - - -/* - * Take a colon-delimited list of agents and locate the first one that - * we are able to find in the PATH. Split that one into argv and - * return it. If nothing found, then return NULL. - */ -static char **search(const char* agent_list) -{ - int i, j; - char *line, **lines = opal_argv_split(agent_list, ':'); - char **tokens, *tmp; - char cwd[PATH_MAX]; - - getcwd(cwd, PATH_MAX); - for (i = 0; NULL != lines[i]; ++i) { - line = lines[i]; - - /* Trim whitespace at the beginning and end of the line */ - for (j = 0; '\0' != line[j] && isspace(line[j]); ++line) { - continue; - } - for (j = strlen(line) - 2; j > 0 && isspace(line[j]); ++j) { - line[j] = '\0'; - } - if (strlen(line) <= 0) { - continue; - } - - /* Split it */ - tokens = opal_argv_split(line, ' '); - - /* Look for the first token in the PATH */ - tmp = opal_path_findv(tokens[0], X_OK, environ, cwd); - if (NULL != tmp) { - free(tokens[0]); - tokens[0] = tmp; - opal_argv_free(lines); - return tokens; - } - - /* Didn't find it */ - opal_argv_free(tokens); - } - - /* Doh -- didn't find anything */ - opal_argv_free(lines); - return NULL; -} diff --git a/orte/mca/plm/submit/pls_submit_module.c b/orte/mca/plm/submit/pls_submit_module.c deleted file mode 100644 index e2f1f17527..0000000000 --- a/orte/mca/plm/submit/pls_submit_module.c +++ /dev/null @@ -1,993 +0,0 @@ -/* - * Copyright (c) 2004-2008 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * These symbols are in a file by themselves to provide nice linker - * semantics. Since linkers generally pull in symbols by object - * files, keeping these symbols as the only symbols in this file - * prevents utility programs such as "ompi_info" from having to import - * entire components just to query their version and parameters. - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif -#include -#include -#ifdef HAVE_STRINGS_H -#include -#endif -#ifdef HAVE_SYS_SELECT_H -#include -#endif -#ifdef HAVE_SYS_TIME_H -#include -#endif -#ifdef HAVE_SYS_TYPES_H -#include -#endif -#ifdef HAVE_SYS_STAT_H -#include -#endif -#ifdef HAVE_SYS_WAIT_H -#include -#endif -#include -#include -#ifdef HAVE_PWD_H -#include -#endif - -#include "opal/mca/installdirs/installdirs.h" -#include "opal/mca/base/mca_base_param.h" -#include "opal/util/if.h" -#include "opal/util/output.h" -#include "opal/util/os_path.h" -#include "opal/util/path.h" -#include "opal/mca/event/event.h" -#include "opal/util/argv.h" -#include "opal/util/opal_environ.h" -#include "orte/util/show_help.h" -#include "opal/util/basename.h" -#include "opal/util/opal_environ.h" - -#include "orte/util/proc_info.h" -#include "orte/util/univ_info.h" - -#include "orte/runtime/orte_wait.h" -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/params.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ras/ras_types.h" -#include "orte/mca/rmaps/rmaps.h" - -#include "orte/mca/plm/plm.h" -#include "orte/mca/plm/base/base.h" -#include "orte/mca/plm/base/plm_private.h" -#include "orte/mca/plm/submit/plm_submit.h" - -static int orte_plm_submit_init(void); - -orte_plm_base_module_t orte_plm_submit_module = { - orte_plm_submit_init, - orte_plm_base_set_hnp_name, - orte_plm_submit_launch, - NULL, - orte_plm_base_orted_terminate_job, - orte_plm_submit_terminate_orteds, - orte_plm_base_orted_kill_local_procs, - orte_plm_submit_signal_job, - orte_plm_submit_finalize -}; - -static void set_handler_default(int sig); - -enum { - ORTE_PLM_submit_SHELL_BASH = 0, - ORTE_PLM_submit_SHELL_ZSH, - ORTE_PLM_submit_SHELL_TCSH, - ORTE_PLM_submit_SHELL_CSH, - ORTE_PLM_submit_SHELL_KSH, - ORTE_PLM_submit_SHELL_SH, - ORTE_PLM_submit_SHELL_UNKNOWN -}; - -typedef int orte_plm_submit_shell; - -static const char * orte_plm_submit_shell_name[] = { - "bash", - "zsh", - "tcsh", /* tcsh has to be first otherwise strstr finds csh */ - "csh", - "ksh", - "sh", - "unknown" -}; - -/* local global storage of timing variables */ -static struct timeval joblaunchstart, joblaunchstop; - -/* global storage of active jobid being launched */ -static orte_jobid_t active_job=ORTE_JOBID_INVALID; - -/* - * Init module - */ -static int orte_plm_submit_init(void) -{ - int rc; - - if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) { - ORTE_ERROR_LOG(rc); - } - return rc; -} - -/** - * Check the Shell variable on the specified node - */ - -static int orte_plm_submit_probe(orte_node_t *node, orte_plm_submit_shell * shell) -{ - char ** argv; - int argc, rc = ORTE_SUCCESS, i; - int fd[2]; - pid_t pid; - char outbuf[4096]; - - if (mca_plm_submit_component.debug) { - opal_output(0, "plm:submit: going to check SHELL variable on node %s\n", - node->name); - } - *shell = ORTE_PLM_submit_SHELL_UNKNOWN; - if (pipe(fd)) { - opal_output(0, "plm:submit: pipe failed with errno=%d\n", errno); - return ORTE_ERR_IN_ERRNO; - } - if ((pid = fork()) < 0) { - opal_output(0, "plm:submit: fork failed with errno=%d\n", errno); - return ORTE_ERR_IN_ERRNO; - } - else if (pid == 0) { /* child */ - if (dup2(fd[1], 1) < 0) { - opal_output(0, "plm:submit: dup2 failed with errno=%d\n", errno); - exit(01); - } - /* Build argv array */ - argv = opal_argv_copy(mca_plm_submit_component.agent_argv); - argc = mca_plm_submit_component.agent_argc; - opal_argv_append(&argc, &argv, node->name); - opal_argv_append(&argc, &argv, "echo $SHELL"); - - execvp(argv[0], argv); - exit(errno); - } - if (close(fd[1])) { - opal_output(0, "plm:submit: close failed with errno=%d\n", errno); - return ORTE_ERR_IN_ERRNO; - } - - { - ssize_t ret = 1; - char* ptr = outbuf; - size_t outbufsize = sizeof(outbuf); - - do { - ret = read (fd[0], ptr, outbufsize-1); - if (ret < 0) { - if (errno == EINTR) - continue; - opal_output( 0, "Unable to detect the remote shell (error %s)\n", - strerror(errno) ); - rc = ORTE_ERR_IN_ERRNO; - break; - } - if( outbufsize > 1 ) { - outbufsize -= ret; - ptr += ret; - } - } while( 0 != ret ); - *ptr = '\0'; - } - close(fd[0]); - - if( outbuf[0] != '\0' ) { - char *sh_name = rindex(outbuf, '/'); - if( NULL != sh_name ) { - sh_name++; /* skip '/' */ - /* We cannot use "echo -n $SHELL" because -n is not portable. Therefore - * we have to remove the "\n" */ - if ( sh_name[strlen(sh_name)-1] == '\n' ) { - sh_name[strlen(sh_name)-1] = '\0'; - } - /* Search for the substring of known shell-names */ - for (i = 0; i < (int)(sizeof (orte_plm_submit_shell_name)/ - sizeof(orte_plm_submit_shell_name[0])); i++) { - if ( 0 == strcmp(sh_name, orte_plm_submit_shell_name[i]) ) { - *shell = i; - break; - } - } - } - } - if (mca_plm_submit_component.debug) { - if( ORTE_PLM_submit_SHELL_UNKNOWN == *shell ) { - opal_output(0, "plm:submit: node:%s has unhandled SHELL\n", - node->name); - } else { - opal_output(0, "plm:submit: node:%s has SHELL: %s\n", - node->name, orte_plm_submit_shell_name[*shell]); - } - } - return rc; -} - -/** - * Fill the exec_path variable with the directory to the orted - */ - -static int orte_plm_submit_fill_exec_path ( char ** exec_path) -{ - struct stat buf; - - asprintf(exec_path, "%s/orted", opal_install_dirs.bindir); - if (0 != stat(*exec_path, &buf)) { - char *path = getenv("PATH"); - if (NULL == path) { - path = ("PATH is empty!"); - } - orte_show_help("help-plm-submit.txt", "no-local-orted", - true, path, opal_install_dirs.bindir); - return ORTE_ERR_NOT_FOUND; - } - return ORTE_SUCCESS; -} - -/** - * Callback on daemon exit. - */ - -static void orte_plm_submit_wait_daemon(pid_t pid, int status, void* cbdata) -{ - unsigned long deltat; - - if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { - /* tell the user something went wrong */ - opal_output(0, "ERROR: A daemon failed to start as expected."); - opal_output(0, "ERROR: There may be more information available from"); - opal_output(0, "ERROR: the remote shell (see above)."); - - if (WIFEXITED(status)) { - opal_output(0, "ERROR: The daemon exited unexpectedly with status %d.", - WEXITSTATUS(status)); - } else if (WIFSIGNALED(status)) { -#ifdef WCOREDUMP - if (WCOREDUMP(status)) { - opal_output(0, "The daemon received a signal %d (with core).", - WTERMSIG(status)); - } else { - opal_output(0, "The daemon received a signal %d.", WTERMSIG(status)); - } -#else - opal_output(0, "The daemon received a signal %d.", WTERMSIG(status)); -#endif /* WCOREDUMP */ - } else { - opal_output(0, "No extra status information is available: %d.", status); - } - /* The usual reasons for ssh to exit abnormally all are a pretty good - indication that the child processes aren't going to start up properly. - Set the job state to indicate we failed to launch so orterun's exit status - will be non-zero and forcibly terminate the job so orterun can exit - */ - orte_errmgr.update_state(active_job, ORTE_JOB_STATE_FAILED_TO_START, - NULL, ORTE_PROC_STATE_UNDEF, 0, status); - - } /* if abnormal exit */ - - /* release any waiting threads */ - OPAL_THREAD_LOCK(&mca_plm_submit_component.lock); - - if (mca_plm_submit_component.num_children-- >= - mca_plm_submit_component.num_concurrent || - mca_plm_submit_component.num_children == 0) { - opal_condition_signal(&mca_plm_submit_component.cond); - } - - if (mca_plm_submit_component.timing && mca_plm_submit_component.num_children == 0) { - if (0 != gettimeofday(&joblaunchstop, NULL)) { - opal_output(0, "plm_submit: could not obtain job launch stop time"); - } else { - deltat = (joblaunchstop.tv_sec - joblaunchstart.tv_sec)*1000000 + - (joblaunchstop.tv_usec - joblaunchstart.tv_usec); - opal_output(0, "plm_submit: total time to launch job is %lu usec", deltat); - } - } - - OPAL_THREAD_UNLOCK(&mca_plm_submit_component.lock); - -} - -/** - * Launch a daemon (bootproxy) on each node. The daemon will be responsible - * for launching the application. - */ - -/* When working in this function, ALWAYS jump to "cleanup" if - * you encounter an error so that orterun will be woken up and - * the job can cleanly terminate - */ -int orte_plm_submit_launch(orte_job_t *jdata) -{ - orte_job_map_t *map; - orte_std_cntr_t num_nodes; - int node_name_index1; - int proc_vpid_index; - int local_exec_index, local_exec_index_end; - char *vpid_string = NULL; - char *param; - char **argv = NULL; - char *prefix_dir; - int argc; - int rc; - sigset_t sigs; - struct passwd *p; - bool remote_sh = false, remote_csh = false; - bool local_sh = false, local_csh = false; - char *lib_base = NULL, *bin_base = NULL; - bool failed_launch = true; - orte_app_context_t **apps; - orte_node_t **nodes; - orte_std_cntr_t nnode; - - if (mca_plm_submit_component.timing) { - if (0 != gettimeofday(&joblaunchstart, NULL)) { - opal_output(0, "plm_submit: could not obtain start time"); - joblaunchstart.tv_sec = 0; - joblaunchstart.tv_usec = 0; - } - } - - /* setup the job */ - if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:submit: launching job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid))); - - /* set the active jobid */ - active_job = jobid; - - /* Get the map for this job */ - if (NULL == (map = orte_rmaps.get_job_map(active_job))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto cleanup; - } - apps = (orte_app_context_t**)jdata->apps->addr; - nodes = (orte_node_t**)map->nodes->addr; - - /* account for any reuse of daemons */ - if (ORTE_SUCCESS != (rc = orte_plm_base_launch_on_existing_daemons(map))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - num_nodes = map->num_new_daemons; - if (0 == num_nodes) { - /* have all the daemons we need - launch app */ - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:submit: no new daemons to launch", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - goto launch_apps; - } - - if (mca_plm_submit_component.debug_daemons && - mca_plm_submit_component.num_concurrent < num_nodes) { - /** - * If we are in '--debug-daemons' we keep the ssh connection - * alive for the span of the run. If we use this option - * AND we launch on more than "num_concurrent" machines - * then we will deadlock. No connections are terminated - * until the job is complete, no job is started - * since all the orteds are waiting for all the others - * to come online, and the others ore not launched because - * we are waiting on those that have started to terminate - * their ssh tunnels. :( - * As we cannot run in this situation, pretty print the error - * and return an error code. - */ - orte_show_help("help-plm-submit.txt", "deadlock-params", - true, mca_plm_submit_component.num_concurrent, num_nodes); - rc = ORTE_ERR_FATAL; - goto cleanup; - } - - /* - * After a discussion between Ralph & Jeff, we concluded that we - * really are handling the prefix dir option incorrectly. It currently - * is associated with an app_context, yet it really refers to the - * location where OpenRTE/Open MPI is installed on a NODE. Fixing - * this right now would involve significant change to orterun as well - * as elsewhere, so we will intentionally leave this incorrect at this - * point. The error, however, is identical to that seen in all prior - * releases of OpenRTE/Open MPI, so our behavior is no worse than before. - * - * A note to fix this, along with ideas on how to do so, has been filed - * on the project's Trac system under "feature enhancement". - * - * For now, default to the prefix_dir provided in the first app_context. - * Since there always MUST be at least one app_context, we are safe in - * doing this. - */ - prefix_dir = apps[0]->prefix_dir; - - /* What is our local shell? */ - p = getpwuid(getuid()); - if( NULL == p ) { - /* This user is unknown to the system. Therefore, there is no reason we - * spawn whatsoever in his name. Give up with a HUGE error message. - */ - orte_show_help( "help-plm-submit.txt", "unknown-user", true, (int)getuid() ); - rc = ORTE_ERR_FATAL; - goto cleanup; - } else { - int i; - char *sh_name = NULL; - - sh_name = rindex(p->pw_shell, '/'); - sh_name++; /* skip the '\' */ - for (i = 0; i < (int)(sizeof (orte_plm_submit_shell_name)/ - sizeof(orte_plm_submit_shell_name[0])); i++) { - if ( 0 == strcmp(sh_name, orte_plm_submit_shell_name[i]) ) { - switch (i) { - case ORTE_PLM_submit_SHELL_SH: /* fall through */ - case ORTE_PLM_submit_SHELL_KSH: /* fall through */ - case ORTE_PLM_submit_SHELL_ZSH: /* fall through */ - case ORTE_PLM_submit_SHELL_BASH: local_sh = true; break; - case ORTE_PLM_submit_SHELL_TCSH: /* fall through */ - case ORTE_PLM_submit_SHELL_CSH: local_csh = true; break; - /* The match has been done, there is no need for a default case here */ - } - /* I did match one of the known shells, so now we're done with the shell detection */ - break; - } - } - if ( i == ORTE_PLM_submit_SHELL_UNKNOWN ) { - opal_output(0, "WARNING: local probe returned unhandled shell:%s assuming bash\n", - sh_name); - local_sh = true; - } - - if (mca_plm_submit_component.debug) { - opal_output(0, "plm:submit: local csh: %d, local sh: %d\n", - local_csh, local_sh); - } - } - - /* What is our remote shell? */ - if (mca_plm_submit_component.assume_same_shell) { - remote_sh = local_sh; - remote_csh = local_csh; - if (mca_plm_submit_component.debug) { - opal_output(0, "plm:submit: assuming same remote shell as local shell"); - } - } else { - orte_plm_submit_shell shell; - rc = orte_plm_submit_probe(nodes[0], &shell); - - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - switch (shell) { - case ORTE_PLM_submit_SHELL_SH: /* fall through */ - case ORTE_PLM_submit_SHELL_KSH: /* fall through */ - case ORTE_PLM_submit_SHELL_BASH: remote_sh = true; break; - case ORTE_PLM_submit_SHELL_TCSH: /* fall through */ - case ORTE_PLM_submit_SHELL_CSH: remote_csh = true; break; - default: - opal_output(0, "WARNING: submit probe returned unhandled shell:%s assuming bash\n", - orte_plm_submit_shell_name[shell]); - remote_sh = true; - } - } - if (mca_plm_submit_component.debug) { - opal_output(0, "plm:submit: remote csh: %d, remote sh: %d\n", - remote_csh, remote_sh); - } - - /* - * Build argv array - */ - argv = opal_argv_copy(mca_plm_submit_component.agent_argv); - argc = mca_plm_submit_component.agent_argc; - node_name_index1 = argc; - opal_argv_append(&argc, &argv, "