From b475421c162846bbd29e495b3e40ecd79166aa37 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sat, 26 Nov 2011 02:33:05 +0000 Subject: [PATCH] As promised, rationalize the rsh support. Remove rshbase and the base rsh support, centralizing all rsh support into the rsh component. Remove the "slave" launch support as that experiment is complete. Fix tree spawn and make that the default method for rsh launch, turning it "off" for qrsh as that system does not support tree spawn. This commit was SVN r25507. --- orte/config/config_files.m4 | 3 +- orte/mca/ess/slave/.windows | 12 - orte/mca/ess/slave/Makefile.am | 45 - orte/mca/ess/slave/configure.m4 | 21 - orte/mca/ess/slave/ess_slave.h | 36 - orte/mca/ess/slave/ess_slave_component.c | 85 - orte/mca/ess/slave/ess_slave_module.c | 534 ----- orte/mca/plm/base/Makefile.am | 6 +- orte/mca/plm/base/plm_base_close.c | 18 +- orte/mca/plm/base/plm_base_open.c | 46 +- orte/mca/plm/base/plm_base_rsh_support.c | 1835 ----------------- orte/mca/plm/base/plm_base_rsh_support.h | 89 - orte/mca/plm/base/plm_private.h | 24 +- orte/mca/plm/rsh/plm_rsh.h | 28 +- orte/mca/plm/rsh/plm_rsh_component.c | 144 +- orte/mca/plm/rsh/plm_rsh_module.c | 959 ++++----- orte/mca/plm/rshbase/Makefile.am | 46 - orte/mca/plm/rshbase/configure.m4 | 31 - orte/mca/plm/rshbase/help-plm-rshbase.txt | 77 - orte/mca/plm/rshbase/plm_rshbase.h | 58 - orte/mca/plm/rshbase/plm_rshbase_component.c | 162 -- orte/mca/plm/rshbase/plm_rshbase_module.c | 576 ------ orte/mca/plm/slurm/plm_slurm_module.c | 22 - orte/mca/plm/tm/plm_tm_module.c | 21 - orte/mca/routed/binomial/routed_binomial.c | 51 +- orte/mca/routed/cm/routed_cm.c | 28 +- orte/mca/routed/direct/routed_direct.c | 11 +- orte/mca/routed/linear/routed_linear.c | 55 +- orte/mca/routed/radix/routed_radix.c | 51 +- orte/mca/routed/routed.h | 3 +- orte/mca/routed/slave/Makefile.am | 37 - orte/mca/routed/slave/configure.m4 | 19 - orte/mca/routed/slave/routed_slave.c | 352 ---- orte/mca/routed/slave/routed_slave.h | 26 - .../mca/routed/slave/routed_slave_component.c | 55 - orte/orted/orted_comm.c | 21 +- orte/orted/orted_main.c | 111 +- orte/runtime/orte_finalize.c | 5 +- orte/runtime/orte_globals.c | 4 - orte/runtime/orte_globals.h | 4 - orte/runtime/orte_mca_params.c | 18 - orte/tools/Makefile.am | 4 +- orte/tools/orte-bootproxy/Makefile.am | 27 - orte/tools/orte-bootproxy/orte-bootproxy.sh | 91 - 44 files changed, 665 insertions(+), 5186 deletions(-) delete mode 100644 orte/mca/ess/slave/.windows delete mode 100644 orte/mca/ess/slave/Makefile.am delete mode 100644 orte/mca/ess/slave/configure.m4 delete mode 100644 orte/mca/ess/slave/ess_slave.h delete mode 100644 orte/mca/ess/slave/ess_slave_component.c delete mode 100644 orte/mca/ess/slave/ess_slave_module.c delete mode 100644 orte/mca/plm/base/plm_base_rsh_support.c delete mode 100644 orte/mca/plm/base/plm_base_rsh_support.h delete mode 100644 orte/mca/plm/rshbase/Makefile.am delete mode 100644 orte/mca/plm/rshbase/configure.m4 delete mode 100644 orte/mca/plm/rshbase/help-plm-rshbase.txt delete mode 100644 orte/mca/plm/rshbase/plm_rshbase.h delete mode 100644 orte/mca/plm/rshbase/plm_rshbase_component.c delete mode 100644 orte/mca/plm/rshbase/plm_rshbase_module.c delete mode 100644 orte/mca/routed/slave/Makefile.am delete mode 100644 orte/mca/routed/slave/configure.m4 delete mode 100644 orte/mca/routed/slave/routed_slave.c delete mode 100644 orte/mca/routed/slave/routed_slave.h delete mode 100644 orte/mca/routed/slave/routed_slave_component.c delete mode 100644 orte/tools/orte-bootproxy/Makefile.am delete mode 100755 orte/tools/orte-bootproxy/orte-bootproxy.sh diff --git a/orte/config/config_files.m4 b/orte/config/config_files.m4 index 64c87f9e08..e245d3bcd4 100644 --- a/orte/config/config_files.m4 +++ b/orte/config/config_files.m4 @@ -4,6 +4,8 @@ # Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. +# Copyright (c) 2011 Los Alamos National Security, LLC. All rights +# reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -28,7 +30,6 @@ AC_DEFUN([ORTE_CONFIG_FILES],[ orte/tools/orte-ps/Makefile orte/tools/orte-clean/Makefile orte/tools/orte-top/Makefile - orte/tools/orte-bootproxy/Makefile orte/tools/orte-migrate/Makefile orte/tools/orte-info/Makefile ]) diff --git a/orte/mca/ess/slave/.windows b/orte/mca/ess/slave/.windows deleted file mode 100644 index aa7d7bbbe5..0000000000 --- a/orte/mca/ess/slave/.windows +++ /dev/null @@ -1,12 +0,0 @@ -# -# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# Specific to this module -mca_link_libraries=libopen-rte diff --git a/orte/mca/ess/slave/Makefile.am b/orte/mca/ess/slave/Makefile.am deleted file mode 100644 index f770e575bb..0000000000 --- a/orte/mca/ess/slave/Makefile.am +++ /dev/null @@ -1,45 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -EXTRA_DIST = .windows - -sources = \ - ess_slave.h \ - ess_slave_component.c \ - ess_slave_module.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_orte_ess_slave_DSO -component_noinst = -component_install = mca_ess_slave.la -else -component_noinst = libmca_ess_slave.la -component_install = -endif - -mcacomponentdir = $(pkglibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_ess_slave_la_SOURCES = $(sources) -mca_ess_slave_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_ess_slave_la_SOURCES =$(sources) -libmca_ess_slave_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/ess/slave/configure.m4 b/orte/mca/ess/slave/configure.m4 deleted file mode 100644 index af7d29504c..0000000000 --- a/orte/mca/ess/slave/configure.m4 +++ /dev/null @@ -1,21 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2011 Los Alamos National Security, LLC. -# All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -AC_DEFUN([MCA_orte_ess_slave_PRIORITY], [10]) - -# MCA_ess_slave_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_orte_ess_slave_CONFIG], [ - AC_CONFIG_FILES([orte/mca/ess/slave/Makefile]) - - AS_IF([test "$orte_without_full_support" = 0], - [$1], - [$2]) -]) diff --git a/orte/mca/ess/slave/ess_slave.h b/orte/mca/ess/slave/ess_slave.h deleted file mode 100644 index abe688c098..0000000000 --- a/orte/mca/ess/slave/ess_slave.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef ORTE_ESS_SLAVE_H -#define ORTE_ESS_SLAVE_H - -BEGIN_C_DECLS - -/* - * Module open / close - */ -int orte_ess_slave_component_open(void); -int orte_ess_slave_component_close(void); -int orte_ess_slave_component_query(mca_base_module_t **module, int *priority); - - -ORTE_MODULE_DECLSPEC extern orte_ess_base_component_t mca_ess_slave_component; - -END_C_DECLS - -#endif /* ORTE_ESS_SLAVE_H */ diff --git a/orte/mca/ess/slave/ess_slave_component.c b/orte/mca/ess/slave/ess_slave_component.c deleted file mode 100644 index 0ba91570e8..0000000000 --- a/orte/mca/ess/slave/ess_slave_component.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * These symbols are in a file by themselves to provide nice linker - * semantics. Since linkers generally pull in symbols by object - * files, keeping these symbols as the only symbols in this file - * prevents utility programs such as "ompi_info" from having to import - * entire components just to query their version and parameters. - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/base/mca_base_param.h" - - -#include "orte/mca/ess/ess.h" -#include "orte/mca/ess/slave/ess_slave.h" - -extern orte_ess_base_module_t orte_ess_slave_module; - -/* - * Instantiate the public struct with all of our public information - * and pointers to our public functions in it - */ -orte_ess_base_component_t mca_ess_slave_component = { - { - ORTE_ESS_BASE_VERSION_2_0_0, - - /* Component name and version */ - "slave", - ORTE_MAJOR_VERSION, - ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION, - - /* Component open and close functions */ - orte_ess_slave_component_open, - orte_ess_slave_component_close, - orte_ess_slave_component_query - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - } -}; - - -int -orte_ess_slave_component_open(void) -{ - return ORTE_SUCCESS; -} - -int orte_ess_slave_component_query(mca_base_module_t **module, int *priority) -{ - /* we are the slave module, so set the priority so - * we can only be selected if directed to do so - */ - - *priority = 0; - *module = (mca_base_module_t *)&orte_ess_slave_module; - return ORTE_SUCCESS; -} - - -int -orte_ess_slave_component_close(void) -{ - return ORTE_SUCCESS; -} - diff --git a/orte/mca/ess/slave/ess_slave_module.c b/orte/mca/ess/slave/ess_slave_module.c deleted file mode 100644 index def1346b31..0000000000 --- a/orte/mca/ess/slave/ess_slave_module.c +++ /dev/null @@ -1,534 +0,0 @@ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include -#include -#ifdef HAVE_FCNTL_H -#include -#endif -#ifdef HAVE_UNISTD_H -#include -#endif - -#include "opal/mca/event/event.h" -#include "opal/runtime/opal.h" -#include "opal/mca/paffinity/paffinity.h" - -#include "orte/util/show_help.h" -#include "opal/mca/mca.h" -#include "opal/mca/base/base.h" -#include "opal/mca/base/mca_base_param.h" -#include "opal/util/output.h" -#include "opal/util/malloc.h" - -#include "orte/mca/rml/base/base.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/routed/base/base.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/grpcomm/base/base.h" -#include "orte/mca/iof/base/base.h" -#include "orte/mca/ess/base/base.h" -#include "orte/mca/ess/ess.h" -#include "orte/mca/ras/base/base.h" -#include "orte/mca/plm/base/base.h" - -#include "orte/mca/rmaps/base/base.h" -#if OPAL_ENABLE_FT_CR == 1 -#include "orte/mca/snapc/base/base.h" -#endif -#include "orte/mca/filem/base/base.h" -#include "orte/util/proc_info.h" -#include "orte/util/session_dir.h" -#include "orte/util/name_fns.h" -#include "orte/util/nidmap.h" - -#include "orte/runtime/runtime.h" -#include "orte/runtime/orte_wait.h" -#include "orte/runtime/orte_globals.h" - -#include "orte/runtime/orte_cr.h" -#include "orte/mca/ess/ess.h" -#include "orte/mca/ess/base/base.h" -#include "orte/mca/ess/slave/ess_slave.h" - -static int slave_set_name(void); - -static int rte_init(void); -static int rte_finalize(void); -static opal_paffinity_locality_t proc_get_locality(orte_process_name_t *proc); -static orte_vpid_t proc_get_daemon(orte_process_name_t *proc); -static char* proc_get_hostname(orte_process_name_t *proc); -static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc); -static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc); -static int update_pidmap(opal_byte_object_t *bo); -static int update_nidmap(opal_byte_object_t *bo); - -#if OPAL_ENABLE_FT_CR == 1 -static int rte_ft_event(int state); -static int ess_slave_ft_event_update_process_info(orte_process_name_t proc, pid_t pid); -#endif - -orte_ess_base_module_t orte_ess_slave_module = { - rte_init, - rte_finalize, - orte_ess_base_app_abort, - proc_get_locality, - proc_get_daemon, - proc_get_hostname, - proc_get_local_rank, - proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ - update_pidmap, - update_nidmap, -#if OPAL_ENABLE_FT_CR == 1 - rte_ft_event -#else - NULL -#endif -}; - -static int rte_init(void) -{ - int ret; - char *error = NULL; - - /* run the prolog */ - if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { - error = "orte_ess_base_std_prolog"; - goto error; - } - - /* Start by getting a unique name from the enviro */ - slave_set_name(); - - /* use the default procedure to finish my setup */ - if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { - ORTE_ERROR_LOG(ret); - error = "orte_ess_base_app_setup"; - goto error; - } - - /* init my nidmap arrays - no data can be available, but - * we want to ensure that nobody else who looks at - * those arrays will segfault - */ - if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { - ORTE_ERROR_LOG(ret); - error = "orte_util_nidmap_init"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - return ORTE_SUCCESS; - -error: - orte_show_help("help-orte-runtime.txt", - "orte_init:startup:internal-failure", - true, error, ORTE_ERROR_NAME(ret), ret); - - return ret; -} - -static int rte_finalize(void) -{ - int ret; - - /* use the default procedure to finish */ - if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) { - ORTE_ERROR_LOG(ret); - } - - /* deconstruct the nidmap and jobmap arrays */ - orte_util_nidmap_finalize(); - - return ret; -} - -static opal_paffinity_locality_t proc_get_locality(orte_process_name_t *proc) -{ - /* no proc can be local */ - - OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, - "%s ess:slave: proc %s is REMOTE", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - return OPAL_PROC_NON_LOCAL; - -} - -static orte_vpid_t proc_get_daemon(orte_process_name_t *proc) -{ - orte_ns_cmp_bitmask_t mask; - - mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID; - - /* if it is me, the answer is my daemon's vpid */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) { - return ORTE_PROC_MY_DAEMON->vpid; - } - - /* otherwise, no idea */ - return ORTE_VPID_INVALID; -} - -static char* proc_get_hostname(orte_process_name_t *proc) -{ - orte_ns_cmp_bitmask_t mask; - - mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID; - /* if it is me, the answer is my nodename */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) { - return orte_process_info.nodename; - } - - /* otherwise, no idea */ - return NULL; -} - -static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc) -{ - orte_ns_cmp_bitmask_t mask; - - mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID; - /* if it is me, the local rank is zero */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) { - return 0; - } - - /* otherwise, no idea */ - return ORTE_LOCAL_RANK_INVALID; -} - -static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc) -{ - /* if it is me, the node rank is zero */ - if (proc->jobid == ORTE_PROC_MY_NAME->jobid && - proc->vpid == ORTE_PROC_MY_NAME->vpid) { - return 0; - } - - /* otherwise, no idea */ - return ORTE_NODE_RANK_INVALID; -} - -static int update_pidmap(opal_byte_object_t *bo) -{ - return ORTE_SUCCESS; -} - -static int update_nidmap(opal_byte_object_t *bo) -{ - return ORTE_SUCCESS; -} - -static int slave_set_name(void) -{ - char *jobid_str, *procid_str; - int id, rc; - orte_jobid_t jobid; - orte_vpid_t vpid; - - id = mca_base_param_register_string("orte", "ess", "jobid", NULL, NULL); - mca_base_param_lookup_string(id, &jobid_str); - if (NULL == jobid_str) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, jobid_str))) { - ORTE_ERROR_LOG(rc); - return(rc); - } - free(jobid_str); - - id = mca_base_param_register_string("orte", "ess", "vpid", NULL, NULL); - mca_base_param_lookup_string(id, &procid_str); - if (NULL == procid_str) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&vpid, procid_str))) { - ORTE_ERROR_LOG(rc); - return(rc); - } - free(procid_str); - - ORTE_PROC_MY_NAME->jobid = jobid; - ORTE_PROC_MY_NAME->vpid = vpid; - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); - - OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, - "ess:slave set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* get the non-name common environmental variables */ - if (ORTE_SUCCESS != (rc = orte_ess_env_get())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -} - -#if OPAL_ENABLE_FT_CR == 1 -static int rte_ft_event(int state) -{ - int ret, exit_status = ORTE_SUCCESS; - orte_proc_type_t svtype; - - /******** Checkpoint Prep ********/ - if(OPAL_CRS_CHECKPOINT == state) { - /* - * Notify SnapC - */ - if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT))) { - exit_status = ret; - goto cleanup; - } - - /* - * Notify Routed - */ - if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CHECKPOINT))) { - exit_status = ret; - goto cleanup; - } - - /* - * Notify RML -> OOB - */ - if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) { - exit_status = ret; - goto cleanup; - } - } - /******** Continue Recovery ********/ - else if (OPAL_CRS_CONTINUE == state ) { - /* - * Notify RML -> OOB - */ - if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) { - exit_status = ret; - goto cleanup; - } - - /* - * Notify Routed - */ - if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CONTINUE))) { - exit_status = ret; - goto cleanup; - } - - /* - * Notify SnapC - */ - if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CONTINUE))) { - exit_status = ret; - goto cleanup; - } - } - /******** Restart Recovery ********/ - else if (OPAL_CRS_RESTART == state ) { - /* - * This should follow the ess init() function - */ - - /* - * Clear nidmap and jmap - */ - orte_util_nidmap_finalize(); - - /* - * - Reset Contact information - */ - if( ORTE_SUCCESS != (ret = slave_set_name() ) ) { - exit_status = ret; - } - - /* - * Notify RML -> OOB - */ - if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) { - exit_status = ret; - goto cleanup; - } - - /* - * Restart the routed framework - * JJH: Lie to the finalize function so it does not try to contact the daemon. - */ - svtype = orte_process_info.proc_type; - orte_process_info.proc_type = ORTE_PROC_TOOL; - if (ORTE_SUCCESS != (ret = orte_routed.finalize()) ) { - exit_status = ret; - goto cleanup; - } - orte_process_info.proc_type = svtype; - if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) { - exit_status = ret; - goto cleanup; - } - - /* - * Group Comm - Clean out stale data - */ - orte_grpcomm.finalize(); - if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) { - exit_status = ret; - goto cleanup; - } - if (ORTE_SUCCESS != (ret = orte_grpcomm.purge_proc_attrs())) { - exit_status = ret; - goto cleanup; - } - - /* - * Restart the PLM - Does nothing at the moment, but included for completeness - */ - if (ORTE_SUCCESS != (ret = orte_plm.finalize())) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - if (ORTE_SUCCESS != (ret = orte_plm.init())) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * RML - Enable communications - */ - if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { - exit_status = ret; - goto cleanup; - } - - /* - * Session directory re-init - */ - if (orte_create_session_dirs) { - if (ORTE_SUCCESS != (ret = orte_session_dir(true, - orte_process_info.tmpdir_base, - orte_process_info.nodename, - NULL, /* Batch ID -- Not used */ - ORTE_PROC_MY_NAME))) { - exit_status = ret; - } - - opal_output_set_output_file_info(orte_process_info.proc_session_dir, - "output-", NULL, NULL); - } - - /* - * Notify Routed - */ - if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_RESTART))) { - exit_status = ret; - goto cleanup; - } - - /* - * Notify SnapC - */ - if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_RESTART))) { - exit_status = ret; - goto cleanup; - } - - /* - * Send new PID to HNP/daemon - * The checkpointer could have used a proxy program to boot us - * so the pid that the orted got from fork() may not be the - * PID of this application. - * - Note: BLCR does this because it tries to preseve the PID - * of the program across checkpointes - */ - if( ORTE_SUCCESS != (ret = ess_slave_ft_event_update_process_info(orte_process_info.my_name, getpid())) ) { - exit_status = ret; - goto cleanup; - } - - /* if one was provided, build my nidmap */ - if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - else if (OPAL_CRS_TERM == state ) { - /* Nothing */ - } - else { - /* Error state = Nothing */ - } - - cleanup: - - return exit_status; -} - -static int ess_slave_ft_event_update_process_info(orte_process_name_t proc, pid_t proc_pid) -{ - int ret, exit_status = ORTE_SUCCESS; - opal_buffer_t buffer; - orte_snapc_cmd_flag_t command = ORTE_SNAPC_LOCAL_UPDATE_CMD; - - OBJ_CONSTRUCT(&buffer, opal_buffer_t); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &command, 1, ORTE_SNAPC_CMD )) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &proc, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &proc_pid, 1, OPAL_PID))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &buffer, ORTE_RML_TAG_SNAPC, 0))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - cleanup: - OBJ_DESTRUCT(&buffer); - - return exit_status; -} -#endif - diff --git a/orte/mca/plm/base/Makefile.am b/orte/mca/plm/base/Makefile.am index 1dc6aa0144..e7584a28e5 100644 --- a/orte/mca/plm/base/Makefile.am +++ b/orte/mca/plm/base/Makefile.am @@ -28,8 +28,7 @@ if !ORTE_DISABLE_FULL_SUPPORT dist_pkgdata_DATA += base/help-plm-base.txt headers += \ - base/plm_private.h \ - base/plm_base_rsh_support.h + base/plm_private.h libmca_plm_la_SOURCES += \ base/plm_base_close.c \ @@ -38,6 +37,5 @@ libmca_plm_la_SOURCES += \ base/plm_base_launch_support.c \ base/plm_base_jobid.c \ base/plm_base_proxy.c \ - base/plm_base_orted_cmds.c \ - base/plm_base_rsh_support.c + base/plm_base_orted_cmds.c endif diff --git a/orte/mca/plm/base/plm_base_close.c b/orte/mca/plm/base/plm_base_close.c index f53c759e76..68337c5043 100644 --- a/orte/mca/plm/base/plm_base_close.c +++ b/orte/mca/plm/base/plm_base_close.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -30,7 +32,6 @@ #include "orte/mca/plm/base/base.h" #include "orte/mca/plm/base/plm_private.h" -#include "orte/mca/plm/base/plm_base_rsh_support.h" int orte_plm_base_finalize(void) { @@ -66,21 +67,6 @@ int orte_plm_base_close(void) OBJ_DESTRUCT(&orte_plm_globals.spawn_lock); OBJ_DESTRUCT(&orte_plm_globals.spawn_cond); -#ifndef __WINDOWS__ - /* clearout the rsh support */ - orte_plm_base_local_slave_finalize(); -#endif - - /* remove the rsh agent info */ - if (NULL != orte_plm_globals.rsh_agent_argv) { - opal_argv_free(orte_plm_globals.rsh_agent_argv); - } - if (NULL != orte_plm_globals.rsh_agent_path) { - free(orte_plm_globals.rsh_agent_path); - } - - OBJ_DESTRUCT(&orte_plm_globals.slave_files); - /* Close all open components */ mca_base_components_close(orte_plm_globals.output, &orte_plm_base.available_components, NULL); diff --git a/orte/mca/plm/base/plm_base_open.c b/orte/mca/plm/base/plm_base_open.c index 549f33489c..b560f407e4 100644 --- a/orte/mca/plm/base/plm_base_open.c +++ b/orte/mca/plm/base/plm_base_open.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -55,44 +57,6 @@ int orte_plm_base_open(void) #else -static void slave_file_construct(orte_slave_files_t *ptr) -{ - ptr->node = NULL; - ptr->local = false; - ptr->prefix = NULL; - ptr->bootproxy = NULL; - ptr->positioned = false; - OBJ_CONSTRUCT(&ptr->apps, opal_pointer_array_t); - opal_pointer_array_init(&ptr->apps, 8, 1024, 8); - OBJ_CONSTRUCT(&ptr->files, opal_pointer_array_t); - opal_pointer_array_init(&ptr->files, 8, 1024, 8); -} -static void slave_file_destruct(orte_slave_files_t *ptr) -{ - int i; - char *cptr; - - if (NULL != ptr->node) free(ptr->node); - if (NULL != ptr->prefix) free(ptr->prefix); - if (NULL != ptr->bootproxy) free(ptr->bootproxy); - for (i=0; i < ptr->apps.size; i++) { - if (NULL != (cptr = (char*)opal_pointer_array_get_item(&ptr->apps, i))) { - free(cptr); - } - } - OBJ_DESTRUCT(&ptr->apps); - for (i=0; i < ptr->files.size; i++) { - if (NULL != (cptr = (char*)opal_pointer_array_get_item(&ptr->files, i))) { - free(cptr); - } - } - OBJ_DESTRUCT(&ptr->files); -} -OBJ_CLASS_INSTANCE(orte_slave_files_t, - opal_list_item_t, - slave_file_construct, - slave_file_destruct); - /* * Global public variables */ @@ -146,12 +110,6 @@ int orte_plm_base_open(void) /* init the next jobid */ orte_plm_globals.next_jobid = 1; - /* init the rsh support */ - orte_plm_globals.rsh_agent_argv = NULL; - orte_plm_globals.rsh_agent_path = NULL; - orte_plm_globals.local_slaves = 0; - OBJ_CONSTRUCT(&orte_plm_globals.slave_files, opal_list_t); - /* Open up all the components that we can find */ if (ORTE_SUCCESS != diff --git a/orte/mca/plm/base/plm_base_rsh_support.c b/orte/mca/plm/base/plm_base_rsh_support.c deleted file mode 100644 index bfe83962cf..0000000000 --- a/orte/mca/plm/base/plm_base_rsh_support.c +++ /dev/null @@ -1,1835 +0,0 @@ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#ifdef HAVE_STRING_H -#include -#endif -#ifdef HAVE_STRINGS_H -#include -#endif -#ifdef HAVE_SYS_TIME_H -#include -#endif -#include -#include -#include -#ifdef HAVE_PWD_H -#include -#endif -#include - -#include "opal/mca/installdirs/installdirs.h" -#include "opal/util/os_path.h" -#include "opal/util/output.h" -#include "opal/util/os_dirpath.h" -#include "opal/util/path.h" -#include "opal/util/argv.h" -#include "opal/util/basename.h" -#include "opal/util/opal_environ.h" -#include "opal/util/if.h" - -#include "opal/dss/dss.h" - -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ess/ess.h" -#include "orte/mca/rmaps/rmaps_types.h" -#include "orte/runtime/orte_globals.h" -#include "orte/util/show_help.h" -#include "orte/runtime/orte_wait.h" -#include "orte/util/name_fns.h" -#include "orte/util/dash_host/dash_host.h" - -#include "orte/mca/plm/base/plm_private.h" -#include "orte/mca/plm/base/plm_base_rsh_support.h" - -/* These strings *must* follow the same order as the enum ORTE_PLM_RSH_SHELL_* */ -const char *orte_plm_rsh_shell_name[7] = { - "bash", - "zsh", - "tcsh", /* tcsh has to be first otherwise strstr finds csh */ - "csh", - "ksh", - "sh", - "unknown" -}; - - -#ifndef __WINDOWS__ -static char **search(const char* agent_list, const char *path); - -int orte_plm_base_rsh_launch_agent_lookup(const char *agent_list, char *path) -{ - char **tmp; - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:rsh_lookup on agent %s path %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == agent_list) ? orte_rsh_agent : agent_list, - (NULL == path) ? "NULL" : path)); - if (NULL == (tmp = search(agent_list, path))) { - return ORTE_ERR_NOT_FOUND; - } - - /* if we got here, then one of the given agents could be found */ - opal_argv_free(tmp); - return ORTE_SUCCESS; -} - -int orte_plm_base_rsh_launch_agent_setup(const char *agent, char *path) -{ - char *bname; - int i; - - /* if no agent was provided, then report not found */ - if (NULL == orte_rsh_agent && NULL == agent) { - return ORTE_ERR_NOT_FOUND; - } - - /* search for the argv */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:rsh_setup on agent %s path %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == agent) ? orte_rsh_agent : agent, - (NULL == path) ? "NULL" : path)); - orte_plm_globals.rsh_agent_argv = search(agent, path); - - if (0 == opal_argv_count(orte_plm_globals.rsh_agent_argv)) { - /* nothing was found */ - return ORTE_ERR_NOT_FOUND; - } - - /* see if we can find the agent in the path */ - orte_plm_globals.rsh_agent_path = - opal_path_findv(orte_plm_globals.rsh_agent_argv[0], X_OK, - environ, path); - - if (NULL == orte_plm_globals.rsh_agent_path) { - /* not an error - just report not found */ - opal_argv_free(orte_plm_globals.rsh_agent_argv); - return ORTE_ERR_NOT_FOUND; - } - - bname = opal_basename(orte_plm_globals.rsh_agent_argv[0]); - if (NULL != bname && 0 == strcmp(bname, "ssh")) { - /* if xterm option was given, add '-X', ensuring we don't do it twice */ - if (NULL != orte_xterm) { - opal_argv_append_unique_nosize(&orte_plm_globals.rsh_agent_argv, "-X", false); - } else if (0 >= opal_output_get_verbosity(orte_plm_globals.output)) { - /* if debug was not specified, and the user didn't explicitly - * specify X11 forwarding/non-forwarding, add "-x" if it - * isn't already there (check either case) - */ - for (i = 1; NULL != orte_plm_globals.rsh_agent_argv[i]; ++i) { - if (0 == strcasecmp("-x", - orte_plm_globals.rsh_agent_argv[i])) { - break; - } - } - if (NULL == orte_plm_globals.rsh_agent_argv[i]) { - opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-x"); - } - } - } - - /* the caller can append any additional argv's they desire */ - return ORTE_SUCCESS; -} - -/**** SLAVE LAUNCH SUPPORT ****/ - -static bool ack_recvd; - -static void release_ack(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - ack_recvd = true; - OBJ_RELEASE(mev); -} - -static void recv_ack(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, release_ack); -} - -static void set_handler_default(int sig) -{ - struct sigaction act; - - act.sa_handler = SIG_DFL; - act.sa_flags = 0; - sigemptyset(&act.sa_mask); - - sigaction(sig, &act, (struct sigaction *)0); -} - -int orte_plm_base_local_slave_launch(orte_job_t *jdata) -{ - char **argv; - opal_list_t hosts; - orte_node_t *node; - char *nodename; - char *exec_path; - orte_app_context_t *app; - int rc; - pid_t pid; - long fd, fdmax = sysconf(_SC_OPEN_MAX); - sigset_t sigs; - - /* point to the apps array */ - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - - /* increment the local slave jobid */ - orte_plm_globals.local_slaves++; - - /* identify the target host - can only be one! */ - OBJ_CONSTRUCT(&hosts, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&hosts, app->dash_host))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&hosts); - return rc; - } - if (1 < opal_list_get_size(&hosts)) { - orte_show_help("help-plm-base.txt", "too-many-hosts", true, (int)opal_list_get_size(&hosts)); - return ORTE_ERROR; - } - node = (orte_node_t*)opal_list_remove_first(&hosts); - nodename = strdup(node->name); - OBJ_RELEASE(node); - OBJ_DESTRUCT(&hosts); - - /* set the jobid in jdata so the caller knows what it is */ - jdata->jobid = orte_plm_globals.local_slaves; - - /* setup the launch */ - if (ORTE_SUCCESS != (rc = orte_plm_base_setup_slave_launch(nodename, app, - "orte-bootproxy.sh", - &argv, &exec_path))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* add the bootproxy cmd line options */ - if (ORTE_SUCCESS != (rc = orte_plm_base_append_bootproxy_args(app, &argv, - jdata->jobid, 0, /* jobid, vpid */ - 1, 1, /* #nodes, #procs */ - 0, 0, /* nrank, lrank */ - 1, 1, /* #local, #slots */ - true))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* fork a child to exec the rsh/ssh session */ - pid = fork(); - if (pid < 0) { - ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); - return ORTE_ERR_SYS_LIMITS_CHILDREN; - } - - /* child */ - if (pid == 0) { - /* close all file descriptors w/ exception of stdin/stdout/stderr */ - for(fd=3; fdcontrols & ORTE_JOB_CONTROL_NON_ORTE_JOB)) { - ack_recvd = false; - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, - ORTE_RML_NON_PERSISTENT, recv_ack, NULL); - - ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1); - /* to release this job from the wait in plm_base_receive, we have to - * flag it as having reported - */ - jdata->num_reported = jdata->num_procs; - } - - /* cleanup */ - free(exec_path); - opal_argv_free(argv); - } - - return ORTE_SUCCESS; -} - -/* - * Take a colon-delimited list of agents and locate the first one that - * we are able to find in the PATH. Split that one into argv and - * return it. If nothing found, then return NULL. - */ -static char **search(const char* agent_list, const char *path) -{ - int i, j; - char *line, **lines; - char **tokens, *tmp; - char cwd[OPAL_PATH_MAX]; - - if (NULL == path) { - getcwd(cwd, OPAL_PATH_MAX); - } else { - strncpy(cwd, path, OPAL_PATH_MAX); - } - if (NULL == agent_list) { - lines = opal_argv_split(orte_rsh_agent, ':'); - } else { - lines = opal_argv_split(agent_list, ':'); - } - for (i = 0; NULL != lines[i]; ++i) { - line = lines[i]; - - /* Trim whitespace at the beginning and end of the line */ - for (j = 0; '\0' != line[j] && isspace(line[j]); ++line) { - continue; - } - for (j = strlen(line) - 2; j > 0 && isspace(line[j]); ++j) { - line[j] = '\0'; - } - if (strlen(line) <= 0) { - continue; - } - - /* Split it */ - tokens = opal_argv_split(line, ' '); - - /* Look for the first token in the PATH */ - tmp = opal_path_findv(tokens[0], X_OK, environ, cwd); - if (NULL != tmp) { - free(tokens[0]); - tokens[0] = tmp; - opal_argv_free(lines); - return tokens; - } - - /* Didn't find it */ - opal_argv_free(tokens); - } - - /* Doh -- didn't find anything */ - opal_argv_free(lines); - return NULL; -} - -void orte_plm_base_local_slave_finalize(void) -{ - opal_list_item_t *item; - orte_slave_files_t *slave_node; - char *cmd, *filenm, **argv; - int i; - bool first; - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave:finalize", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - while (NULL != (item = opal_list_remove_first(&orte_plm_globals.slave_files))) { - slave_node = (orte_slave_files_t*)item; - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave:finalize - entry for node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), slave_node->node)); - - /* we will use the bootproxy cmd script to clean up for us. All we - * have to do is tell it to run in CLEANUP mode, and then tell it - * the APPS and FILES it needs to cleanup - */ - - if (slave_node->local) { - /* setup the bootproxy cmd */ - argv = NULL; - opal_argv_append_nosize(&argv, slave_node->bootproxy); - } else { - /* Start the argv with the rsh/ssh command */ - argv = opal_argv_copy(orte_plm_globals.rsh_agent_argv); - /* add the hostname */ - opal_argv_append_nosize(&argv, slave_node->node); - /* add the bootproxy cmd */ - opal_argv_append_nosize(&argv, slave_node->bootproxy); - } - /* pass the CLEANUP mode */ - opal_argv_append_nosize(&argv, "CLEANUP"); - /* pass the name of the apps running on the node - the bootproxy will - * send a TERM signal to each of them - */ - first = true; - for (i=0; i < slave_node->apps.size; i++) { - if (NULL == (filenm = opal_pointer_array_get_item(&slave_node->apps, i))) { - continue; - } - if (first) { - opal_argv_append_nosize(&argv, "APPS"); - first = false; - } - opal_argv_append_nosize(&argv, filenm); - } - /* remove any files we positioned */ - first = true; - for (i=0; i < slave_node->files.size; i++) { - if (NULL == (filenm = opal_pointer_array_get_item(&slave_node->files, i))) { - continue; - } - if (first) { - opal_argv_append_nosize(&argv, "FILES"); - first = false; - } - opal_argv_append_nosize(&argv, filenm); - } - /* execute the cmd */ - cmd = opal_argv_join(argv, ' '); - opal_argv_free(argv); - argv = NULL; - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave:finalize - removing files with cmd:\n\t%s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cmd)); - system(cmd); - free(cmd); - /* now remove the bootproxy itself, if needed */ - if (slave_node->positioned) { - if (slave_node->local) { - asprintf(&cmd, "rm -f %s", slave_node->bootproxy); - } else { - /* Start the argv with the rsh/ssh command */ - argv = opal_argv_copy(orte_plm_globals.rsh_agent_argv); - /* add the hostname */ - opal_argv_append_nosize(&argv, slave_node->node); - /* add the rm cmd */ - opal_argv_append_nosize(&argv, "rm -f"); - /* add the bootproxy file */ - opal_argv_append_nosize(&argv, slave_node->bootproxy); - /* form the cmd */ - cmd = opal_argv_join(argv, ' '); - opal_argv_free(argv); - argv = NULL; - } - /* execute it */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave:finalize - removing bootproxy with cmd:\n\t%s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cmd)); - system(cmd); - free(cmd); - } - OBJ_RELEASE(item); - } -} - -static orte_plm_rsh_shell_t find_shell(char *shell) -{ - int i = 0; - char *sh_name = NULL; - - if( (NULL == shell) || (strlen(shell) == 1) ) { - /* Malformed shell */ - return ORTE_PLM_RSH_SHELL_UNKNOWN; - } - - sh_name = rindex(shell, '/'); - if( NULL == sh_name ) { - /* Malformed shell */ - return ORTE_PLM_RSH_SHELL_UNKNOWN; - } - - /* skip the '/' */ - ++sh_name; - for (i = 0; i < (int)(sizeof (orte_plm_rsh_shell_name) / - sizeof(orte_plm_rsh_shell_name[0])); ++i) { - if (0 == strcmp(sh_name, orte_plm_rsh_shell_name[i])) { - return (orte_plm_rsh_shell_t)i; - } - } - - /* We didn't find it */ - return ORTE_PLM_RSH_SHELL_UNKNOWN; -} - -/** - * Check the Shell variable on the specified node - */ - -int orte_plm_base_rsh_shell_probe(char *nodename, orte_plm_rsh_shell_t *shell) -{ - char ** argv; - int argc, rc = ORTE_SUCCESS, i; - int fd[2]; - pid_t pid; - char outbuf[4096]; - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: going to check SHELL variable on node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - nodename)); - - *shell = ORTE_PLM_RSH_SHELL_UNKNOWN; - if (pipe(fd)) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:rsh: pipe failed with errno=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - errno)); - return ORTE_ERR_IN_ERRNO; - } - if ((pid = fork()) < 0) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: fork failed with errno=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - errno)); - return ORTE_ERR_IN_ERRNO; - } - else if (pid == 0) { /* child */ - if (dup2(fd[1], 1) < 0) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: dup2 failed with errno=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - errno)); - exit(01); - } - /* Build argv array */ - argv = opal_argv_copy(orte_plm_globals.rsh_agent_argv); - argc = opal_argv_count(orte_plm_globals.rsh_agent_argv); - opal_argv_append(&argc, &argv, nodename); - opal_argv_append(&argc, &argv, "echo $SHELL"); - - execvp(argv[0], argv); - exit(errno); - } - if (close(fd[1])) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: close failed with errno=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - errno)); - return ORTE_ERR_IN_ERRNO; - } - - { - ssize_t ret = 1; - char* ptr = outbuf; - size_t outbufsize = sizeof(outbuf); - - do { - ret = read (fd[0], ptr, outbufsize-1); - if (ret < 0) { - if (errno == EINTR) - continue; - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: Unable to detect the remote shell (error %s)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - strerror(errno))); - rc = ORTE_ERR_IN_ERRNO; - break; - } - if( outbufsize > 1 ) { - outbufsize -= ret; - ptr += ret; - } - } while( 0 != ret ); - *ptr = '\0'; - } - close(fd[0]); - - if( outbuf[0] != '\0' ) { - char *sh_name = rindex(outbuf, '/'); - if( NULL != sh_name ) { - sh_name++; /* skip '/' */ - /* We cannot use "echo -n $SHELL" because -n is not portable. Therefore - * we have to remove the "\n" */ - if ( sh_name[strlen(sh_name)-1] == '\n' ) { - sh_name[strlen(sh_name)-1] = '\0'; - } - /* Search for the substring of known shell-names */ - for (i = 0; i < (int)(sizeof (orte_plm_rsh_shell_name)/ - sizeof(orte_plm_rsh_shell_name[0])); i++) { - if ( 0 == strcmp(sh_name, orte_plm_rsh_shell_name[i]) ) { - *shell = (orte_plm_rsh_shell_t)i; - break; - } - } - } - } - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: node %s has SHELL: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - nodename, - (ORTE_PLM_RSH_SHELL_UNKNOWN == *shell) ? "UNHANDLED" : (char*)orte_plm_rsh_shell_name[*shell])); - - return rc; -} - -int orte_plm_base_rsh_setup_shell(orte_plm_rsh_shell_t *rshell, - orte_plm_rsh_shell_t *lshell, - char *nodename, char ***argv) -{ - orte_plm_rsh_shell_t remote_shell, local_shell; - struct passwd *p; - char *param; - int rc; - - /* What is our local shell? */ - local_shell = ORTE_PLM_RSH_SHELL_UNKNOWN; - p = getpwuid(getuid()); - if( NULL == p ) { - /* This user is unknown to the system. Therefore, there is no reason we - * spawn whatsoever in his name. Give up with a HUGE error message. - */ - orte_show_help( "help-plm-rshd.txt", "unknown-user", true, (int)getuid() ); - return ORTE_ERR_FATAL; - } - param = p->pw_shell; - local_shell = find_shell(p->pw_shell); - - /* If we didn't find it in getpwuid(), try looking at the $SHELL - environment variable (see https://svn.open-mpi.org/trac/ompi/ticket/1060) - */ - if (ORTE_PLM_RSH_SHELL_UNKNOWN == local_shell && - NULL != (param = getenv("SHELL"))) { - local_shell = find_shell(param); - } - - if (ORTE_PLM_RSH_SHELL_UNKNOWN == local_shell) { - opal_output(0, "WARNING: local probe returned unhandled shell:%s assuming bash\n", - (NULL != param) ? param : "unknown"); - local_shell = ORTE_PLM_RSH_SHELL_BASH; - } - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: local shell: %d (%s)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - local_shell, orte_plm_rsh_shell_name[local_shell])); - - /* What is our remote shell? */ - if (orte_assume_same_shell) { - remote_shell = local_shell; - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: assuming same remote shell as local shell", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - } else { - rc = orte_plm_base_rsh_shell_probe(nodename, &remote_shell); - - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - return rc; - } - - if (ORTE_PLM_RSH_SHELL_UNKNOWN == remote_shell) { - opal_output(0, "WARNING: shell probe returned unhandled shell; assuming bash\n"); - remote_shell = ORTE_PLM_RSH_SHELL_BASH; - } - } - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: remote shell: %d (%s)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - remote_shell, orte_plm_rsh_shell_name[remote_shell])); - - /* Do we need to source .profile on the remote side? - - sh: yes (see bash(1)) - - ksh: yes (see ksh(1)) - - bash: no (see bash(1)) - - [t]csh: no (see csh(1) and tcsh(1)) - - zsh: no (see http://zsh.sourceforge.net/FAQ/zshfaq03.html#l19) - */ - - if (ORTE_PLM_RSH_SHELL_SH == remote_shell || - ORTE_PLM_RSH_SHELL_KSH == remote_shell) { - int i; - char **tmp; - tmp = opal_argv_split("( test ! -r ./.profile || . ./.profile;", ' '); - if (NULL == tmp) { - return ORTE_ERR_OUT_OF_RESOURCE; - } - for (i = 0; NULL != tmp[i]; ++i) { - opal_argv_append_nosize(argv, tmp[i]); - } - opal_argv_free(tmp); - } - - /* pass results back */ - *rshell = remote_shell; - *lshell = local_shell; - - return ORTE_SUCCESS; -} - -int orte_plm_base_setup_slave_launch(char *nodename, orte_app_context_t *app, - char *rcmd, char ***argv, char **exec_path) -{ - orte_slave_files_t *slave_node, *tst_node; - opal_list_item_t *item; - char *bootproxy, *cmd, *scp=NULL; - char *exefile=NULL, *basename, *path=NULL; - char *tmp, *dest, *dest_dir, *filenm; - char **files; - char cwd[OPAL_PATH_MAX]; - int rc, i, j; - char *lib_base, *bin_base; - orte_plm_rsh_shell_t rshell, lshell; - char **tmpargv=NULL; - char *opal_prefix; - - /* set default */ - *exec_path = NULL; - *argv = NULL; - - /* Figure out the basenames for the libdir and bindir. This - requires some explanation: - - - Use opal_install_dirs.libdir and opal_install_dirs.bindir. - - - After a discussion on the devel-core mailing list, the - developers decided that we should use the local directory - basenames as the basis for the prefix on the remote note. - This does not handle a few notable cases (e.g., if the - libdir/bindir is not simply a subdir under the prefix, if the - libdir/bindir basename is not the same on the remote node as - it is here on the local node, etc.), but we decided that - --prefix was meant to handle "the common case". If you need - something more complex than this, a) edit your shell startup - files to set PATH/LD_LIBRARY_PATH properly on the remove - node, or b) use some new/to-be-defined options that - explicitly allow setting the bindir/libdir on the remote - node. We decided to implement these options (e.g., - --remote-bindir and --remote-libdir) to orterun when it - actually becomes a problem for someone (vs. a hypothetical - situation). - - Hence, for now, we simply take the basename of this install's - libdir and bindir and use it to append this install's prefix - and use that on the remote node. - */ - - lib_base = opal_basename(opal_install_dirs.libdir); - bin_base = opal_basename(opal_install_dirs.bindir); - opal_prefix = getenv("OPAL_PREFIX"); - - /* have we launched anything on this node before? */ - slave_node = NULL; - for (item = opal_list_get_first(&orte_plm_globals.slave_files); - item != opal_list_get_end(&orte_plm_globals.slave_files); - item = opal_list_get_next(item)) { - tst_node = (orte_slave_files_t*)item; - if (0 == strcmp(tst_node->node, nodename)) { - slave_node = tst_node; - break; - } - } - if (NULL == slave_node) { - slave_node = OBJ_NEW(orte_slave_files_t); - slave_node->node = strdup(nodename); - /* save the bootproxy cmd */ - slave_node->bootproxy = strdup(rcmd); - /* is this a local operation? */ - if (0 == strcmp(orte_process_info.nodename, nodename) || - 0 == strcmp(nodename, "localhost") || - opal_ifislocal(nodename)) { - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: node %s is local", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename)); - slave_node->local = true; - /* use the prefix, if given */ - if (NULL != app->prefix_dir) { - asprintf(&slave_node->prefix, "%s/%s", app->prefix_dir, bin_base); - } else { - /* use our install dirs */ - slave_node->prefix = strdup(opal_install_dirs.bindir); - } - /* no need to preposition the remote cmd, and no need to remove it */ - slave_node->positioned = false; - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: setting prefix to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), slave_node->prefix)); - } else { - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: node %s is remote", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename)); - /* setup the correct shell info */ - if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_setup_shell(&rshell, &lshell, - nodename, &tmpargv))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(slave_node); - return rc; - } - /* we now need to assemble the actual cmd that will be executed - this depends - * upon whether or not a prefix directory is being used - */ - if (NULL != app->prefix_dir) { - /* if we have a prefix directory, we need to set the PATH and - * LD_LIBRARY_PATH on the remote node, and prepend the eventual cmd - * with the prefix directory - */ - if (ORTE_PLM_RSH_SHELL_SH == rshell || - ORTE_PLM_RSH_SHELL_KSH == rshell || - ORTE_PLM_RSH_SHELL_ZSH == rshell || - ORTE_PLM_RSH_SHELL_BASH == rshell) { - asprintf (&slave_node->prefix, - "%s%s%s PATH=%s/%s:$PATH ; export PATH ; " - "LD_LIBRARY_PATH=%s/%s:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; " - "%s/%s", - (opal_prefix != NULL ? "OPAL_PREFIX=" : ""), - (opal_prefix != NULL ? opal_prefix : ""), - (opal_prefix != NULL ? " ; export OPAL_PREFIX;" : ""), - app->prefix_dir, bin_base, - app->prefix_dir, lib_base, - app->prefix_dir, bin_base); - } else if (ORTE_PLM_RSH_SHELL_TCSH == rshell || - ORTE_PLM_RSH_SHELL_CSH == rshell) { - /* [t]csh is a bit more challenging -- we - have to check whether LD_LIBRARY_PATH - is already set before we try to set it. - Must be very careful about obeying - [t]csh's order of evaluation and not - using a variable before it is defined. - See this thread for more details: - http://www.open-mpi.org/community/lists/users/2006/01/0517.php. */ - asprintf (&slave_node->prefix, - "%s%s%s set path = ( %s/%s $path ) ; " - "if ( $?LD_LIBRARY_PATH == 1 ) " - "set OMPI_have_llp ; " - "if ( $?LD_LIBRARY_PATH == 0 ) " - "setenv LD_LIBRARY_PATH %s/%s ; " - "if ( $?OMPI_have_llp == 1 ) " - "setenv LD_LIBRARY_PATH %s/%s:$LD_LIBRARY_PATH ; " - "%s/%s", - (opal_prefix != NULL ? "setenv OPAL_PREFIX " : ""), - (opal_prefix != NULL ? opal_prefix : ""), - (opal_prefix != NULL ? " ;" : ""), - app->prefix_dir, bin_base, - app->prefix_dir, lib_base, - app->prefix_dir, lib_base, - app->prefix_dir, bin_base); - } else { - orte_show_help("help-plm-rshd.txt", "cannot-resolve-shell-with-prefix", true, - (NULL == opal_prefix) ? "NULL" : opal_prefix, - app->prefix_dir); - return ORTE_ERR_SILENT; - } - /* since we have a prefix, we don't need to preposition the bootproxy - * or remove it later - */ - slave_node->positioned = false; - } else if (NULL != app->preload_files_dest_dir) { - /* the prefix will be the same as the preload destination */ - slave_node->prefix = strdup(app->preload_files_dest_dir); - /* flag to preload it, and remove it later */ - slave_node->positioned = true; - } else if (NULL != orte_process_info.tmpdir_base) { - /* use the tmpdir base */ - slave_node->prefix = strdup(orte_process_info.tmpdir_base); - /* flag to preload it, and remove it later */ - slave_node->positioned = true; - } else { - /* we have to preposition somewhere - default to /tmp */ - slave_node->prefix = strdup("/tmp"); - /* flag to preload it, and remove it later */ - slave_node->positioned = true; - } - - /* do we need to preload the bootproxy on this node? */ - if (slave_node->positioned) { - /* find the local bootproxy */ - bootproxy = opal_find_absolute_path(rcmd); - if (NULL == bootproxy) { - orte_show_help("help-plm-base.txt", "bootproxy-not-found", true, rcmd); - return ORTE_ERR_NOT_FOUND; - } - path = opal_os_path(false, slave_node->prefix, rcmd, NULL); - /* find the scp command */ - scp = opal_find_absolute_path("scp"); - if (NULL == scp) { - orte_show_help("help-plm-base.txt", "cp-not-found", true, "scp", "scp"); - return ORTE_ERROR; - } - /* form and execute the scp command */ - asprintf(&cmd, "%s %s %s:%s", scp, bootproxy, nodename, path); - system(cmd); - free(cmd); - free(path); - free(bootproxy); - } - } - /* add this node to our list */ - opal_list_append(&orte_plm_globals.slave_files, &slave_node->super); - } - - /* if we are going to position the binary or files, did they give us a dest? */ - if (NULL != app->preload_files_dest_dir) { - /* the target location -must- be an absolute path */ - if (!opal_path_is_absolute(app->preload_files_dest_dir)) { - orte_show_help("help-plm-base.txt", "abs-path-reqd", true, app->preload_files_dest_dir); - return ORTE_ERROR; - } - dest_dir = app->preload_files_dest_dir; - /* if this is a local op, make sure this location exists. we can't - * do this for remote ops as there is no way to create a remote - * directory - */ - if (slave_node->local) { - if (ORTE_SUCCESS != (rc = opal_os_dirpath_create(dest_dir, S_IRWXU))) { - orte_show_help("help-plm-base.txt", "path-not-created", true, dest_dir); - return rc; - } - } - } else if (NULL != orte_process_info.tmpdir_base) { - /* put everything in the tmpdir base */ - dest_dir = orte_process_info.tmpdir_base; - } else { - /* put everything in /tmp */ - dest_dir = "/tmp"; - } - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: destination dir set to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), dest_dir)); - - /* setup the exec_path to the bootproxy */ - if (slave_node->local) { - /* if this is a local operation, then just set - * the exec_path to be the bootproxy - */ - *argv = NULL; - asprintf(exec_path, "%s/%s", slave_node->prefix, rcmd); - opal_argv_append_nosize(argv, *exec_path); - } else { - /* set the exec path to the rsh agent path */ - *exec_path = strdup(orte_plm_globals.rsh_agent_path); - /* Start the argv with the rsh/ssh command */ - *argv = opal_argv_copy(orte_plm_globals.rsh_agent_argv); - /* add the hostname */ - opal_argv_append_nosize(argv, nodename); - /* add the bootproxy cmd */ - if (NULL != slave_node->prefix) { - asprintf(&tmp, "%s/%s", slave_node->prefix, rcmd); - } else { - tmp = strdup(rcmd); - } - opal_argv_append_nosize(argv, tmp); - free(tmp); - } - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: exec_path set to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), *exec_path)); - - /* do we need to preload the binary? */ - if (app->preload_binary) { - char * src; - /* if the binary is not given in absolute path form, - * then convert it to one - */ - if (!opal_path_is_absolute(app->app)) { - /* see if a source directory was given */ - if (NULL!= app->preload_files_src_dir) { - /* prepend the src dir to the executable name */ - path = opal_os_path(false, app->preload_files_src_dir, app->app, NULL); - /* now check for the existence of the app */ - src = opal_find_absolute_path(path); - if (NULL == src) { - orte_show_help("help-plm-base.txt", "exec-not-found", true, path); - return ORTE_ERROR; - } - } else { - /* look for it in the cwd */ - getcwd(cwd, OPAL_PATH_MAX); - src = opal_path_access(app->app, cwd, X_OK); - if (NULL == src) { - orte_show_help("help-plm-base.txt", "exec-not-found", true, cwd); - return ORTE_ERROR; - } - } - } else { - src = opal_path_access(app->app, NULL, X_OK); - if (NULL == src) { - orte_show_help("help-plm-base.txt", "exec-not-found", true, app->app); - return ORTE_ERROR; - } - } - /* get the basename */ - basename = opal_basename(app->app); - - /* define the destination */ - dest = opal_os_path(false, dest_dir, basename, NULL); - - /* - * We do not test for error after opal_basename -- this is fine, as opal_os_path - * is taking a NULL terminated list -- in case of error, well dest_dir is the final dir. - * However, we need to free basename here, before overwriting the pointer later. - */ - if (basename != NULL) { - free(basename); - } - - /* has this binary already been positioned? */ - for (i=0; i < slave_node->apps.size; i++) { - if (NULL != (filenm = opal_pointer_array_get_item(&slave_node->apps, i)) && - 0 == strcmp(filenm, dest)) { - /* this app already has been positioned on the node - skip it */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: app %s already positioned", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), filenm)); - goto PRELOAD_FILES; - } - } - /* add the app to the slave_node list */ - opal_pointer_array_add(&slave_node->apps, strdup(dest)); - /* since we are positioning the binary, add it to the list - * of files to be cleaned up when done - */ - opal_pointer_array_add(&slave_node->files, strdup(dest)); - - /* if this is a local node, then we just use the cp command */ - if (slave_node->local) { - scp = opal_find_absolute_path("cp"); - if (NULL == scp) { - free (src); - orte_show_help("help-plm-base.txt", "cp-not-found", true, "cp", "cp"); - return ORTE_ERROR; - } - /* form and execute the cp commands */ - asprintf(&cmd, "%s %s %s", scp, src, dest); - system(cmd); - free(cmd); - } else { - /* find the scp command */ - scp = opal_find_absolute_path("scp"); - if (NULL == scp) { - free (src); - orte_show_help("help-plm-base.txt", "cp-not-found", true, "scp", "scp"); - return ORTE_ERROR; - } - /* form and execute the scp commands */ - asprintf(&cmd, "%s %s %s:%s", scp, src, nodename, dest); - system(cmd); - free(cmd); - } - free(src); - free(dest); - free(scp); - } else { - /* we don't need to pre-position the binary, but we do need - * to check if we should record it - */ - for (i=0; i < slave_node->apps.size; i++) { - if (NULL != (filenm = opal_pointer_array_get_item(&slave_node->apps, i)) && - 0 == strcmp(filenm, app->app)) { - /* this app already has been positioned on the node - skip it */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: app %s already positioned", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), filenm)); - goto PRELOAD_FILES; - } - } - /* add the app to the slave_node list */ - opal_pointer_array_add(&slave_node->apps, strdup(app->app)); - /* do not add it to the files to be cleaned up when done as - * we are not positioning it! - */ - } - -PRELOAD_FILES: - /* do we need to pre-position supporting files? */ - if (NULL != app->preload_files) { - if (slave_node->local) { - scp = opal_find_absolute_path("cp"); - if (NULL == scp) { - orte_show_help("help-plm-base.txt", "cp-not-found", true, "cp", "cp"); - return ORTE_ERROR; - } - } else { - /* find the scp command */ - scp = opal_find_absolute_path("scp"); - if (NULL == scp) { - orte_show_help("help-plm-base.txt", "cp-not-found", true, "scp", "scp"); - return ORTE_ERROR; - } - } - /* break apart the comma-separated list of files */ - files = opal_argv_split(app->preload_files, ','); - /* copy each file across */ - for (i=0; i < opal_argv_count(files); i++) { - /* if the file is not given in absolute path form, - * then convert it to one - */ - if (!opal_path_is_absolute(files[i])) { - /* see if a source directory was given */ - if (NULL!= app->preload_files_src_dir) { - /* look for the file there */ - exefile = opal_path_access(files[i], app->preload_files_src_dir, R_OK); - } else { - /* look for it in the cwd */ - getcwd(cwd, OPAL_PATH_MAX); - exefile = opal_path_access(files[i], cwd, R_OK); - } - } else { - exefile = opal_path_access(files[i], NULL, R_OK); - } - if (NULL == exefile) { - getcwd(cwd, OPAL_PATH_MAX); - orte_show_help("help-plm-base.txt", "file-not-found", true, files[i], - (NULL == app->preload_files_src_dir) ? cwd : app->preload_files_src_dir); - return ORTE_ERROR; - } - /* define the destination */ - dest = opal_os_path(false, dest_dir, files[i], NULL); - /* has this file already been positioned? */ - for (j=0; j < slave_node->files.size; j++) { - if (NULL != (filenm = opal_pointer_array_get_item(&slave_node->files, j)) && - 0 == strcmp(filenm, dest)) { - /* this app already has been positioned on the node - skip it */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: file %s already positioned", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), filenm)); - goto SKIP; - } - } - /* add the file to the slave_node list */ - opal_pointer_array_add(&slave_node->files, strdup(dest)); - if (slave_node->local) { - /* form and execute the cp command */ - asprintf(&cmd, "%s %s %s", scp, exefile, dest); - system(cmd); - free(cmd); - } else { - /* form and execute the scp commands */ - asprintf(&cmd, "%s -q %s %s:%s", scp, exefile, nodename, dest); - system(cmd); - free(cmd); - } - SKIP: - free(exefile); - free(dest); - } - opal_argv_free(files); - free(scp); - } - - return ORTE_SUCCESS; -} - -int orte_plm_base_rsh_setup_launch(int *argcptr, char ***argvptr, - char *nodename, - int *node_name_index1, - int *proc_vpid_index, char *prefix_dir, - char *nodes) -{ - int argc; - char **argv; - char *param; - orte_plm_rsh_shell_t remote_shell, local_shell; - char *lib_base, *bin_base; - int orted_argc; - char **orted_argv; - char *orted_cmd, *orted_prefix, *final_cmd; - int orted_index; - int rc; - - - /* Figure out the basenames for the libdir and bindir. This - requires some explanation: - - - Use opal_install_dirs.libdir and opal_install_dirs.bindir. - - - After a discussion on the devel-core mailing list, the - developers decided that we should use the local directory - basenames as the basis for the prefix on the remote note. - This does not handle a few notable cases (e.g., if the - libdir/bindir is not simply a subdir under the prefix, if the - libdir/bindir basename is not the same on the remote node as - it is here on the local node, etc.), but we decided that - --prefix was meant to handle "the common case". If you need - something more complex than this, a) edit your shell startup - files to set PATH/LD_LIBRARY_PATH properly on the remove - node, or b) use some new/to-be-defined options that - explicitly allow setting the bindir/libdir on the remote - node. We decided to implement these options (e.g., - --remote-bindir and --remote-libdir) to orterun when it - actually becomes a problem for someone (vs. a hypothetical - situation). - - Hence, for now, we simply take the basename of this install's - libdir and bindir and use it to append this install's prefix - and use that on the remote node. - */ - - lib_base = opal_basename(opal_install_dirs.libdir); - bin_base = opal_basename(opal_install_dirs.bindir); - - /* - * Build argv array - */ - argv = opal_argv_copy(orte_plm_globals.rsh_agent_argv); - argc = opal_argv_count(orte_plm_globals.rsh_agent_argv); - *node_name_index1 = argc; - opal_argv_append(&argc, &argv, "