diff --git a/orte/config/config_files.m4 b/orte/config/config_files.m4 index 64c87f9e08..e245d3bcd4 100644 --- a/orte/config/config_files.m4 +++ b/orte/config/config_files.m4 @@ -4,6 +4,8 @@ # Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. +# Copyright (c) 2011 Los Alamos National Security, LLC. All rights +# reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -28,7 +30,6 @@ AC_DEFUN([ORTE_CONFIG_FILES],[ orte/tools/orte-ps/Makefile orte/tools/orte-clean/Makefile orte/tools/orte-top/Makefile - orte/tools/orte-bootproxy/Makefile orte/tools/orte-migrate/Makefile orte/tools/orte-info/Makefile ]) diff --git a/orte/mca/ess/slave/.windows b/orte/mca/ess/slave/.windows deleted file mode 100644 index aa7d7bbbe5..0000000000 --- a/orte/mca/ess/slave/.windows +++ /dev/null @@ -1,12 +0,0 @@ -# -# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# Specific to this module -mca_link_libraries=libopen-rte diff --git a/orte/mca/ess/slave/Makefile.am b/orte/mca/ess/slave/Makefile.am deleted file mode 100644 index f770e575bb..0000000000 --- a/orte/mca/ess/slave/Makefile.am +++ /dev/null @@ -1,45 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -EXTRA_DIST = .windows - -sources = \ - ess_slave.h \ - ess_slave_component.c \ - ess_slave_module.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_orte_ess_slave_DSO -component_noinst = -component_install = mca_ess_slave.la -else -component_noinst = libmca_ess_slave.la -component_install = -endif - -mcacomponentdir = $(pkglibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_ess_slave_la_SOURCES = $(sources) -mca_ess_slave_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_ess_slave_la_SOURCES =$(sources) -libmca_ess_slave_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/ess/slave/configure.m4 b/orte/mca/ess/slave/configure.m4 deleted file mode 100644 index af7d29504c..0000000000 --- a/orte/mca/ess/slave/configure.m4 +++ /dev/null @@ -1,21 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2011 Los Alamos National Security, LLC. -# All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -AC_DEFUN([MCA_orte_ess_slave_PRIORITY], [10]) - -# MCA_ess_slave_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_orte_ess_slave_CONFIG], [ - AC_CONFIG_FILES([orte/mca/ess/slave/Makefile]) - - AS_IF([test "$orte_without_full_support" = 0], - [$1], - [$2]) -]) diff --git a/orte/mca/ess/slave/ess_slave.h b/orte/mca/ess/slave/ess_slave.h deleted file mode 100644 index abe688c098..0000000000 --- a/orte/mca/ess/slave/ess_slave.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef ORTE_ESS_SLAVE_H -#define ORTE_ESS_SLAVE_H - -BEGIN_C_DECLS - -/* - * Module open / close - */ -int orte_ess_slave_component_open(void); -int orte_ess_slave_component_close(void); -int orte_ess_slave_component_query(mca_base_module_t **module, int *priority); - - -ORTE_MODULE_DECLSPEC extern orte_ess_base_component_t mca_ess_slave_component; - -END_C_DECLS - -#endif /* ORTE_ESS_SLAVE_H */ diff --git a/orte/mca/ess/slave/ess_slave_component.c b/orte/mca/ess/slave/ess_slave_component.c deleted file mode 100644 index 0ba91570e8..0000000000 --- a/orte/mca/ess/slave/ess_slave_component.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * These symbols are in a file by themselves to provide nice linker - * semantics. Since linkers generally pull in symbols by object - * files, keeping these symbols as the only symbols in this file - * prevents utility programs such as "ompi_info" from having to import - * entire components just to query their version and parameters. - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/base/mca_base_param.h" - - -#include "orte/mca/ess/ess.h" -#include "orte/mca/ess/slave/ess_slave.h" - -extern orte_ess_base_module_t orte_ess_slave_module; - -/* - * Instantiate the public struct with all of our public information - * and pointers to our public functions in it - */ -orte_ess_base_component_t mca_ess_slave_component = { - { - ORTE_ESS_BASE_VERSION_2_0_0, - - /* Component name and version */ - "slave", - ORTE_MAJOR_VERSION, - ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION, - - /* Component open and close functions */ - orte_ess_slave_component_open, - orte_ess_slave_component_close, - orte_ess_slave_component_query - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - } -}; - - -int -orte_ess_slave_component_open(void) -{ - return ORTE_SUCCESS; -} - -int orte_ess_slave_component_query(mca_base_module_t **module, int *priority) -{ - /* we are the slave module, so set the priority so - * we can only be selected if directed to do so - */ - - *priority = 0; - *module = (mca_base_module_t *)&orte_ess_slave_module; - return ORTE_SUCCESS; -} - - -int -orte_ess_slave_component_close(void) -{ - return ORTE_SUCCESS; -} - diff --git a/orte/mca/ess/slave/ess_slave_module.c b/orte/mca/ess/slave/ess_slave_module.c deleted file mode 100644 index def1346b31..0000000000 --- a/orte/mca/ess/slave/ess_slave_module.c +++ /dev/null @@ -1,534 +0,0 @@ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include -#include -#ifdef HAVE_FCNTL_H -#include -#endif -#ifdef HAVE_UNISTD_H -#include -#endif - -#include "opal/mca/event/event.h" -#include "opal/runtime/opal.h" -#include "opal/mca/paffinity/paffinity.h" - -#include "orte/util/show_help.h" -#include "opal/mca/mca.h" -#include "opal/mca/base/base.h" -#include "opal/mca/base/mca_base_param.h" -#include "opal/util/output.h" -#include "opal/util/malloc.h" - -#include "orte/mca/rml/base/base.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/routed/base/base.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/grpcomm/base/base.h" -#include "orte/mca/iof/base/base.h" -#include "orte/mca/ess/base/base.h" -#include "orte/mca/ess/ess.h" -#include "orte/mca/ras/base/base.h" -#include "orte/mca/plm/base/base.h" - -#include "orte/mca/rmaps/base/base.h" -#if OPAL_ENABLE_FT_CR == 1 -#include "orte/mca/snapc/base/base.h" -#endif -#include "orte/mca/filem/base/base.h" -#include "orte/util/proc_info.h" -#include "orte/util/session_dir.h" -#include "orte/util/name_fns.h" -#include "orte/util/nidmap.h" - -#include "orte/runtime/runtime.h" -#include "orte/runtime/orte_wait.h" -#include "orte/runtime/orte_globals.h" - -#include "orte/runtime/orte_cr.h" -#include "orte/mca/ess/ess.h" -#include "orte/mca/ess/base/base.h" -#include "orte/mca/ess/slave/ess_slave.h" - -static int slave_set_name(void); - -static int rte_init(void); -static int rte_finalize(void); -static opal_paffinity_locality_t proc_get_locality(orte_process_name_t *proc); -static orte_vpid_t proc_get_daemon(orte_process_name_t *proc); -static char* proc_get_hostname(orte_process_name_t *proc); -static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc); -static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc); -static int update_pidmap(opal_byte_object_t *bo); -static int update_nidmap(opal_byte_object_t *bo); - -#if OPAL_ENABLE_FT_CR == 1 -static int rte_ft_event(int state); -static int ess_slave_ft_event_update_process_info(orte_process_name_t proc, pid_t pid); -#endif - -orte_ess_base_module_t orte_ess_slave_module = { - rte_init, - rte_finalize, - orte_ess_base_app_abort, - proc_get_locality, - proc_get_daemon, - proc_get_hostname, - proc_get_local_rank, - proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ - update_pidmap, - update_nidmap, -#if OPAL_ENABLE_FT_CR == 1 - rte_ft_event -#else - NULL -#endif -}; - -static int rte_init(void) -{ - int ret; - char *error = NULL; - - /* run the prolog */ - if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { - error = "orte_ess_base_std_prolog"; - goto error; - } - - /* Start by getting a unique name from the enviro */ - slave_set_name(); - - /* use the default procedure to finish my setup */ - if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { - ORTE_ERROR_LOG(ret); - error = "orte_ess_base_app_setup"; - goto error; - } - - /* init my nidmap arrays - no data can be available, but - * we want to ensure that nobody else who looks at - * those arrays will segfault - */ - if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { - ORTE_ERROR_LOG(ret); - error = "orte_util_nidmap_init"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - return ORTE_SUCCESS; - -error: - orte_show_help("help-orte-runtime.txt", - "orte_init:startup:internal-failure", - true, error, ORTE_ERROR_NAME(ret), ret); - - return ret; -} - -static int rte_finalize(void) -{ - int ret; - - /* use the default procedure to finish */ - if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) { - ORTE_ERROR_LOG(ret); - } - - /* deconstruct the nidmap and jobmap arrays */ - orte_util_nidmap_finalize(); - - return ret; -} - -static opal_paffinity_locality_t proc_get_locality(orte_process_name_t *proc) -{ - /* no proc can be local */ - - OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, - "%s ess:slave: proc %s is REMOTE", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - return OPAL_PROC_NON_LOCAL; - -} - -static orte_vpid_t proc_get_daemon(orte_process_name_t *proc) -{ - orte_ns_cmp_bitmask_t mask; - - mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID; - - /* if it is me, the answer is my daemon's vpid */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) { - return ORTE_PROC_MY_DAEMON->vpid; - } - - /* otherwise, no idea */ - return ORTE_VPID_INVALID; -} - -static char* proc_get_hostname(orte_process_name_t *proc) -{ - orte_ns_cmp_bitmask_t mask; - - mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID; - /* if it is me, the answer is my nodename */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) { - return orte_process_info.nodename; - } - - /* otherwise, no idea */ - return NULL; -} - -static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc) -{ - orte_ns_cmp_bitmask_t mask; - - mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID; - /* if it is me, the local rank is zero */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) { - return 0; - } - - /* otherwise, no idea */ - return ORTE_LOCAL_RANK_INVALID; -} - -static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc) -{ - /* if it is me, the node rank is zero */ - if (proc->jobid == ORTE_PROC_MY_NAME->jobid && - proc->vpid == ORTE_PROC_MY_NAME->vpid) { - return 0; - } - - /* otherwise, no idea */ - return ORTE_NODE_RANK_INVALID; -} - -static int update_pidmap(opal_byte_object_t *bo) -{ - return ORTE_SUCCESS; -} - -static int update_nidmap(opal_byte_object_t *bo) -{ - return ORTE_SUCCESS; -} - -static int slave_set_name(void) -{ - char *jobid_str, *procid_str; - int id, rc; - orte_jobid_t jobid; - orte_vpid_t vpid; - - id = mca_base_param_register_string("orte", "ess", "jobid", NULL, NULL); - mca_base_param_lookup_string(id, &jobid_str); - if (NULL == jobid_str) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, jobid_str))) { - ORTE_ERROR_LOG(rc); - return(rc); - } - free(jobid_str); - - id = mca_base_param_register_string("orte", "ess", "vpid", NULL, NULL); - mca_base_param_lookup_string(id, &procid_str); - if (NULL == procid_str) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&vpid, procid_str))) { - ORTE_ERROR_LOG(rc); - return(rc); - } - free(procid_str); - - ORTE_PROC_MY_NAME->jobid = jobid; - ORTE_PROC_MY_NAME->vpid = vpid; - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); - - OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, - "ess:slave set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* get the non-name common environmental variables */ - if (ORTE_SUCCESS != (rc = orte_ess_env_get())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -} - -#if OPAL_ENABLE_FT_CR == 1 -static int rte_ft_event(int state) -{ - int ret, exit_status = ORTE_SUCCESS; - orte_proc_type_t svtype; - - /******** Checkpoint Prep ********/ - if(OPAL_CRS_CHECKPOINT == state) { - /* - * Notify SnapC - */ - if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT))) { - exit_status = ret; - goto cleanup; - } - - /* - * Notify Routed - */ - if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CHECKPOINT))) { - exit_status = ret; - goto cleanup; - } - - /* - * Notify RML -> OOB - */ - if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) { - exit_status = ret; - goto cleanup; - } - } - /******** Continue Recovery ********/ - else if (OPAL_CRS_CONTINUE == state ) { - /* - * Notify RML -> OOB - */ - if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) { - exit_status = ret; - goto cleanup; - } - - /* - * Notify Routed - */ - if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CONTINUE))) { - exit_status = ret; - goto cleanup; - } - - /* - * Notify SnapC - */ - if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CONTINUE))) { - exit_status = ret; - goto cleanup; - } - } - /******** Restart Recovery ********/ - else if (OPAL_CRS_RESTART == state ) { - /* - * This should follow the ess init() function - */ - - /* - * Clear nidmap and jmap - */ - orte_util_nidmap_finalize(); - - /* - * - Reset Contact information - */ - if( ORTE_SUCCESS != (ret = slave_set_name() ) ) { - exit_status = ret; - } - - /* - * Notify RML -> OOB - */ - if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) { - exit_status = ret; - goto cleanup; - } - - /* - * Restart the routed framework - * JJH: Lie to the finalize function so it does not try to contact the daemon. - */ - svtype = orte_process_info.proc_type; - orte_process_info.proc_type = ORTE_PROC_TOOL; - if (ORTE_SUCCESS != (ret = orte_routed.finalize()) ) { - exit_status = ret; - goto cleanup; - } - orte_process_info.proc_type = svtype; - if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) { - exit_status = ret; - goto cleanup; - } - - /* - * Group Comm - Clean out stale data - */ - orte_grpcomm.finalize(); - if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) { - exit_status = ret; - goto cleanup; - } - if (ORTE_SUCCESS != (ret = orte_grpcomm.purge_proc_attrs())) { - exit_status = ret; - goto cleanup; - } - - /* - * Restart the PLM - Does nothing at the moment, but included for completeness - */ - if (ORTE_SUCCESS != (ret = orte_plm.finalize())) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - if (ORTE_SUCCESS != (ret = orte_plm.init())) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * RML - Enable communications - */ - if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { - exit_status = ret; - goto cleanup; - } - - /* - * Session directory re-init - */ - if (orte_create_session_dirs) { - if (ORTE_SUCCESS != (ret = orte_session_dir(true, - orte_process_info.tmpdir_base, - orte_process_info.nodename, - NULL, /* Batch ID -- Not used */ - ORTE_PROC_MY_NAME))) { - exit_status = ret; - } - - opal_output_set_output_file_info(orte_process_info.proc_session_dir, - "output-", NULL, NULL); - } - - /* - * Notify Routed - */ - if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_RESTART))) { - exit_status = ret; - goto cleanup; - } - - /* - * Notify SnapC - */ - if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_RESTART))) { - exit_status = ret; - goto cleanup; - } - - /* - * Send new PID to HNP/daemon - * The checkpointer could have used a proxy program to boot us - * so the pid that the orted got from fork() may not be the - * PID of this application. - * - Note: BLCR does this because it tries to preseve the PID - * of the program across checkpointes - */ - if( ORTE_SUCCESS != (ret = ess_slave_ft_event_update_process_info(orte_process_info.my_name, getpid())) ) { - exit_status = ret; - goto cleanup; - } - - /* if one was provided, build my nidmap */ - if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - else if (OPAL_CRS_TERM == state ) { - /* Nothing */ - } - else { - /* Error state = Nothing */ - } - - cleanup: - - return exit_status; -} - -static int ess_slave_ft_event_update_process_info(orte_process_name_t proc, pid_t proc_pid) -{ - int ret, exit_status = ORTE_SUCCESS; - opal_buffer_t buffer; - orte_snapc_cmd_flag_t command = ORTE_SNAPC_LOCAL_UPDATE_CMD; - - OBJ_CONSTRUCT(&buffer, opal_buffer_t); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &command, 1, ORTE_SNAPC_CMD )) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &proc, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &proc_pid, 1, OPAL_PID))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &buffer, ORTE_RML_TAG_SNAPC, 0))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - cleanup: - OBJ_DESTRUCT(&buffer); - - return exit_status; -} -#endif - diff --git a/orte/mca/plm/base/Makefile.am b/orte/mca/plm/base/Makefile.am index 1dc6aa0144..e7584a28e5 100644 --- a/orte/mca/plm/base/Makefile.am +++ b/orte/mca/plm/base/Makefile.am @@ -28,8 +28,7 @@ if !ORTE_DISABLE_FULL_SUPPORT dist_pkgdata_DATA += base/help-plm-base.txt headers += \ - base/plm_private.h \ - base/plm_base_rsh_support.h + base/plm_private.h libmca_plm_la_SOURCES += \ base/plm_base_close.c \ @@ -38,6 +37,5 @@ libmca_plm_la_SOURCES += \ base/plm_base_launch_support.c \ base/plm_base_jobid.c \ base/plm_base_proxy.c \ - base/plm_base_orted_cmds.c \ - base/plm_base_rsh_support.c + base/plm_base_orted_cmds.c endif diff --git a/orte/mca/plm/base/plm_base_close.c b/orte/mca/plm/base/plm_base_close.c index f53c759e76..68337c5043 100644 --- a/orte/mca/plm/base/plm_base_close.c +++ b/orte/mca/plm/base/plm_base_close.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -30,7 +32,6 @@ #include "orte/mca/plm/base/base.h" #include "orte/mca/plm/base/plm_private.h" -#include "orte/mca/plm/base/plm_base_rsh_support.h" int orte_plm_base_finalize(void) { @@ -66,21 +67,6 @@ int orte_plm_base_close(void) OBJ_DESTRUCT(&orte_plm_globals.spawn_lock); OBJ_DESTRUCT(&orte_plm_globals.spawn_cond); -#ifndef __WINDOWS__ - /* clearout the rsh support */ - orte_plm_base_local_slave_finalize(); -#endif - - /* remove the rsh agent info */ - if (NULL != orte_plm_globals.rsh_agent_argv) { - opal_argv_free(orte_plm_globals.rsh_agent_argv); - } - if (NULL != orte_plm_globals.rsh_agent_path) { - free(orte_plm_globals.rsh_agent_path); - } - - OBJ_DESTRUCT(&orte_plm_globals.slave_files); - /* Close all open components */ mca_base_components_close(orte_plm_globals.output, &orte_plm_base.available_components, NULL); diff --git a/orte/mca/plm/base/plm_base_open.c b/orte/mca/plm/base/plm_base_open.c index 549f33489c..b560f407e4 100644 --- a/orte/mca/plm/base/plm_base_open.c +++ b/orte/mca/plm/base/plm_base_open.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -55,44 +57,6 @@ int orte_plm_base_open(void) #else -static void slave_file_construct(orte_slave_files_t *ptr) -{ - ptr->node = NULL; - ptr->local = false; - ptr->prefix = NULL; - ptr->bootproxy = NULL; - ptr->positioned = false; - OBJ_CONSTRUCT(&ptr->apps, opal_pointer_array_t); - opal_pointer_array_init(&ptr->apps, 8, 1024, 8); - OBJ_CONSTRUCT(&ptr->files, opal_pointer_array_t); - opal_pointer_array_init(&ptr->files, 8, 1024, 8); -} -static void slave_file_destruct(orte_slave_files_t *ptr) -{ - int i; - char *cptr; - - if (NULL != ptr->node) free(ptr->node); - if (NULL != ptr->prefix) free(ptr->prefix); - if (NULL != ptr->bootproxy) free(ptr->bootproxy); - for (i=0; i < ptr->apps.size; i++) { - if (NULL != (cptr = (char*)opal_pointer_array_get_item(&ptr->apps, i))) { - free(cptr); - } - } - OBJ_DESTRUCT(&ptr->apps); - for (i=0; i < ptr->files.size; i++) { - if (NULL != (cptr = (char*)opal_pointer_array_get_item(&ptr->files, i))) { - free(cptr); - } - } - OBJ_DESTRUCT(&ptr->files); -} -OBJ_CLASS_INSTANCE(orte_slave_files_t, - opal_list_item_t, - slave_file_construct, - slave_file_destruct); - /* * Global public variables */ @@ -146,12 +110,6 @@ int orte_plm_base_open(void) /* init the next jobid */ orte_plm_globals.next_jobid = 1; - /* init the rsh support */ - orte_plm_globals.rsh_agent_argv = NULL; - orte_plm_globals.rsh_agent_path = NULL; - orte_plm_globals.local_slaves = 0; - OBJ_CONSTRUCT(&orte_plm_globals.slave_files, opal_list_t); - /* Open up all the components that we can find */ if (ORTE_SUCCESS != diff --git a/orte/mca/plm/base/plm_base_rsh_support.c b/orte/mca/plm/base/plm_base_rsh_support.c deleted file mode 100644 index bfe83962cf..0000000000 --- a/orte/mca/plm/base/plm_base_rsh_support.c +++ /dev/null @@ -1,1835 +0,0 @@ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#ifdef HAVE_STRING_H -#include -#endif -#ifdef HAVE_STRINGS_H -#include -#endif -#ifdef HAVE_SYS_TIME_H -#include -#endif -#include -#include -#include -#ifdef HAVE_PWD_H -#include -#endif -#include - -#include "opal/mca/installdirs/installdirs.h" -#include "opal/util/os_path.h" -#include "opal/util/output.h" -#include "opal/util/os_dirpath.h" -#include "opal/util/path.h" -#include "opal/util/argv.h" -#include "opal/util/basename.h" -#include "opal/util/opal_environ.h" -#include "opal/util/if.h" - -#include "opal/dss/dss.h" - -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ess/ess.h" -#include "orte/mca/rmaps/rmaps_types.h" -#include "orte/runtime/orte_globals.h" -#include "orte/util/show_help.h" -#include "orte/runtime/orte_wait.h" -#include "orte/util/name_fns.h" -#include "orte/util/dash_host/dash_host.h" - -#include "orte/mca/plm/base/plm_private.h" -#include "orte/mca/plm/base/plm_base_rsh_support.h" - -/* These strings *must* follow the same order as the enum ORTE_PLM_RSH_SHELL_* */ -const char *orte_plm_rsh_shell_name[7] = { - "bash", - "zsh", - "tcsh", /* tcsh has to be first otherwise strstr finds csh */ - "csh", - "ksh", - "sh", - "unknown" -}; - - -#ifndef __WINDOWS__ -static char **search(const char* agent_list, const char *path); - -int orte_plm_base_rsh_launch_agent_lookup(const char *agent_list, char *path) -{ - char **tmp; - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:rsh_lookup on agent %s path %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == agent_list) ? orte_rsh_agent : agent_list, - (NULL == path) ? "NULL" : path)); - if (NULL == (tmp = search(agent_list, path))) { - return ORTE_ERR_NOT_FOUND; - } - - /* if we got here, then one of the given agents could be found */ - opal_argv_free(tmp); - return ORTE_SUCCESS; -} - -int orte_plm_base_rsh_launch_agent_setup(const char *agent, char *path) -{ - char *bname; - int i; - - /* if no agent was provided, then report not found */ - if (NULL == orte_rsh_agent && NULL == agent) { - return ORTE_ERR_NOT_FOUND; - } - - /* search for the argv */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:rsh_setup on agent %s path %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == agent) ? orte_rsh_agent : agent, - (NULL == path) ? "NULL" : path)); - orte_plm_globals.rsh_agent_argv = search(agent, path); - - if (0 == opal_argv_count(orte_plm_globals.rsh_agent_argv)) { - /* nothing was found */ - return ORTE_ERR_NOT_FOUND; - } - - /* see if we can find the agent in the path */ - orte_plm_globals.rsh_agent_path = - opal_path_findv(orte_plm_globals.rsh_agent_argv[0], X_OK, - environ, path); - - if (NULL == orte_plm_globals.rsh_agent_path) { - /* not an error - just report not found */ - opal_argv_free(orte_plm_globals.rsh_agent_argv); - return ORTE_ERR_NOT_FOUND; - } - - bname = opal_basename(orte_plm_globals.rsh_agent_argv[0]); - if (NULL != bname && 0 == strcmp(bname, "ssh")) { - /* if xterm option was given, add '-X', ensuring we don't do it twice */ - if (NULL != orte_xterm) { - opal_argv_append_unique_nosize(&orte_plm_globals.rsh_agent_argv, "-X", false); - } else if (0 >= opal_output_get_verbosity(orte_plm_globals.output)) { - /* if debug was not specified, and the user didn't explicitly - * specify X11 forwarding/non-forwarding, add "-x" if it - * isn't already there (check either case) - */ - for (i = 1; NULL != orte_plm_globals.rsh_agent_argv[i]; ++i) { - if (0 == strcasecmp("-x", - orte_plm_globals.rsh_agent_argv[i])) { - break; - } - } - if (NULL == orte_plm_globals.rsh_agent_argv[i]) { - opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-x"); - } - } - } - - /* the caller can append any additional argv's they desire */ - return ORTE_SUCCESS; -} - -/**** SLAVE LAUNCH SUPPORT ****/ - -static bool ack_recvd; - -static void release_ack(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - ack_recvd = true; - OBJ_RELEASE(mev); -} - -static void recv_ack(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, release_ack); -} - -static void set_handler_default(int sig) -{ - struct sigaction act; - - act.sa_handler = SIG_DFL; - act.sa_flags = 0; - sigemptyset(&act.sa_mask); - - sigaction(sig, &act, (struct sigaction *)0); -} - -int orte_plm_base_local_slave_launch(orte_job_t *jdata) -{ - char **argv; - opal_list_t hosts; - orte_node_t *node; - char *nodename; - char *exec_path; - orte_app_context_t *app; - int rc; - pid_t pid; - long fd, fdmax = sysconf(_SC_OPEN_MAX); - sigset_t sigs; - - /* point to the apps array */ - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - - /* increment the local slave jobid */ - orte_plm_globals.local_slaves++; - - /* identify the target host - can only be one! */ - OBJ_CONSTRUCT(&hosts, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&hosts, app->dash_host))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&hosts); - return rc; - } - if (1 < opal_list_get_size(&hosts)) { - orte_show_help("help-plm-base.txt", "too-many-hosts", true, (int)opal_list_get_size(&hosts)); - return ORTE_ERROR; - } - node = (orte_node_t*)opal_list_remove_first(&hosts); - nodename = strdup(node->name); - OBJ_RELEASE(node); - OBJ_DESTRUCT(&hosts); - - /* set the jobid in jdata so the caller knows what it is */ - jdata->jobid = orte_plm_globals.local_slaves; - - /* setup the launch */ - if (ORTE_SUCCESS != (rc = orte_plm_base_setup_slave_launch(nodename, app, - "orte-bootproxy.sh", - &argv, &exec_path))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* add the bootproxy cmd line options */ - if (ORTE_SUCCESS != (rc = orte_plm_base_append_bootproxy_args(app, &argv, - jdata->jobid, 0, /* jobid, vpid */ - 1, 1, /* #nodes, #procs */ - 0, 0, /* nrank, lrank */ - 1, 1, /* #local, #slots */ - true))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* fork a child to exec the rsh/ssh session */ - pid = fork(); - if (pid < 0) { - ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); - return ORTE_ERR_SYS_LIMITS_CHILDREN; - } - - /* child */ - if (pid == 0) { - /* close all file descriptors w/ exception of stdin/stdout/stderr */ - for(fd=3; fdcontrols & ORTE_JOB_CONTROL_NON_ORTE_JOB)) { - ack_recvd = false; - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, - ORTE_RML_NON_PERSISTENT, recv_ack, NULL); - - ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1); - /* to release this job from the wait in plm_base_receive, we have to - * flag it as having reported - */ - jdata->num_reported = jdata->num_procs; - } - - /* cleanup */ - free(exec_path); - opal_argv_free(argv); - } - - return ORTE_SUCCESS; -} - -/* - * Take a colon-delimited list of agents and locate the first one that - * we are able to find in the PATH. Split that one into argv and - * return it. If nothing found, then return NULL. - */ -static char **search(const char* agent_list, const char *path) -{ - int i, j; - char *line, **lines; - char **tokens, *tmp; - char cwd[OPAL_PATH_MAX]; - - if (NULL == path) { - getcwd(cwd, OPAL_PATH_MAX); - } else { - strncpy(cwd, path, OPAL_PATH_MAX); - } - if (NULL == agent_list) { - lines = opal_argv_split(orte_rsh_agent, ':'); - } else { - lines = opal_argv_split(agent_list, ':'); - } - for (i = 0; NULL != lines[i]; ++i) { - line = lines[i]; - - /* Trim whitespace at the beginning and end of the line */ - for (j = 0; '\0' != line[j] && isspace(line[j]); ++line) { - continue; - } - for (j = strlen(line) - 2; j > 0 && isspace(line[j]); ++j) { - line[j] = '\0'; - } - if (strlen(line) <= 0) { - continue; - } - - /* Split it */ - tokens = opal_argv_split(line, ' '); - - /* Look for the first token in the PATH */ - tmp = opal_path_findv(tokens[0], X_OK, environ, cwd); - if (NULL != tmp) { - free(tokens[0]); - tokens[0] = tmp; - opal_argv_free(lines); - return tokens; - } - - /* Didn't find it */ - opal_argv_free(tokens); - } - - /* Doh -- didn't find anything */ - opal_argv_free(lines); - return NULL; -} - -void orte_plm_base_local_slave_finalize(void) -{ - opal_list_item_t *item; - orte_slave_files_t *slave_node; - char *cmd, *filenm, **argv; - int i; - bool first; - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave:finalize", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - while (NULL != (item = opal_list_remove_first(&orte_plm_globals.slave_files))) { - slave_node = (orte_slave_files_t*)item; - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave:finalize - entry for node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), slave_node->node)); - - /* we will use the bootproxy cmd script to clean up for us. All we - * have to do is tell it to run in CLEANUP mode, and then tell it - * the APPS and FILES it needs to cleanup - */ - - if (slave_node->local) { - /* setup the bootproxy cmd */ - argv = NULL; - opal_argv_append_nosize(&argv, slave_node->bootproxy); - } else { - /* Start the argv with the rsh/ssh command */ - argv = opal_argv_copy(orte_plm_globals.rsh_agent_argv); - /* add the hostname */ - opal_argv_append_nosize(&argv, slave_node->node); - /* add the bootproxy cmd */ - opal_argv_append_nosize(&argv, slave_node->bootproxy); - } - /* pass the CLEANUP mode */ - opal_argv_append_nosize(&argv, "CLEANUP"); - /* pass the name of the apps running on the node - the bootproxy will - * send a TERM signal to each of them - */ - first = true; - for (i=0; i < slave_node->apps.size; i++) { - if (NULL == (filenm = opal_pointer_array_get_item(&slave_node->apps, i))) { - continue; - } - if (first) { - opal_argv_append_nosize(&argv, "APPS"); - first = false; - } - opal_argv_append_nosize(&argv, filenm); - } - /* remove any files we positioned */ - first = true; - for (i=0; i < slave_node->files.size; i++) { - if (NULL == (filenm = opal_pointer_array_get_item(&slave_node->files, i))) { - continue; - } - if (first) { - opal_argv_append_nosize(&argv, "FILES"); - first = false; - } - opal_argv_append_nosize(&argv, filenm); - } - /* execute the cmd */ - cmd = opal_argv_join(argv, ' '); - opal_argv_free(argv); - argv = NULL; - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave:finalize - removing files with cmd:\n\t%s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cmd)); - system(cmd); - free(cmd); - /* now remove the bootproxy itself, if needed */ - if (slave_node->positioned) { - if (slave_node->local) { - asprintf(&cmd, "rm -f %s", slave_node->bootproxy); - } else { - /* Start the argv with the rsh/ssh command */ - argv = opal_argv_copy(orte_plm_globals.rsh_agent_argv); - /* add the hostname */ - opal_argv_append_nosize(&argv, slave_node->node); - /* add the rm cmd */ - opal_argv_append_nosize(&argv, "rm -f"); - /* add the bootproxy file */ - opal_argv_append_nosize(&argv, slave_node->bootproxy); - /* form the cmd */ - cmd = opal_argv_join(argv, ' '); - opal_argv_free(argv); - argv = NULL; - } - /* execute it */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave:finalize - removing bootproxy with cmd:\n\t%s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cmd)); - system(cmd); - free(cmd); - } - OBJ_RELEASE(item); - } -} - -static orte_plm_rsh_shell_t find_shell(char *shell) -{ - int i = 0; - char *sh_name = NULL; - - if( (NULL == shell) || (strlen(shell) == 1) ) { - /* Malformed shell */ - return ORTE_PLM_RSH_SHELL_UNKNOWN; - } - - sh_name = rindex(shell, '/'); - if( NULL == sh_name ) { - /* Malformed shell */ - return ORTE_PLM_RSH_SHELL_UNKNOWN; - } - - /* skip the '/' */ - ++sh_name; - for (i = 0; i < (int)(sizeof (orte_plm_rsh_shell_name) / - sizeof(orte_plm_rsh_shell_name[0])); ++i) { - if (0 == strcmp(sh_name, orte_plm_rsh_shell_name[i])) { - return (orte_plm_rsh_shell_t)i; - } - } - - /* We didn't find it */ - return ORTE_PLM_RSH_SHELL_UNKNOWN; -} - -/** - * Check the Shell variable on the specified node - */ - -int orte_plm_base_rsh_shell_probe(char *nodename, orte_plm_rsh_shell_t *shell) -{ - char ** argv; - int argc, rc = ORTE_SUCCESS, i; - int fd[2]; - pid_t pid; - char outbuf[4096]; - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: going to check SHELL variable on node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - nodename)); - - *shell = ORTE_PLM_RSH_SHELL_UNKNOWN; - if (pipe(fd)) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:rsh: pipe failed with errno=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - errno)); - return ORTE_ERR_IN_ERRNO; - } - if ((pid = fork()) < 0) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: fork failed with errno=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - errno)); - return ORTE_ERR_IN_ERRNO; - } - else if (pid == 0) { /* child */ - if (dup2(fd[1], 1) < 0) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: dup2 failed with errno=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - errno)); - exit(01); - } - /* Build argv array */ - argv = opal_argv_copy(orte_plm_globals.rsh_agent_argv); - argc = opal_argv_count(orte_plm_globals.rsh_agent_argv); - opal_argv_append(&argc, &argv, nodename); - opal_argv_append(&argc, &argv, "echo $SHELL"); - - execvp(argv[0], argv); - exit(errno); - } - if (close(fd[1])) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: close failed with errno=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - errno)); - return ORTE_ERR_IN_ERRNO; - } - - { - ssize_t ret = 1; - char* ptr = outbuf; - size_t outbufsize = sizeof(outbuf); - - do { - ret = read (fd[0], ptr, outbufsize-1); - if (ret < 0) { - if (errno == EINTR) - continue; - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: Unable to detect the remote shell (error %s)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - strerror(errno))); - rc = ORTE_ERR_IN_ERRNO; - break; - } - if( outbufsize > 1 ) { - outbufsize -= ret; - ptr += ret; - } - } while( 0 != ret ); - *ptr = '\0'; - } - close(fd[0]); - - if( outbuf[0] != '\0' ) { - char *sh_name = rindex(outbuf, '/'); - if( NULL != sh_name ) { - sh_name++; /* skip '/' */ - /* We cannot use "echo -n $SHELL" because -n is not portable. Therefore - * we have to remove the "\n" */ - if ( sh_name[strlen(sh_name)-1] == '\n' ) { - sh_name[strlen(sh_name)-1] = '\0'; - } - /* Search for the substring of known shell-names */ - for (i = 0; i < (int)(sizeof (orte_plm_rsh_shell_name)/ - sizeof(orte_plm_rsh_shell_name[0])); i++) { - if ( 0 == strcmp(sh_name, orte_plm_rsh_shell_name[i]) ) { - *shell = (orte_plm_rsh_shell_t)i; - break; - } - } - } - } - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: node %s has SHELL: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - nodename, - (ORTE_PLM_RSH_SHELL_UNKNOWN == *shell) ? "UNHANDLED" : (char*)orte_plm_rsh_shell_name[*shell])); - - return rc; -} - -int orte_plm_base_rsh_setup_shell(orte_plm_rsh_shell_t *rshell, - orte_plm_rsh_shell_t *lshell, - char *nodename, char ***argv) -{ - orte_plm_rsh_shell_t remote_shell, local_shell; - struct passwd *p; - char *param; - int rc; - - /* What is our local shell? */ - local_shell = ORTE_PLM_RSH_SHELL_UNKNOWN; - p = getpwuid(getuid()); - if( NULL == p ) { - /* This user is unknown to the system. Therefore, there is no reason we - * spawn whatsoever in his name. Give up with a HUGE error message. - */ - orte_show_help( "help-plm-rshd.txt", "unknown-user", true, (int)getuid() ); - return ORTE_ERR_FATAL; - } - param = p->pw_shell; - local_shell = find_shell(p->pw_shell); - - /* If we didn't find it in getpwuid(), try looking at the $SHELL - environment variable (see https://svn.open-mpi.org/trac/ompi/ticket/1060) - */ - if (ORTE_PLM_RSH_SHELL_UNKNOWN == local_shell && - NULL != (param = getenv("SHELL"))) { - local_shell = find_shell(param); - } - - if (ORTE_PLM_RSH_SHELL_UNKNOWN == local_shell) { - opal_output(0, "WARNING: local probe returned unhandled shell:%s assuming bash\n", - (NULL != param) ? param : "unknown"); - local_shell = ORTE_PLM_RSH_SHELL_BASH; - } - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: local shell: %d (%s)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - local_shell, orte_plm_rsh_shell_name[local_shell])); - - /* What is our remote shell? */ - if (orte_assume_same_shell) { - remote_shell = local_shell; - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: assuming same remote shell as local shell", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - } else { - rc = orte_plm_base_rsh_shell_probe(nodename, &remote_shell); - - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - return rc; - } - - if (ORTE_PLM_RSH_SHELL_UNKNOWN == remote_shell) { - opal_output(0, "WARNING: shell probe returned unhandled shell; assuming bash\n"); - remote_shell = ORTE_PLM_RSH_SHELL_BASH; - } - } - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:base: remote shell: %d (%s)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - remote_shell, orte_plm_rsh_shell_name[remote_shell])); - - /* Do we need to source .profile on the remote side? - - sh: yes (see bash(1)) - - ksh: yes (see ksh(1)) - - bash: no (see bash(1)) - - [t]csh: no (see csh(1) and tcsh(1)) - - zsh: no (see http://zsh.sourceforge.net/FAQ/zshfaq03.html#l19) - */ - - if (ORTE_PLM_RSH_SHELL_SH == remote_shell || - ORTE_PLM_RSH_SHELL_KSH == remote_shell) { - int i; - char **tmp; - tmp = opal_argv_split("( test ! -r ./.profile || . ./.profile;", ' '); - if (NULL == tmp) { - return ORTE_ERR_OUT_OF_RESOURCE; - } - for (i = 0; NULL != tmp[i]; ++i) { - opal_argv_append_nosize(argv, tmp[i]); - } - opal_argv_free(tmp); - } - - /* pass results back */ - *rshell = remote_shell; - *lshell = local_shell; - - return ORTE_SUCCESS; -} - -int orte_plm_base_setup_slave_launch(char *nodename, orte_app_context_t *app, - char *rcmd, char ***argv, char **exec_path) -{ - orte_slave_files_t *slave_node, *tst_node; - opal_list_item_t *item; - char *bootproxy, *cmd, *scp=NULL; - char *exefile=NULL, *basename, *path=NULL; - char *tmp, *dest, *dest_dir, *filenm; - char **files; - char cwd[OPAL_PATH_MAX]; - int rc, i, j; - char *lib_base, *bin_base; - orte_plm_rsh_shell_t rshell, lshell; - char **tmpargv=NULL; - char *opal_prefix; - - /* set default */ - *exec_path = NULL; - *argv = NULL; - - /* Figure out the basenames for the libdir and bindir. This - requires some explanation: - - - Use opal_install_dirs.libdir and opal_install_dirs.bindir. - - - After a discussion on the devel-core mailing list, the - developers decided that we should use the local directory - basenames as the basis for the prefix on the remote note. - This does not handle a few notable cases (e.g., if the - libdir/bindir is not simply a subdir under the prefix, if the - libdir/bindir basename is not the same on the remote node as - it is here on the local node, etc.), but we decided that - --prefix was meant to handle "the common case". If you need - something more complex than this, a) edit your shell startup - files to set PATH/LD_LIBRARY_PATH properly on the remove - node, or b) use some new/to-be-defined options that - explicitly allow setting the bindir/libdir on the remote - node. We decided to implement these options (e.g., - --remote-bindir and --remote-libdir) to orterun when it - actually becomes a problem for someone (vs. a hypothetical - situation). - - Hence, for now, we simply take the basename of this install's - libdir and bindir and use it to append this install's prefix - and use that on the remote node. - */ - - lib_base = opal_basename(opal_install_dirs.libdir); - bin_base = opal_basename(opal_install_dirs.bindir); - opal_prefix = getenv("OPAL_PREFIX"); - - /* have we launched anything on this node before? */ - slave_node = NULL; - for (item = opal_list_get_first(&orte_plm_globals.slave_files); - item != opal_list_get_end(&orte_plm_globals.slave_files); - item = opal_list_get_next(item)) { - tst_node = (orte_slave_files_t*)item; - if (0 == strcmp(tst_node->node, nodename)) { - slave_node = tst_node; - break; - } - } - if (NULL == slave_node) { - slave_node = OBJ_NEW(orte_slave_files_t); - slave_node->node = strdup(nodename); - /* save the bootproxy cmd */ - slave_node->bootproxy = strdup(rcmd); - /* is this a local operation? */ - if (0 == strcmp(orte_process_info.nodename, nodename) || - 0 == strcmp(nodename, "localhost") || - opal_ifislocal(nodename)) { - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: node %s is local", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename)); - slave_node->local = true; - /* use the prefix, if given */ - if (NULL != app->prefix_dir) { - asprintf(&slave_node->prefix, "%s/%s", app->prefix_dir, bin_base); - } else { - /* use our install dirs */ - slave_node->prefix = strdup(opal_install_dirs.bindir); - } - /* no need to preposition the remote cmd, and no need to remove it */ - slave_node->positioned = false; - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: setting prefix to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), slave_node->prefix)); - } else { - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: node %s is remote", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename)); - /* setup the correct shell info */ - if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_setup_shell(&rshell, &lshell, - nodename, &tmpargv))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(slave_node); - return rc; - } - /* we now need to assemble the actual cmd that will be executed - this depends - * upon whether or not a prefix directory is being used - */ - if (NULL != app->prefix_dir) { - /* if we have a prefix directory, we need to set the PATH and - * LD_LIBRARY_PATH on the remote node, and prepend the eventual cmd - * with the prefix directory - */ - if (ORTE_PLM_RSH_SHELL_SH == rshell || - ORTE_PLM_RSH_SHELL_KSH == rshell || - ORTE_PLM_RSH_SHELL_ZSH == rshell || - ORTE_PLM_RSH_SHELL_BASH == rshell) { - asprintf (&slave_node->prefix, - "%s%s%s PATH=%s/%s:$PATH ; export PATH ; " - "LD_LIBRARY_PATH=%s/%s:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; " - "%s/%s", - (opal_prefix != NULL ? "OPAL_PREFIX=" : ""), - (opal_prefix != NULL ? opal_prefix : ""), - (opal_prefix != NULL ? " ; export OPAL_PREFIX;" : ""), - app->prefix_dir, bin_base, - app->prefix_dir, lib_base, - app->prefix_dir, bin_base); - } else if (ORTE_PLM_RSH_SHELL_TCSH == rshell || - ORTE_PLM_RSH_SHELL_CSH == rshell) { - /* [t]csh is a bit more challenging -- we - have to check whether LD_LIBRARY_PATH - is already set before we try to set it. - Must be very careful about obeying - [t]csh's order of evaluation and not - using a variable before it is defined. - See this thread for more details: - http://www.open-mpi.org/community/lists/users/2006/01/0517.php. */ - asprintf (&slave_node->prefix, - "%s%s%s set path = ( %s/%s $path ) ; " - "if ( $?LD_LIBRARY_PATH == 1 ) " - "set OMPI_have_llp ; " - "if ( $?LD_LIBRARY_PATH == 0 ) " - "setenv LD_LIBRARY_PATH %s/%s ; " - "if ( $?OMPI_have_llp == 1 ) " - "setenv LD_LIBRARY_PATH %s/%s:$LD_LIBRARY_PATH ; " - "%s/%s", - (opal_prefix != NULL ? "setenv OPAL_PREFIX " : ""), - (opal_prefix != NULL ? opal_prefix : ""), - (opal_prefix != NULL ? " ;" : ""), - app->prefix_dir, bin_base, - app->prefix_dir, lib_base, - app->prefix_dir, lib_base, - app->prefix_dir, bin_base); - } else { - orte_show_help("help-plm-rshd.txt", "cannot-resolve-shell-with-prefix", true, - (NULL == opal_prefix) ? "NULL" : opal_prefix, - app->prefix_dir); - return ORTE_ERR_SILENT; - } - /* since we have a prefix, we don't need to preposition the bootproxy - * or remove it later - */ - slave_node->positioned = false; - } else if (NULL != app->preload_files_dest_dir) { - /* the prefix will be the same as the preload destination */ - slave_node->prefix = strdup(app->preload_files_dest_dir); - /* flag to preload it, and remove it later */ - slave_node->positioned = true; - } else if (NULL != orte_process_info.tmpdir_base) { - /* use the tmpdir base */ - slave_node->prefix = strdup(orte_process_info.tmpdir_base); - /* flag to preload it, and remove it later */ - slave_node->positioned = true; - } else { - /* we have to preposition somewhere - default to /tmp */ - slave_node->prefix = strdup("/tmp"); - /* flag to preload it, and remove it later */ - slave_node->positioned = true; - } - - /* do we need to preload the bootproxy on this node? */ - if (slave_node->positioned) { - /* find the local bootproxy */ - bootproxy = opal_find_absolute_path(rcmd); - if (NULL == bootproxy) { - orte_show_help("help-plm-base.txt", "bootproxy-not-found", true, rcmd); - return ORTE_ERR_NOT_FOUND; - } - path = opal_os_path(false, slave_node->prefix, rcmd, NULL); - /* find the scp command */ - scp = opal_find_absolute_path("scp"); - if (NULL == scp) { - orte_show_help("help-plm-base.txt", "cp-not-found", true, "scp", "scp"); - return ORTE_ERROR; - } - /* form and execute the scp command */ - asprintf(&cmd, "%s %s %s:%s", scp, bootproxy, nodename, path); - system(cmd); - free(cmd); - free(path); - free(bootproxy); - } - } - /* add this node to our list */ - opal_list_append(&orte_plm_globals.slave_files, &slave_node->super); - } - - /* if we are going to position the binary or files, did they give us a dest? */ - if (NULL != app->preload_files_dest_dir) { - /* the target location -must- be an absolute path */ - if (!opal_path_is_absolute(app->preload_files_dest_dir)) { - orte_show_help("help-plm-base.txt", "abs-path-reqd", true, app->preload_files_dest_dir); - return ORTE_ERROR; - } - dest_dir = app->preload_files_dest_dir; - /* if this is a local op, make sure this location exists. we can't - * do this for remote ops as there is no way to create a remote - * directory - */ - if (slave_node->local) { - if (ORTE_SUCCESS != (rc = opal_os_dirpath_create(dest_dir, S_IRWXU))) { - orte_show_help("help-plm-base.txt", "path-not-created", true, dest_dir); - return rc; - } - } - } else if (NULL != orte_process_info.tmpdir_base) { - /* put everything in the tmpdir base */ - dest_dir = orte_process_info.tmpdir_base; - } else { - /* put everything in /tmp */ - dest_dir = "/tmp"; - } - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: destination dir set to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), dest_dir)); - - /* setup the exec_path to the bootproxy */ - if (slave_node->local) { - /* if this is a local operation, then just set - * the exec_path to be the bootproxy - */ - *argv = NULL; - asprintf(exec_path, "%s/%s", slave_node->prefix, rcmd); - opal_argv_append_nosize(argv, *exec_path); - } else { - /* set the exec path to the rsh agent path */ - *exec_path = strdup(orte_plm_globals.rsh_agent_path); - /* Start the argv with the rsh/ssh command */ - *argv = opal_argv_copy(orte_plm_globals.rsh_agent_argv); - /* add the hostname */ - opal_argv_append_nosize(argv, nodename); - /* add the bootproxy cmd */ - if (NULL != slave_node->prefix) { - asprintf(&tmp, "%s/%s", slave_node->prefix, rcmd); - } else { - tmp = strdup(rcmd); - } - opal_argv_append_nosize(argv, tmp); - free(tmp); - } - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: exec_path set to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), *exec_path)); - - /* do we need to preload the binary? */ - if (app->preload_binary) { - char * src; - /* if the binary is not given in absolute path form, - * then convert it to one - */ - if (!opal_path_is_absolute(app->app)) { - /* see if a source directory was given */ - if (NULL!= app->preload_files_src_dir) { - /* prepend the src dir to the executable name */ - path = opal_os_path(false, app->preload_files_src_dir, app->app, NULL); - /* now check for the existence of the app */ - src = opal_find_absolute_path(path); - if (NULL == src) { - orte_show_help("help-plm-base.txt", "exec-not-found", true, path); - return ORTE_ERROR; - } - } else { - /* look for it in the cwd */ - getcwd(cwd, OPAL_PATH_MAX); - src = opal_path_access(app->app, cwd, X_OK); - if (NULL == src) { - orte_show_help("help-plm-base.txt", "exec-not-found", true, cwd); - return ORTE_ERROR; - } - } - } else { - src = opal_path_access(app->app, NULL, X_OK); - if (NULL == src) { - orte_show_help("help-plm-base.txt", "exec-not-found", true, app->app); - return ORTE_ERROR; - } - } - /* get the basename */ - basename = opal_basename(app->app); - - /* define the destination */ - dest = opal_os_path(false, dest_dir, basename, NULL); - - /* - * We do not test for error after opal_basename -- this is fine, as opal_os_path - * is taking a NULL terminated list -- in case of error, well dest_dir is the final dir. - * However, we need to free basename here, before overwriting the pointer later. - */ - if (basename != NULL) { - free(basename); - } - - /* has this binary already been positioned? */ - for (i=0; i < slave_node->apps.size; i++) { - if (NULL != (filenm = opal_pointer_array_get_item(&slave_node->apps, i)) && - 0 == strcmp(filenm, dest)) { - /* this app already has been positioned on the node - skip it */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: app %s already positioned", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), filenm)); - goto PRELOAD_FILES; - } - } - /* add the app to the slave_node list */ - opal_pointer_array_add(&slave_node->apps, strdup(dest)); - /* since we are positioning the binary, add it to the list - * of files to be cleaned up when done - */ - opal_pointer_array_add(&slave_node->files, strdup(dest)); - - /* if this is a local node, then we just use the cp command */ - if (slave_node->local) { - scp = opal_find_absolute_path("cp"); - if (NULL == scp) { - free (src); - orte_show_help("help-plm-base.txt", "cp-not-found", true, "cp", "cp"); - return ORTE_ERROR; - } - /* form and execute the cp commands */ - asprintf(&cmd, "%s %s %s", scp, src, dest); - system(cmd); - free(cmd); - } else { - /* find the scp command */ - scp = opal_find_absolute_path("scp"); - if (NULL == scp) { - free (src); - orte_show_help("help-plm-base.txt", "cp-not-found", true, "scp", "scp"); - return ORTE_ERROR; - } - /* form and execute the scp commands */ - asprintf(&cmd, "%s %s %s:%s", scp, src, nodename, dest); - system(cmd); - free(cmd); - } - free(src); - free(dest); - free(scp); - } else { - /* we don't need to pre-position the binary, but we do need - * to check if we should record it - */ - for (i=0; i < slave_node->apps.size; i++) { - if (NULL != (filenm = opal_pointer_array_get_item(&slave_node->apps, i)) && - 0 == strcmp(filenm, app->app)) { - /* this app already has been positioned on the node - skip it */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: app %s already positioned", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), filenm)); - goto PRELOAD_FILES; - } - } - /* add the app to the slave_node list */ - opal_pointer_array_add(&slave_node->apps, strdup(app->app)); - /* do not add it to the files to be cleaned up when done as - * we are not positioning it! - */ - } - -PRELOAD_FILES: - /* do we need to pre-position supporting files? */ - if (NULL != app->preload_files) { - if (slave_node->local) { - scp = opal_find_absolute_path("cp"); - if (NULL == scp) { - orte_show_help("help-plm-base.txt", "cp-not-found", true, "cp", "cp"); - return ORTE_ERROR; - } - } else { - /* find the scp command */ - scp = opal_find_absolute_path("scp"); - if (NULL == scp) { - orte_show_help("help-plm-base.txt", "cp-not-found", true, "scp", "scp"); - return ORTE_ERROR; - } - } - /* break apart the comma-separated list of files */ - files = opal_argv_split(app->preload_files, ','); - /* copy each file across */ - for (i=0; i < opal_argv_count(files); i++) { - /* if the file is not given in absolute path form, - * then convert it to one - */ - if (!opal_path_is_absolute(files[i])) { - /* see if a source directory was given */ - if (NULL!= app->preload_files_src_dir) { - /* look for the file there */ - exefile = opal_path_access(files[i], app->preload_files_src_dir, R_OK); - } else { - /* look for it in the cwd */ - getcwd(cwd, OPAL_PATH_MAX); - exefile = opal_path_access(files[i], cwd, R_OK); - } - } else { - exefile = opal_path_access(files[i], NULL, R_OK); - } - if (NULL == exefile) { - getcwd(cwd, OPAL_PATH_MAX); - orte_show_help("help-plm-base.txt", "file-not-found", true, files[i], - (NULL == app->preload_files_src_dir) ? cwd : app->preload_files_src_dir); - return ORTE_ERROR; - } - /* define the destination */ - dest = opal_os_path(false, dest_dir, files[i], NULL); - /* has this file already been positioned? */ - for (j=0; j < slave_node->files.size; j++) { - if (NULL != (filenm = opal_pointer_array_get_item(&slave_node->files, j)) && - 0 == strcmp(filenm, dest)) { - /* this app already has been positioned on the node - skip it */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:local:slave: file %s already positioned", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), filenm)); - goto SKIP; - } - } - /* add the file to the slave_node list */ - opal_pointer_array_add(&slave_node->files, strdup(dest)); - if (slave_node->local) { - /* form and execute the cp command */ - asprintf(&cmd, "%s %s %s", scp, exefile, dest); - system(cmd); - free(cmd); - } else { - /* form and execute the scp commands */ - asprintf(&cmd, "%s -q %s %s:%s", scp, exefile, nodename, dest); - system(cmd); - free(cmd); - } - SKIP: - free(exefile); - free(dest); - } - opal_argv_free(files); - free(scp); - } - - return ORTE_SUCCESS; -} - -int orte_plm_base_rsh_setup_launch(int *argcptr, char ***argvptr, - char *nodename, - int *node_name_index1, - int *proc_vpid_index, char *prefix_dir, - char *nodes) -{ - int argc; - char **argv; - char *param; - orte_plm_rsh_shell_t remote_shell, local_shell; - char *lib_base, *bin_base; - int orted_argc; - char **orted_argv; - char *orted_cmd, *orted_prefix, *final_cmd; - int orted_index; - int rc; - - - /* Figure out the basenames for the libdir and bindir. This - requires some explanation: - - - Use opal_install_dirs.libdir and opal_install_dirs.bindir. - - - After a discussion on the devel-core mailing list, the - developers decided that we should use the local directory - basenames as the basis for the prefix on the remote note. - This does not handle a few notable cases (e.g., if the - libdir/bindir is not simply a subdir under the prefix, if the - libdir/bindir basename is not the same on the remote node as - it is here on the local node, etc.), but we decided that - --prefix was meant to handle "the common case". If you need - something more complex than this, a) edit your shell startup - files to set PATH/LD_LIBRARY_PATH properly on the remove - node, or b) use some new/to-be-defined options that - explicitly allow setting the bindir/libdir on the remote - node. We decided to implement these options (e.g., - --remote-bindir and --remote-libdir) to orterun when it - actually becomes a problem for someone (vs. a hypothetical - situation). - - Hence, for now, we simply take the basename of this install's - libdir and bindir and use it to append this install's prefix - and use that on the remote node. - */ - - lib_base = opal_basename(opal_install_dirs.libdir); - bin_base = opal_basename(opal_install_dirs.bindir); - - /* - * Build argv array - */ - argv = opal_argv_copy(orte_plm_globals.rsh_agent_argv); - argc = opal_argv_count(orte_plm_globals.rsh_agent_argv); - *node_name_index1 = argc; - opal_argv_append(&argc, &argv, "