
As promised, rationalize the rsh support. Remove rshbase and the base rsh support, centralizing all rsh support into the rsh component. Remove the "slave" launch support as that experiment is complete. Fix tree spawn and make that the default method for rsh launch, turning it "off" for qrsh as that system does not support tree spawn.

This commit was SVN r25507.
This commit is contained in:
Ralph Castain 2011-11-26 02:33:05 +00:00
parent a841ee2ae7
commit b475421c16
44 changed files with 665 additions and 5186 deletions
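For reference, the tree-based spawn that this commit makes the default can still be turned off at run time. A minimal sketch, assuming the "no_tree_spawn" parameter registered in the rsh component below takes the plm_rsh MCA prefix, with a hypothetical application ./a.out:

# Select the rsh PLM and fall back to the direct (non-tree) launch path
mpirun --mca plm rsh --mca plm_rsh_no_tree_spawn 1 -np 64 ./a.out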

View file

@@ -4,6 +4,8 @@
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2011 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@@ -28,7 +30,6 @@ AC_DEFUN([ORTE_CONFIG_FILES],[
orte/tools/orte-ps/Makefile
orte/tools/orte-clean/Makefile
orte/tools/orte-top/Makefile
orte/tools/orte-bootproxy/Makefile
orte/tools/orte-migrate/Makefile
orte/tools/orte-info/Makefile
])

View file

@@ -1,12 +0,0 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
mca_link_libraries=libopen-rte

View file

@@ -1,45 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
sources = \
ess_slave.h \
ess_slave_component.c \
ess_slave_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_ess_slave_DSO
component_noinst =
component_install = mca_ess_slave.la
else
component_noinst = libmca_ess_slave.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_ess_slave_la_SOURCES = $(sources)
mca_ess_slave_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_ess_slave_la_SOURCES =$(sources)
libmca_ess_slave_la_LDFLAGS = -module -avoid-version

View file

@@ -1,21 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2011 Los Alamos National Security, LLC.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
AC_DEFUN([MCA_orte_ess_slave_PRIORITY], [10])
# MCA_ess_slave_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_ess_slave_CONFIG], [
AC_CONFIG_FILES([orte/mca/ess/slave/Makefile])
AS_IF([test "$orte_without_full_support" = 0],
[$1],
[$2])
])

View file

@@ -1,36 +0,0 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef ORTE_ESS_SLAVE_H
#define ORTE_ESS_SLAVE_H
BEGIN_C_DECLS
/*
* Module open / close
*/
int orte_ess_slave_component_open(void);
int orte_ess_slave_component_close(void);
int orte_ess_slave_component_query(mca_base_module_t **module, int *priority);
ORTE_MODULE_DECLSPEC extern orte_ess_base_component_t mca_ess_slave_component;
END_C_DECLS
#endif /* ORTE_ESS_SLAVE_H */

View file

@@ -1,85 +0,0 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/ess/slave/ess_slave.h"
extern orte_ess_base_module_t orte_ess_slave_module;
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
orte_ess_base_component_t mca_ess_slave_component = {
{
ORTE_ESS_BASE_VERSION_2_0_0,
/* Component name and version */
"slave",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
orte_ess_slave_component_open,
orte_ess_slave_component_close,
orte_ess_slave_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
int
orte_ess_slave_component_open(void)
{
return ORTE_SUCCESS;
}
int orte_ess_slave_component_query(mca_base_module_t **module, int *priority)
{
/* we are the slave module, so set the priority so
* we can only be selected if directed to do so
*/
*priority = 0;
*module = (mca_base_module_t *)&orte_ess_slave_module;
return ORTE_SUCCESS;
}
int
orte_ess_slave_component_close(void)
{
return ORTE_SUCCESS;
}

View file

@@ -1,534 +0,0 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <sys/types.h>
#include <stdio.h>
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "opal/mca/event/event.h"
#include "opal/runtime/opal.h"
#include "opal/mca/paffinity/paffinity.h"
#include "orte/util/show_help.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/output.h"
#include "opal/util/malloc.h"
#include "orte/mca/rml/base/base.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/routed/base/base.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/grpcomm/base/base.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/ess/base/base.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/ras/base/base.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/rmaps/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
#include "orte/mca/snapc/base/base.h"
#endif
#include "orte/mca/filem/base/base.h"
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/name_fns.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_cr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/ess/base/base.h"
#include "orte/mca/ess/slave/ess_slave.h"
static int slave_set_name(void);
static int rte_init(void);
static int rte_finalize(void);
static opal_paffinity_locality_t proc_get_locality(orte_process_name_t *proc);
static orte_vpid_t proc_get_daemon(orte_process_name_t *proc);
static char* proc_get_hostname(orte_process_name_t *proc);
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc);
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc);
static int update_pidmap(opal_byte_object_t *bo);
static int update_nidmap(opal_byte_object_t *bo);
#if OPAL_ENABLE_FT_CR == 1
static int rte_ft_event(int state);
static int ess_slave_ft_event_update_process_info(orte_process_name_t proc, pid_t pid);
#endif
orte_ess_base_module_t orte_ess_slave_module = {
rte_init,
rte_finalize,
orte_ess_base_app_abort,
proc_get_locality,
proc_get_daemon,
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
update_pidmap,
update_nidmap,
#if OPAL_ENABLE_FT_CR == 1
rte_ft_event
#else
NULL
#endif
};
static int rte_init(void)
{
int ret;
char *error = NULL;
/* run the prolog */
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
error = "orte_ess_base_std_prolog";
goto error;
}
/* Start by getting a unique name from the enviro */
slave_set_name();
/* use the default procedure to finish my setup */
if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
ORTE_ERROR_LOG(ret);
error = "orte_ess_base_app_setup";
goto error;
}
/* init my nidmap arrays - no data can be available, but
* we want to ensure that nobody else who looks at
* those arrays will segfault
*/
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_util_nidmap_init";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) {
ORTE_ERROR_LOG(ret);
return ret;
}
return ORTE_SUCCESS;
error:
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret);
return ret;
}
static int rte_finalize(void)
{
int ret;
/* use the default procedure to finish */
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
ORTE_ERROR_LOG(ret);
}
/* deconstruct the nidmap and jobmap arrays */
orte_util_nidmap_finalize();
return ret;
}
static opal_paffinity_locality_t proc_get_locality(orte_process_name_t *proc)
{
/* no proc can be local */
OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
"%s ess:slave: proc %s is REMOTE",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
return OPAL_PROC_NON_LOCAL;
}
static orte_vpid_t proc_get_daemon(orte_process_name_t *proc)
{
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
/* if it is me, the answer is my daemon's vpid */
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
return ORTE_PROC_MY_DAEMON->vpid;
}
/* otherwise, no idea */
return ORTE_VPID_INVALID;
}
static char* proc_get_hostname(orte_process_name_t *proc)
{
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
/* if it is me, the answer is my nodename */
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
return orte_process_info.nodename;
}
/* otherwise, no idea */
return NULL;
}
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
{
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
/* if it is me, the local rank is zero */
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
return 0;
}
/* otherwise, no idea */
return ORTE_LOCAL_RANK_INVALID;
}
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
/* if it is me, the node rank is zero */
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
return 0;
}
/* otherwise, no idea */
return ORTE_NODE_RANK_INVALID;
}
static int update_pidmap(opal_byte_object_t *bo)
{
return ORTE_SUCCESS;
}
static int update_nidmap(opal_byte_object_t *bo)
{
return ORTE_SUCCESS;
}
static int slave_set_name(void)
{
char *jobid_str, *procid_str;
int id, rc;
orte_jobid_t jobid;
orte_vpid_t vpid;
id = mca_base_param_register_string("orte", "ess", "jobid", NULL, NULL);
mca_base_param_lookup_string(id, &jobid_str);
if (NULL == jobid_str) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, jobid_str))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
free(jobid_str);
id = mca_base_param_register_string("orte", "ess", "vpid", NULL, NULL);
mca_base_param_lookup_string(id, &procid_str);
if (NULL == procid_str) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&vpid, procid_str))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
free(procid_str);
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:slave set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* get the non-name common environmental variables */
if (ORTE_SUCCESS != (rc = orte_ess_env_get())) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
}
#if OPAL_ENABLE_FT_CR == 1
static int rte_ft_event(int state)
{
int ret, exit_status = ORTE_SUCCESS;
orte_proc_type_t svtype;
/******** Checkpoint Prep ********/
if(OPAL_CRS_CHECKPOINT == state) {
/*
* Notify SnapC
*/
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify Routed
*/
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CHECKPOINT))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify RML -> OOB
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) {
exit_status = ret;
goto cleanup;
}
}
/******** Continue Recovery ********/
else if (OPAL_CRS_CONTINUE == state ) {
/*
* Notify RML -> OOB
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify Routed
*/
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CONTINUE))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify SnapC
*/
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CONTINUE))) {
exit_status = ret;
goto cleanup;
}
}
/******** Restart Recovery ********/
else if (OPAL_CRS_RESTART == state ) {
/*
* This should follow the ess init() function
*/
/*
* Clear nidmap and jmap
*/
orte_util_nidmap_finalize();
/*
* - Reset Contact information
*/
if( ORTE_SUCCESS != (ret = slave_set_name() ) ) {
exit_status = ret;
}
/*
* Notify RML -> OOB
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}
/*
* Restart the routed framework
* JJH: Lie to the finalize function so it does not try to contact the daemon.
*/
svtype = orte_process_info.proc_type;
orte_process_info.proc_type = ORTE_PROC_TOOL;
if (ORTE_SUCCESS != (ret = orte_routed.finalize()) ) {
exit_status = ret;
goto cleanup;
}
orte_process_info.proc_type = svtype;
if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) {
exit_status = ret;
goto cleanup;
}
/*
* Group Comm - Clean out stale data
*/
orte_grpcomm.finalize();
if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) {
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = orte_grpcomm.purge_proc_attrs())) {
exit_status = ret;
goto cleanup;
}
/*
* Restart the PLM - Does nothing at the moment, but included for completeness
*/
if (ORTE_SUCCESS != (ret = orte_plm.finalize())) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = orte_plm.init())) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
/*
* RML - Enable communications
*/
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
exit_status = ret;
goto cleanup;
}
/*
* Session directory re-init
*/
if (orte_create_session_dirs) {
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
orte_process_info.tmpdir_base,
orte_process_info.nodename,
NULL, /* Batch ID -- Not used */
ORTE_PROC_MY_NAME))) {
exit_status = ret;
}
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
"output-", NULL, NULL);
}
/*
* Notify Routed
*/
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify SnapC
*/
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}
/*
* Send new PID to HNP/daemon
* The checkpointer could have used a proxy program to boot us
* so the pid that the orted got from fork() may not be the
* PID of this application.
* - Note: BLCR does this because it tries to preseve the PID
* of the program across checkpointes
*/
if( ORTE_SUCCESS != (ret = ess_slave_ft_event_update_process_info(orte_process_info.my_name, getpid())) ) {
exit_status = ret;
goto cleanup;
}
/* if one was provided, build my nidmap */
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
}
else if (OPAL_CRS_TERM == state ) {
/* Nothing */
}
else {
/* Error state = Nothing */
}
cleanup:
return exit_status;
}
static int ess_slave_ft_event_update_process_info(orte_process_name_t proc, pid_t proc_pid)
{
int ret, exit_status = ORTE_SUCCESS;
opal_buffer_t buffer;
orte_snapc_cmd_flag_t command = ORTE_SNAPC_LOCAL_UPDATE_CMD;
OBJ_CONSTRUCT(&buffer, opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &command, 1, ORTE_SNAPC_CMD )) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &proc, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &proc_pid, 1, OPAL_PID))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &buffer, ORTE_RML_TAG_SNAPC, 0))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
cleanup:
OBJ_DESTRUCT(&buffer);
return exit_status;
}
#endif

View file

@@ -28,8 +28,7 @@ if !ORTE_DISABLE_FULL_SUPPORT
dist_pkgdata_DATA += base/help-plm-base.txt
headers += \
base/plm_private.h \
base/plm_base_rsh_support.h
base/plm_private.h
libmca_plm_la_SOURCES += \
base/plm_base_close.c \
@@ -38,6 +37,5 @@ libmca_plm_la_SOURCES += \
base/plm_base_launch_support.c \
base/plm_base_jobid.c \
base/plm_base_proxy.c \
base/plm_base_orted_cmds.c \
base/plm_base_rsh_support.c
base/plm_base_orted_cmds.c
endif

View file

@@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -30,7 +32,6 @@
#include "orte/mca/plm/base/base.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/base/plm_base_rsh_support.h"
int orte_plm_base_finalize(void)
{
@@ -66,21 +67,6 @@ int orte_plm_base_close(void)
OBJ_DESTRUCT(&orte_plm_globals.spawn_lock);
OBJ_DESTRUCT(&orte_plm_globals.spawn_cond);
#ifndef __WINDOWS__
/* clearout the rsh support */
orte_plm_base_local_slave_finalize();
#endif
/* remove the rsh agent info */
if (NULL != orte_plm_globals.rsh_agent_argv) {
opal_argv_free(orte_plm_globals.rsh_agent_argv);
}
if (NULL != orte_plm_globals.rsh_agent_path) {
free(orte_plm_globals.rsh_agent_path);
}
OBJ_DESTRUCT(&orte_plm_globals.slave_files);
/* Close all open components */
mca_base_components_close(orte_plm_globals.output,
&orte_plm_base.available_components, NULL);

View file

@@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -55,44 +57,6 @@ int orte_plm_base_open(void)
#else
static void slave_file_construct(orte_slave_files_t *ptr)
{
ptr->node = NULL;
ptr->local = false;
ptr->prefix = NULL;
ptr->bootproxy = NULL;
ptr->positioned = false;
OBJ_CONSTRUCT(&ptr->apps, opal_pointer_array_t);
opal_pointer_array_init(&ptr->apps, 8, 1024, 8);
OBJ_CONSTRUCT(&ptr->files, opal_pointer_array_t);
opal_pointer_array_init(&ptr->files, 8, 1024, 8);
}
static void slave_file_destruct(orte_slave_files_t *ptr)
{
int i;
char *cptr;
if (NULL != ptr->node) free(ptr->node);
if (NULL != ptr->prefix) free(ptr->prefix);
if (NULL != ptr->bootproxy) free(ptr->bootproxy);
for (i=0; i < ptr->apps.size; i++) {
if (NULL != (cptr = (char*)opal_pointer_array_get_item(&ptr->apps, i))) {
free(cptr);
}
}
OBJ_DESTRUCT(&ptr->apps);
for (i=0; i < ptr->files.size; i++) {
if (NULL != (cptr = (char*)opal_pointer_array_get_item(&ptr->files, i))) {
free(cptr);
}
}
OBJ_DESTRUCT(&ptr->files);
}
OBJ_CLASS_INSTANCE(orte_slave_files_t,
opal_list_item_t,
slave_file_construct,
slave_file_destruct);
/*
* Global public variables
*/
@@ -146,12 +110,6 @@ int orte_plm_base_open(void)
/* init the next jobid */
orte_plm_globals.next_jobid = 1;
/* init the rsh support */
orte_plm_globals.rsh_agent_argv = NULL;
orte_plm_globals.rsh_agent_path = NULL;
orte_plm_globals.local_slaves = 0;
OBJ_CONSTRUCT(&orte_plm_globals.slave_files, opal_list_t);
/* Open up all the components that we can find */
if (ORTE_SUCCESS !=

The diff for this file is not shown because it is too large.

View file

@@ -1,89 +0,0 @@
/*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_PLM_RSH_SUPPORT_H
#define MCA_PLM_RSH_SUPPORT_H
/*
* includes
*/
#include "orte_config.h"
#include "orte/types.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/class/opal_list.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/threads/condition.h"
#include "opal/dss/dss_types.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/runtime/orte_globals.h"
BEGIN_C_DECLS
typedef enum {
ORTE_PLM_RSH_SHELL_BASH = 0,
ORTE_PLM_RSH_SHELL_ZSH,
ORTE_PLM_RSH_SHELL_TCSH,
ORTE_PLM_RSH_SHELL_CSH,
ORTE_PLM_RSH_SHELL_KSH,
ORTE_PLM_RSH_SHELL_SH,
ORTE_PLM_RSH_SHELL_UNKNOWN
} orte_plm_rsh_shell_t;
ORTE_DECLSPEC extern const char *orte_plm_rsh_shell_name[7];
/* rsh launch support */
ORTE_DECLSPEC int orte_plm_base_rsh_launch_agent_setup(const char *agent_list, char *path);
ORTE_DECLSPEC int orte_plm_base_rsh_launch_agent_lookup(const char *agent_list, char *path);
ORTE_DECLSPEC int orte_plm_base_rsh_shell_probe(char *nodename, orte_plm_rsh_shell_t *shell);
ORTE_DECLSPEC int orte_plm_base_rsh_setup_shell(orte_plm_rsh_shell_t *rshell,
orte_plm_rsh_shell_t *lshell,
char *nodename, char ***argv);
ORTE_DECLSPEC int orte_plm_base_rsh_setup_launch(int *argcptr, char ***argvptr,
char *nodename,
int *node_name_index1,
int *proc_vpid_index, char *prefix_dir,
char *nodes);
ORTE_DECLSPEC void orte_plm_base_ssh_child(int argc, char **argv,
orte_vpid_t vpid, int proc_vpid_index);
/**
* Local slave launch
*/
ORTE_DECLSPEC int orte_plm_base_local_slave_launch(orte_job_t *jdata);
ORTE_DECLSPEC void orte_plm_base_local_slave_finalize(void);
ORTE_DECLSPEC int orte_plm_base_setup_slave_launch(char *nodename, orte_app_context_t *app,
char *rcmd, char ***argv, char **exec_path);
ORTE_DECLSPEC int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
orte_jobid_t jobid, orte_vpid_t vpid,
int num_nodes, orte_vpid_t num_procs,
orte_node_rank_t nrank, orte_local_rank_t lrank,
orte_vpid_t nlocal, int nslots, bool overwrite);
END_C_DECLS
#endif /* MCA_PLM_RSH_SUPPORT_H */

View file

@@ -33,6 +33,7 @@
#include "opal/class/opal_list.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/dss/dss_types.h"
#include "opal/threads/condition.h"
#include "opal/dss/dss_types.h"
@@ -44,19 +45,6 @@
BEGIN_C_DECLS
/* types for use solely within PLM framework */
typedef struct {
opal_list_item_t super;
char *node;
bool local;
char *prefix;
char *bootproxy;
bool positioned;
opal_pointer_array_t apps;
opal_pointer_array_t files;
} orte_slave_files_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_slave_files_t);
/* globals for use solely within PLM framework */
typedef struct {
/** Verbose/debug output stream */
@@ -69,14 +57,6 @@ typedef struct {
uint16_t next_jobid;
/* time when daemons started launch */
struct timeval daemonlaunchstart;
/* rsh launch agent path */
char *rsh_agent_path;
/* rsh launch agent argv */
char **rsh_agent_argv;
/* jobid for local slaves */
orte_jobid_t local_slaves;
/* list of local slave files */
opal_list_t slave_files;
/* spawn lock */
opal_mutex_t spawn_lock;
/* spawn cond */
@@ -89,6 +69,8 @@ typedef struct {
opal_condition_t spawn_in_progress_cond;
/* flag */
bool spawn_in_progress;
/* tree spawn cmd */
opal_buffer_t tree_spawn_cmd;
} orte_plm_globals_t;
/**
* Global instance of PLM framework data

View file

@@ -38,26 +38,6 @@
BEGIN_C_DECLS
/*
* Module open / close
*/
int orte_plm_rsh_component_open(void);
int orte_plm_rsh_component_close(void);
int orte_plm_rsh_component_query(mca_base_module_t **module, int *priority);
/*
* Startup / Shutdown
*/
int orte_plm_rsh_finalize(void);
/*
* Interface
*/
int orte_plm_rsh_init(void);
int orte_plm_rsh_launch(orte_job_t *jdata);
int orte_plm_rsh_terminate_orteds(void);
int orte_plm_rsh_signal_job(orte_jobid_t, int32_t);
/**
* PLS Component
*/
@@ -73,17 +53,19 @@ struct orte_plm_rsh_component_t {
int delay;
int priority;
bool tree_spawn;
opal_list_t children;
orte_std_cntr_t num_children;
orte_std_cntr_t num_concurrent;
size_t num_concurrent;
opal_mutex_t lock;
opal_condition_t cond;
char *agent;
bool assume_same_shell;
};
typedef struct orte_plm_rsh_component_t orte_plm_rsh_component_t;
ORTE_MODULE_DECLSPEC extern orte_plm_rsh_component_t mca_plm_rsh_component;
extern orte_plm_base_module_t orte_plm_rsh_module;
ORTE_MODULE_DECLSPEC char **orte_plm_rsh_search(const char* agent_list, const char *path);
END_C_DECLS
#endif /* ORTE_PLS_RSH_EXPORT_H */

View file

@@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* Copyright (c) 2007-2011 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights
@@ -50,7 +50,6 @@
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/base/plm_base_rsh_support.h"
#include "orte/mca/plm/rsh/plm_rsh.h"
@@ -61,6 +60,11 @@ const char *mca_plm_rsh_component_version_string =
"Open MPI rsh plm MCA component version " ORTE_VERSION;
static int rsh_component_open(void);
static int rsh_component_query(mca_base_module_t **module, int *priority);
static int rsh_component_close(void);
static int rsh_launch_agent_lookup(const char *agent_list, char *path);
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
@@ -81,9 +85,9 @@ orte_plm_rsh_component_t mca_plm_rsh_component = {
ORTE_RELEASE_VERSION,
/* Component open and close functions */
orte_plm_rsh_component_open,
orte_plm_rsh_component_close,
orte_plm_rsh_component_query
rsh_component_open,
rsh_component_close,
rsh_component_query
},
{
/* The component is checkpoint ready */
@@ -94,16 +98,14 @@ orte_plm_rsh_component_t mca_plm_rsh_component = {
int orte_plm_rsh_component_open(void)
static int rsh_component_open(void)
{
int tmp;
int tmp, value;
mca_base_component_t *c = &mca_plm_rsh_component.super.base_version;
/* initialize globals */
OBJ_CONSTRUCT(&mca_plm_rsh_component.lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_plm_rsh_component.cond, opal_condition_t);
mca_plm_rsh_component.num_children = 0;
OBJ_CONSTRUCT(&mca_plm_rsh_component.children, opal_list_t);
mca_plm_rsh_component.using_qrsh = false;
mca_plm_rsh_component.using_llspawn = false;
@@ -150,19 +152,36 @@ int orte_plm_rsh_component_open(void)
"Delay (in seconds) between invocations of the remote agent, but only used when the \"debug\" MCA parameter is true, or the top-level MCA debugging is enabled (otherwise this value is ignored)",
false, false, 1,
&mca_plm_rsh_component.delay);
#if 0
/* NEEDS TO BE FIXED */
mca_base_param_reg_int(c, "tree_spawn",
"If set to 1, launch via a tree-based topology",
false, false, (int)false, &tmp);
mca_plm_rsh_component.tree_spawn = OPAL_INT_TO_BOOL(tmp);
#endif
mca_plm_rsh_component.tree_spawn = false;
mca_base_param_reg_int(c, "no_tree_spawn",
"If set to 1, do not launch via a tree-based topology",
false, false, 0, &tmp);
if (0 == tmp) {
mca_plm_rsh_component.tree_spawn = true;
} else {
mca_plm_rsh_component.tree_spawn = false;
}
/* local rsh/ssh launch agent */
tmp = mca_base_param_reg_string(c, "agent",
"The command used to launch executables on remote nodes (typically either \"ssh\" or \"rsh\")",
false, false, "ssh : rsh", NULL);
mca_base_param_reg_syn_name(tmp, "pls", "rsh_agent", true);
mca_base_param_reg_syn_name(tmp, "orte", "rsh_agent", true);
mca_base_param_lookup_string(tmp, &mca_plm_rsh_component.agent);
tmp = mca_base_param_reg_int_name("orte", "assume_same_shell",
"If set to 1, assume that the shell on the remote node is the same as the shell on the local node. Otherwise, probe for what the remote shell [default: 1]",
false, false, 1, NULL);
mca_base_param_reg_syn_name(tmp, "plm", "rsh_assume_same_shell", true);
mca_base_param_lookup_int(tmp, &value);
mca_plm_rsh_component.assume_same_shell = OPAL_INT_TO_BOOL(value);
return ORTE_SUCCESS;
}
int orte_plm_rsh_component_query(mca_base_module_t **module, int *priority)
static int rsh_component_query(mca_base_module_t **module, int *priority)
{
char *tmp;
@@ -174,7 +193,7 @@ int orte_plm_rsh_component_query(mca_base_module_t **module, int *priority)
/* setup the search path for qrsh */
asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC"));
/* see if the agent is available */
if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup("qrsh", tmp)) {
if (ORTE_SUCCESS != rsh_launch_agent_lookup("qrsh", tmp)) {
/* can't be SGE */
opal_output_verbose(1, orte_plm_globals.output,
"%s plm:rsh: unable to be used: SGE indicated but cannot find path "
@@ -186,12 +205,14 @@ int orte_plm_rsh_component_query(mca_base_module_t **module, int *priority)
}
free(tmp);
mca_plm_rsh_component.using_qrsh = true;
/* no tree spawn allowed under qrsh */
mca_plm_rsh_component.tree_spawn = false;
goto success;
} else if (!mca_plm_rsh_component.disable_llspawn &&
NULL != getenv("LOADL_STEP_ID")) {
/* We are running as a LOADLEVELER job.
Search for llspawn in the users PATH */
if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup("llspawn", NULL)) {
if (ORTE_SUCCESS != rsh_launch_agent_lookup("llspawn", NULL)) {
opal_output_verbose(1, orte_plm_globals.output,
"%s plm:rsh: unable to be used: LoadLeveler "
"indicated but cannot find path or execution "
@@ -207,13 +228,13 @@ int orte_plm_rsh_component_query(mca_base_module_t **module, int *priority)
/* if this isn't an Grid Engine or LoadLeveler environment,
see if MCA-specified agent (default: ssh:rsh) is available */
if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup(NULL, NULL)) {
if (ORTE_SUCCESS != rsh_launch_agent_lookup(NULL, NULL)) {
/* this isn't an error - we just cannot be selected */
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rsh: unable to be used: cannot find path "
"for launching agent \"%s\"\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_rsh_agent));
mca_plm_rsh_component.agent));
*module = NULL;
return ORTE_ERROR;
}
@@ -225,12 +246,87 @@ success:
}
int orte_plm_rsh_component_close(void)
static int rsh_component_close(void)
{
/* cleanup state */
OBJ_DESTRUCT(&mca_plm_rsh_component.lock);
OBJ_DESTRUCT(&mca_plm_rsh_component.cond);
OBJ_DESTRUCT(&mca_plm_rsh_component.children);
return ORTE_SUCCESS;
}
/*
* Take a colon-delimited list of agents and locate the first one that
* we are able to find in the PATH. Split that one into argv and
* return it. If nothing found, then return NULL.
*/
char **orte_plm_rsh_search(const char* agent_list, const char *path)
{
int i, j;
char *line, **lines;
char **tokens, *tmp;
char cwd[OPAL_PATH_MAX];
if (NULL == path) {
getcwd(cwd, OPAL_PATH_MAX);
} else {
strncpy(cwd, path, OPAL_PATH_MAX);
}
if (NULL == agent_list) {
lines = opal_argv_split(mca_plm_rsh_component.agent, ':');
} else {
lines = opal_argv_split(agent_list, ':');
}
for (i = 0; NULL != lines[i]; ++i) {
line = lines[i];
/* Trim whitespace at the beginning and end of the line */
for (j = 0; '\0' != line[j] && isspace(line[j]); ++line) {
continue;
}
for (j = strlen(line) - 2; j > 0 && isspace(line[j]); ++j) {
line[j] = '\0';
}
if (strlen(line) <= 0) {
continue;
}
/* Split it */
tokens = opal_argv_split(line, ' ');
/* Look for the first token in the PATH */
tmp = opal_path_findv(tokens[0], X_OK, environ, cwd);
if (NULL != tmp) {
free(tokens[0]);
tokens[0] = tmp;
opal_argv_free(lines);
return tokens;
}
/* Didn't find it */
opal_argv_free(tokens);
}
/* Doh -- didn't find anything */
opal_argv_free(lines);
return NULL;
}
static int rsh_launch_agent_lookup(const char *agent_list, char *path)
{
char **tmp;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:rsh_lookup on agent %s path %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == agent_list) ? mca_plm_rsh_component.agent : agent_list,
(NULL == path) ? "NULL" : path));
if (NULL == (tmp = orte_plm_rsh_search(agent_list, path))) {
return ORTE_ERR_NOT_FOUND;
}
/* if we got here, then one of the given agents could be found */
opal_argv_free(tmp);
return ORTE_SUCCESS;
}

The diff for this file is not shown because it is too large.

View file

@@ -1,46 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-plm-rshbase.txt
sources = \
plm_rshbase.h \
plm_rshbase_component.c \
plm_rshbase_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_plm_rshbase_DSO
component_noinst =
component_install = mca_plm_rshbase.la
else
component_noinst = libmca_plm_rshbase.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_plm_rshbase_la_SOURCES = $(sources)
mca_plm_rshbase_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_plm_rshbase_la_SOURCES =$(sources)
libmca_plm_rshbase_la_LDFLAGS = -module -avoid-version

View file

@@ -1,31 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2011 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_plm_rshbase_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_plm_rshbase_CONFIG],[
AC_CONFIG_FILES([orte/mca/plm/rshbase/Makefile])
AC_CHECK_FUNC([fork], [plm_rshbase_happy="yes"], [plm_rshbase_happy="no"])
AS_IF([test "$plm_rshbase_happy" = "yes" -a "$orte_without_full_support" = 0], [$1], [$2])
])dnl

View file

@@ -1,77 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open RTE's orterun.
#
[no-local-orted]
The rsh PLS component was not able to find the executable "orted" in
your PATH or in the directory where Open MPI/OpenRTE was initially installed,
and therefore cannot continue.
For reference, your current PATH is:
%s
We also looked for orted in the following directory:
%s
[multiple-prefixes]
Specified multiple application contexts using different
settings for --prefix. Care should be taken, that corresponding
processes are mapped to different nodes. Having multiple prefixes
per node is not allowed.
The previously set prefix was
%s
the prefix to be set overriding:
%s
[concurrency-less-than-zero]
The value of the MCA parameter "pls_rsh_num_concurrent" is less than
or equal to zero (%d). This parameter is used to determine how many
remote agents (typically rsh or ssh) to invoke concurrently while
launching parallel jobs.
This value has automatically be reset to 1; processing will continue.
[deadlock-params]
The rsh launcher has been given a number of %d concurrent daemons to
launch and is in a debug-daemons option. However, the total number of
daemons to launch (%d) is greater than this value. This is a scenario that
will cause the system to deadlock.
To avoid deadlock, either increase the number of concurrent daemons, or
remove the debug-daemons flag.
[unknown-user]
The user (%d) is unknown to the system (i.e. there is no corresponding
entry in the password file). Please contact your system administrator
for a fix.
#
[cannot-resolve-shell-with-prefix]
The rsh launcher has been given a prefix to use, but could not determine
the type of remote shell being used on the remote node. This is a fatal
error as we cannot determine how to construct the cmd line to set your
remote LD_LIBRARY_PATH and PATH environmental variables.
The prefix we were given are:
opal_prefix: %s
prefix_dir: %s

View file

@@ -1,58 +0,0 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file:
* Part of the rshbase launcher. See plm_rshbase.h for an overview of how it works.
*/
#ifndef ORTE_PLM_RSHBASE_EXPORT_H
#define ORTE_PLM_RSHBASE_EXPORT_H
#include "orte_config.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include "opal/threads/condition.h"
#include "opal/mca/mca.h"
#include "orte/mca/plm/plm.h"
BEGIN_C_DECLS
/**
* PLS Component
*/
struct orte_plm_rshbase_component_t {
orte_plm_base_component_t super;
bool force_rsh;
int priority;
orte_std_cntr_t num_concurrent;
opal_mutex_t lock;
opal_condition_t cond;
};
typedef struct orte_plm_rshbase_component_t orte_plm_rshbase_component_t;
ORTE_MODULE_DECLSPEC extern orte_plm_rshbase_component_t mca_plm_rshbase_component;
extern orte_plm_base_module_t orte_plm_rshbase_module;
END_C_DECLS
#endif /* ORTE_PLS_RSHBASE_EXPORT_H */

View file

@@ -1,162 +0,0 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights
* reserved.
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <ctype.h>
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/path.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/base/plm_base_rsh_support.h"
#include "orte/mca/plm/rshbase/plm_rshbase.h"
/*
* Public string showing the plm ompi_rshbase component version number
*/
const char *mca_plm_rshbase_component_version_string =
"Open MPI rshbase plm MCA component version " ORTE_VERSION;
static int rshbase_component_open(void);
static int rshbase_component_query(mca_base_module_t **module, int *priority);
static int rshbase_component_close(void);
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
orte_plm_rshbase_component_t mca_plm_rshbase_component = {
{
/* First, the mca_component_t struct containing meta information
about the component itself */
{
ORTE_PLM_BASE_VERSION_2_0_0,
/* Component name and version */
"rshbase",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
rshbase_component_open,
rshbase_component_close,
rshbase_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
}
};
static int rshbase_component_open(void)
{
int tmp;
mca_base_component_t *c = &mca_plm_rshbase_component.super.base_version;
/* initialize globals */
OBJ_CONSTRUCT(&mca_plm_rshbase_component.lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_plm_rshbase_component.cond, opal_condition_t);
/* lookup parameters */
mca_base_param_reg_int(c, "num_concurrent",
"How many plm_rsh_agent instances to invoke concurrently (must be > 0)",
false, false, 128, &tmp);
if (tmp <= 0) {
orte_show_help("help-plm-rshbase.txt", "concurrency-less-than-zero",
true, tmp);
tmp = 1;
}
mca_plm_rshbase_component.num_concurrent = tmp;
mca_base_param_reg_int(c, "force_rsh",
"Force the launcher to always use rsh",
false, false, false, &tmp);
mca_plm_rshbase_component.force_rsh = OPAL_INT_TO_BOOL(tmp);
mca_base_param_reg_int(c, "priority",
"Priority of the rshbase plm component",
false, false, 5,
&mca_plm_rshbase_component.priority);
return ORTE_SUCCESS;
}
static int rshbase_component_query(mca_base_module_t **module, int *priority)
{
/* see if MCA-specified agent (default: ssh:rsh) is available */
if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup(NULL, NULL)) {
/* this isn't an error - we just cannot be selected */
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rshbase: unable to be used: cannot find path "
"for launching agent \"%s\"\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_rsh_agent));
*module = NULL;
return ORTE_ERROR;
}
/* we are good - make ourselves available */
*priority = mca_plm_rshbase_component.priority;
*module = (mca_base_module_t *) &orte_plm_rshbase_module;
return ORTE_SUCCESS;
}
static int rshbase_component_close(void)
{
/* cleanup state */
OBJ_DESTRUCT(&mca_plm_rshbase_component.lock);
OBJ_DESTRUCT(&mca_plm_rshbase_component.cond);
return ORTE_SUCCESS;
}

View file

@@ -1,576 +0,0 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <errno.h>
#include <string.h>
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif
#ifdef HAVE_SYS_SELECT_H
#include <sys/select.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#include <fcntl.h>
#include <signal.h>
#ifdef HAVE_PWD_H
#include <pwd.h>
#endif
#include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/output.h"
#include "opal/mca/event/event.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/util/basename.h"
#include "opal/util/bit_ops.h"
#include "opal/util/if.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/util/nidmap.h"
#include "orte/util/proc_info.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/ess/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/base/plm_base_rsh_support.h"
#include "orte/mca/plm/rshbase/plm_rshbase.h"
static int init(void);
static int spawn(orte_job_t *jdata);
static int terminate_orteds(void);
static int signal_job(orte_jobid_t jobid, int32_t signal);
static int finalize(void);
orte_plm_base_module_t orte_plm_rshbase_module = {
init,
orte_plm_base_set_hnp_name,
spawn,
NULL,
orte_plm_base_orted_terminate_job,
terminate_orteds,
orte_plm_base_orted_kill_local_procs,
signal_job,
finalize
};
/* local global storage of timing variables */
static struct timeval joblaunchstart, joblaunchstop;
/* local global storage */
static int num_in_progress=0;
/**
* Init the module
*/
static int init(void)
{
int rc;
/* we were selected, so setup the launch agent */
if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup(NULL, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/**
* Callback on daemon exit.
*/
static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
{
orte_std_cntr_t cnt=1;
uint8_t flag;
orte_job_t *jdata;
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { /* if abnormal exit */
/* if we are not the HNP, send a message to the HNP alerting it
* to the failure
*/
if (!ORTE_PROC_IS_HNP) {
opal_buffer_t buf;
orte_vpid_t *vpid=(orte_vpid_t*)cbdata;
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s daemon %d failed with status %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int)*vpid, WEXITSTATUS(status)));
OBJ_CONSTRUCT(&buf, opal_buffer_t);
opal_dss.pack(&buf, &cnt, 1, ORTE_STD_CNTR);
flag = 1;
opal_dss.pack(&buf, &flag, 1, OPAL_UINT8);
opal_dss.pack(&buf, vpid, 1, ORTE_VPID);
orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0);
OBJ_DESTRUCT(&buf);
} else {
orte_proc_t *daemon=(orte_proc_t*)cbdata;
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s daemon %d failed with status %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int)daemon->name.vpid, WEXITSTATUS(status)));
/* note that this daemon failed */
daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
/* increment the #daemons terminated so we will exit properly */
jdata->num_terminated++;
/* report that the daemon has failed so we can exit */
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF, pid, status);
}
}
/* release any waiting threads */
num_in_progress--;
OPAL_THREAD_LOCK(&mca_plm_rshbase_component.lock);
if (num_in_progress <= mca_plm_rshbase_component.num_concurrent) {
opal_condition_signal(&mca_plm_rshbase_component.cond);
}
OPAL_THREAD_UNLOCK(&mca_plm_rshbase_component.lock);
}
/**
* Launch a daemon (bootproxy) on each node. The daemon will be responsible
* for launching the application.
*/
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
*/
static int spawn(orte_job_t *jdata)
{
int rc;
orte_job_map_t *map;
orte_app_context_t *app;
orte_node_t *node;
int nnode;
int argc;
char **argv=NULL, **nodes=NULL, *nodelist=NULL;
char *prefix_dir;
int node_name_index1;
int proc_vpid_index;
pid_t pid;
bool failed_launch = true;
orte_jobid_t active_job, failed_job;
/* wait for the launch to complete */
OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock);
while (orte_plm_globals.spawn_in_progress) {
opal_condition_wait(&orte_plm_globals.spawn_in_progress_cond, &orte_plm_globals.spawn_lock);
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "released to spawn"));
orte_plm_globals.spawn_in_progress = true;
orte_plm_globals.spawn_status = ORTE_ERR_FATAL;
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
/* if this is a request to launch a local slave,
* then we will not be launching an orted - we will
* directly ssh the slave process itself. No mapping
* is performed to support this - the caller must
* provide all the info required to launch the job,
* including the target hosts
*/
rc = orte_plm_base_local_slave_launch(jdata);
OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock);
orte_plm_globals.spawn_in_progress = false;
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
return rc;
}
/* default to declaring the daemon launch as having failed */
failed_job = ORTE_PROC_MY_NAME->jobid;
/* if we are timing, record the start time */
if (orte_timing) {
gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL);
joblaunchstart = orte_plm_globals.daemonlaunchstart;
}
/* setup the job */
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rshbase: launching job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
/* set the active jobid */
active_job = jdata->jobid;
/* Get the map for this job */
if (NULL == (map = orte_rmaps.get_job_map(jdata->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
if (0 == map->num_new_daemons) {
/* have all the daemons we need - launch app */
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rshbase: no new daemons to launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
goto launch_apps;
}
if ((0 < opal_output_get_verbosity(orte_plm_globals.output) ||
orte_leave_session_attached) &&
mca_plm_rshbase_component.num_concurrent < map->num_new_daemons) {
/**
* If we are in '--debug-daemons' we keep the ssh connection
* alive for the span of the run. If we use this option
* AND we launch on more than "num_concurrent" machines
* then we will deadlock. No connections are terminated
* until the job is complete, no job is started
* since all the orteds are waiting for all the others
* to come online, and the others ore not launched because
* we are waiting on those that have started to terminate
* their ssh tunnels. :(
* As we cannot run in this situation, pretty print the error
* and return an error code.
*/
orte_show_help("help-plm-rsh.txt", "deadlock-params",
true, mca_plm_rshbase_component.num_concurrent, map->num_new_daemons);
rc = ORTE_ERR_FATAL;
goto cleanup;
}
/*
* After a discussion between Ralph & Jeff, we concluded that we
* really are handling the prefix dir option incorrectly. It currently
* is associated with an app_context, yet it really refers to the
* location where OpenRTE/Open MPI is installed on a NODE. Fixing
* this right now would involve significant change to orterun as well
* as elsewhere, so we will intentionally leave this incorrect at this
* point. The error, however, is identical to that seen in all prior
* releases of OpenRTE/Open MPI, so our behavior is no worse than before.
*
* A note to fix this, along with ideas on how to do so, has been filed
* on the project's Trac system under "feature enhancement".
*
* For now, default to the prefix_dir provided in the first app_context.
* Since there always MUST be at least one app_context, we are safe in
* doing this.
*/
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
/* if we are using static ports, then setup a string showing the
* nodes so we can use a regex to pass connection info
*/
if (orte_static_ports) {
nodelist = NULL;
for (nnode=0; nnode < map->nodes->size; nnode++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
continue;
}
/* if this is me, then don't include it - I'm already present
* in the cmd line options
*/
if (0 == strcmp(node->name, orte_process_info.nodename) || opal_ifislocal(node->name)) {
continue;
}
opal_argv_append_nosize(&nodes, node->name);
}
if (0 < opal_argv_count(nodes)) {
nodelist = opal_argv_join(nodes, ',');
}
opal_argv_free(nodes);
}
/* we also need at least one node name so we can check what shell is
* being used, if we have to
*/
node = NULL;
for (nnode = 0; nnode < map->nodes->size; nnode++) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
break;
}
}
if (NULL == node) {
/* well, if there isn't even one node in the map, then we are hammered */
rc = ORTE_ERR_FATAL;
goto cleanup;
}
prefix_dir = app->prefix_dir;
/* setup the launch */
if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_setup_launch(&argc, &argv, node->name, &node_name_index1,
&proc_vpid_index, prefix_dir, nodelist))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
if (NULL != nodelist) {
free(nodelist);
nodelist = NULL;
}
}
if (NULL != nodelist) {
free(nodelist);
nodelist = NULL;
}
/* set the active jobid */
active_job = jdata->jobid;
/*
* Iterate through each of the nodes
*/
for (nnode=0; nnode < map->nodes->size; nnode++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
continue;
}
/* if this daemon already exists, don't launch it! */
if (node->daemon_launched) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rshbase:launch daemon already exists on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name));
continue;
}
/* if the node's daemon has not been defined, then we
* have an error!
*/
if (NULL == node->daemon) {
ORTE_ERROR_LOG(ORTE_ERR_FATAL);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rshbase:launch daemon failed to be defined on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name));
rc = ORTE_ERR_FATAL;
goto cleanup;
}
/* setup node name */
free(argv[node_name_index1]);
if (NULL != node->username &&
0 != strlen (node->username)) {
asprintf (&argv[node_name_index1], "%s@%s",
node->username, node->name);
} else {
argv[node_name_index1] = strdup(node->name);
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rshbase: launching on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name));
/* fork a child to exec the rsh/ssh session */
pid = fork();
if (pid < 0) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
rc = ORTE_ERR_SYS_LIMITS_CHILDREN;
goto cleanup;
}
/* child */
if (pid == 0) {
/* do the ssh launch - this will exit if it fails */
orte_plm_base_ssh_child(argc, argv, node->daemon->name.vpid, proc_vpid_index);
        } else {  /* parent */
/* indicate this daemon has been launched */
node->daemon->state = ORTE_PROC_STATE_LAUNCHED;
/* record the pid */
node->daemon->pid = pid;
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rshbase: recording launch of daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&node->daemon->name)));
/* setup callback on sigchild - wait until setup above is complete
* as the callback can occur in the call to orte_wait_cb
*/
orte_wait_cb(pid, orte_plm_rsh_wait_daemon, (void*)node->daemon);
OPAL_THREAD_LOCK(&mca_plm_rshbase_component.lock);
/* This situation can lead to a deadlock if '--debug-daemons' is set.
* However, the deadlock condition is tested at the beginning of this
* function, so we're quite confident it should not happen here.
*/
if (num_in_progress++ >= mca_plm_rshbase_component.num_concurrent) {
opal_condition_wait(&mca_plm_rshbase_component.cond, &mca_plm_rshbase_component.lock);
}
OPAL_THREAD_UNLOCK(&mca_plm_rshbase_component.lock);
}
}
/* wait for daemons to callback */
if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rshbase: daemon launch failed for job %s on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
goto cleanup;
}
launch_apps:
/* if we get here, then the daemons succeeded, so any failure would now be
* for the application job
*/
failed_job = active_job;
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rshbase: launch of apps failed for job %s on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
goto cleanup;
}
/* wait for the launch to complete */
OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock);
while (!orte_plm_globals.spawn_complete) {
opal_condition_wait(&orte_plm_globals.spawn_cond, &orte_plm_globals.spawn_lock);
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"completed spawn for job %s", ORTE_JOBID_PRINT(jdata->jobid)));
orte_plm_globals.spawn_in_progress = false;
opal_condition_broadcast(&orte_plm_globals.spawn_in_progress_cond);
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
/* get here if launch went okay */
failed_launch = false;
if (orte_timing ) {
if (0 != gettimeofday(&joblaunchstop, NULL)) {
opal_output(0, "plm:rshbase: could not obtain job launch stop time");
} else {
opal_output(0, "plm:rshbase: total job launch time is %ld usec",
(joblaunchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
(joblaunchstop.tv_usec - joblaunchstart.tv_usec));
}
}
cleanup:
if (NULL != argv) {
opal_argv_free(argv);
}
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
}
return rc;
}
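/*
 * Illustrative sketch - not part of the original source.  The launch loop
 * above gates the number of simultaneous ssh sessions on a condition
 * variable; that is also why the --debug-daemons check at the top refuses
 * to run when num_concurrent is smaller than the number of new daemons:
 * no session ever exits, so the gate can never open.  Below is a minimal
 * standalone version of that gate using plain pthreads, with made-up names
 * standing in for the OPAL thread primitives and component fields.
 */
#include <pthread.h>
#include <stdio.h>

#define NUM_CONCURRENT 4    /* stand-in for mca_plm_rshbase_component.num_concurrent */
#define NUM_DAEMONS    16   /* stand-in for map->num_new_daemons */

static pthread_mutex_t gate_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  gate_cond = PTHREAD_COND_INITIALIZER;
static int num_in_progress = 0;

/* would be called from the SIGCHLD/waitpid path when an ssh session exits */
static void session_done(void)
{
    pthread_mutex_lock(&gate_lock);
    num_in_progress--;
    pthread_cond_signal(&gate_cond);    /* let the launcher start another session */
    pthread_mutex_unlock(&gate_lock);
}

static void launch_all(void)
{
    int i;
    for (i = 0; i < NUM_DAEMONS; i++) {
        /* the fork/exec of the ssh session would go here */
        printf("launching daemon %d\n", i);
        pthread_mutex_lock(&gate_lock);
        /* same test as above: once NUM_CONCURRENT sessions are open, block
         * until session_done() signals.  Under --debug-daemons no session
         * ever exits, so this wait would never return - hence the up-front
         * deadlock check. */
        if (num_in_progress++ >= NUM_CONCURRENT) {
            pthread_cond_wait(&gate_cond, &gate_lock);
        }
        pthread_mutex_unlock(&gate_lock);
    }
}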
/**
* Terminate the orteds for a given job
*/
static int terminate_orteds(void)
{
int rc;
/* now tell them to die */
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
}
static int signal_job(orte_jobid_t jobid, int32_t signal)
{
int rc;
/* order them to pass this signal to their local procs */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_signal_local_procs(jobid, signal))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
static int finalize(void)
{
int rc;
/* cleanup any pending recvs */
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
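For reference, the static-ports nodelist that the launch routine above passes to orte_plm_base_rsh_setup_launch() is just a comma-joined argv array. A small sketch of that opal_argv pattern in isolation, with made-up node names (the real code walks map->nodes and skips the local host):

#include "opal/util/argv.h"

static char *build_nodelist(void)
{
    char **nodes = NULL;
    char *nodelist = NULL;

    /* hypothetical node names - the launch code appends node->name here */
    opal_argv_append_nosize(&nodes, "node01");
    opal_argv_append_nosize(&nodes, "node02");
    opal_argv_append_nosize(&nodes, "node03");

    if (0 < opal_argv_count(nodes)) {
        nodelist = opal_argv_join(nodes, ',');    /* "node01,node02,node03" */
    }
    opal_argv_free(nodes);
    return nodelist;    /* caller frees it, as the launch path above does */
}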


@ -74,7 +74,6 @@
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/base/plm_base_rsh_support.h"
#include "plm_slurm.h"
@ -113,7 +112,6 @@ static pid_t primary_srun_pid = 0;
static bool primary_pid_set = false;
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
static bool launching_daemons;
static bool local_launch_available = false;
/**
* Init the module
@ -126,10 +124,6 @@ static int plm_slurm_init(void)
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS == orte_plm_base_rsh_launch_agent_setup(NULL, NULL)) {
local_launch_available = true;
}
return rc;
}
@ -173,22 +167,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
goto launch_apps;
}
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
/* if this is a request to launch a local slave,
* then we will not be launching an orted - we will
* directly ssh the slave process itself. No mapping
* is performed to support this - the caller must
* provide all the info required to launch the job,
* including the target hosts
*/
if (!local_launch_available) {
/* if we can't support this, then abort */
orte_show_help("help-plm-slurm.txt", "no-local-slave-support", true);
return ORTE_ERR_FAILED_TO_START;
}
return orte_plm_base_local_slave_launch(jdata);
}
/* flag the daemons as failing by default */
failed_job = ORTE_PROC_MY_NAME->jobid;


@ -95,7 +95,6 @@ static void failed_start(int fd, short event, void *arg);
* Local "global" variables
*/
static opal_event_t *ev=NULL;
static bool local_launch_available = false;
/*
* Global variable
@ -123,10 +122,6 @@ static int plm_tm_init(void)
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS == orte_plm_base_rsh_launch_agent_setup(orte_rsh_agent, NULL)) {
local_launch_available = true;
}
return rc;
}
@ -167,22 +162,6 @@ static int plm_tm_launch_job(orte_job_t *jdata)
goto launch_apps;
}
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
/* if this is a request to launch a local slave,
* then we will not be launching an orted - we will
* directly ssh the slave process itself. No mapping
* is performed to support this - the caller must
* provide all the info required to launch the job,
* including the target hosts
*/
if (!local_launch_available) {
/* if we can't support this, then abort */
orte_show_help("help-plm-tm.txt", "no-local-slave-support", true);
return ORTE_ERR_FAILED_TO_START;
}
return orte_plm_base_local_slave_launch(jdata);
}
/* if we are timing, record the start time */
if (orte_timing) {
gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL);


@ -1,5 +1,5 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* Copyright (c) 2007-2011 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
@ -87,7 +87,7 @@ static orte_process_name_t local_lifeline;
static int num_children;
static opal_list_t my_children;
static bool ack_recvd;
static bool hnp_direct=true;
static int init(void)
{
@ -237,13 +237,6 @@ static int update_route(orte_process_name_t *target,
return ORTE_SUCCESS;
}
/* if the job family is zero, then this is going to a local slave,
* so the path is direct and there is nothing to do here
*/
if (0 == ORTE_JOB_FAMILY(target->jobid)) {
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
"%s routed_binomial_update: %s --> %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -251,6 +244,16 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route)));
/* if I am a daemon and the target is my HNP, then check
* the route - if it isn't direct, then we just flag that
* we have a route to the HNP
*/
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) &&
OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, route)) {
hnp_direct = false;
return ORTE_SUCCESS;
}
/* if this is from a different job family, then I need to
* track how to send messages to it
*/
@ -372,14 +375,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/****** HNP AND DAEMONS ONLY ******/
/* if the job family is zero, then this is going to a local slave,
* so the path is direct
*/
if (0 == ORTE_JOB_FAMILY(target->jobid)) {
ret = target;
goto found;
}
/* IF THIS IS FOR A DIFFERENT JOB FAMILY... */
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
/* if I am a daemon, route this via the HNP */
@ -412,7 +407,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/* THIS CAME FROM OUR OWN JOB FAMILY... */
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
if (orte_static_ports) {
if (!hnp_direct || orte_static_ports) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routing to the HNP through my parent %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -648,15 +643,6 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
"%s routed_binomial: init routes w/non-NULL data",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* if this is for a job family of zero, then we know that the enclosed
* procs are local slaves to our daemon. In that case, we can just ignore this
* as our daemon - given that it had to spawn the local slave - already
* knows how to talk to them
*/
if (0 == ORTE_JOB_FAMILY(job)) {
return ORTE_SUCCESS;
}
if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) {
/* if this is for a different job family, then we route via our HNP
* to minimize connection counts to entities such as ompi-server, so
@ -1074,7 +1060,8 @@ static int update_routing_tree(orte_jobid_t jobid)
static orte_vpid_t get_routing_tree(opal_list_t *children)
{
opal_list_item_t *item;
orte_routed_tree_t *child, *nm;
orte_routed_tree_t *child;
orte_namelist_t *nm;
/* if I am anything other than a daemon or the HNP, this
* is a meaningless command as I am not allowed to route
@ -1091,10 +1078,10 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
item != opal_list_get_end(&my_children);
item = opal_list_get_next(item)) {
child = (orte_routed_tree_t*)item;
nm = OBJ_NEW(orte_routed_tree_t);
nm->vpid = child->vpid;
opal_bitmap_copy(&nm->relatives, &child->relatives);
opal_list_append(children, &nm->super);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
nm->name.vpid = child->vpid;
opal_list_append(children, &nm->item);
}
}
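The same hnp_direct change appears in the linear and radix routers below: update_route() clears the flag once it learns the HNP is reached through another daemon, and get_route() then relays HNP-bound traffic through the parent rather than opening a direct connection. A condensed, hypothetical sketch of just that decision, assuming the usual ORTE headers (my_parent and the function names are stand-ins; the comparison helper and globals are the real ORTE symbols):

#include "orte_config.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"

static bool hnp_direct = true;
static orte_process_name_t my_parent;    /* stand-in: filled in from the routing tree */

static void note_hnp_route(orte_process_name_t *target, orte_process_name_t *route)
{
    /* the target is the HNP but the proposed route is some other daemon:
     * remember that we must relay instead of talking to the HNP directly */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) &&
        OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, route)) {
        hnp_direct = false;
    }
}

static orte_process_name_t route_to_hnp(void)
{
    /* relay through the parent when no direct route exists, or when
     * static ports force strict tree routing */
    if (!hnp_direct || orte_static_ports) {
        return my_parent;
    }
    return *ORTE_PROC_MY_HNP;
}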


@ -1,5 +1,5 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* Copyright (c) 2007-2011 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -288,11 +288,8 @@ static int update_route(orte_process_name_t *target,
return ORTE_SUCCESS;
}
/* THIS CAME FROM OUR OWN JOB FAMILY... */
opal_output(0, "%s CALL TO UPDATE ROUTE FOR OWN JOB FAMILY", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return ORTE_ERR_NOT_SUPPORTED;
/* THIS CAME FROM OUR OWN JOB FAMILY...ignore it */
return ORTE_SUCCESS;
}
@ -646,15 +643,6 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
"%s routed_cm: init routes w/non-NULL data",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* if this is for a job family of zero, then we know that the enclosed
* procs are local slaves to our daemon. In that case, we can just ignore this
* as our daemon - given that it had to spawn the local slave - already
* knows how to talk to them
*/
if (0 == ORTE_JOB_FAMILY(job)) {
return ORTE_SUCCESS;
}
if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) {
/* if this is for a different job family, then we route via our HNP
* to minimize connection counts to entities such as ompi-server, so
@ -846,7 +834,7 @@ static int update_routing_tree(orte_jobid_t jobid)
static orte_vpid_t get_routing_tree(opal_list_t *children)
{
orte_routed_tree_t *nm;
orte_namelist_t *nm;
int32_t i;
orte_job_t *jdata;
orte_proc_t *proc;
@ -886,10 +874,10 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
ORTE_NAME_PRINT(&(proc->name)),
proc->state));
nm = OBJ_NEW(orte_routed_tree_t);
nm->vpid = proc->name.vpid;
opal_bitmap_clear_all_bits(&nm->relatives);
opal_list_append(children, &nm->super);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = proc->name.jobid;
nm->name.vpid = proc->name.vpid;
opal_list_append(children, &nm->item);
}
else {
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,


@ -1,5 +1,5 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* Copyright (c) 2007-2011 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
@ -330,7 +330,7 @@ static int update_routing_tree(orte_jobid_t jobid)
static orte_vpid_t get_routing_tree(opal_list_t *children)
{
orte_vpid_t i;
orte_routed_tree_t *nm;
orte_namelist_t *nm;
if (!ORTE_PROC_IS_HNP) {
/* if I am not the HNP, there is nothing to do */
@ -341,9 +341,10 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
* daemons so I can relay messages to them
*/
for (i=0; i < orte_process_info.num_procs; i++) {
nm = OBJ_NEW(orte_routed_tree_t);
nm->vpid = i;
opal_list_append(children, &nm->super);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
nm->name.vpid = i;
opal_list_append(children, &nm->item);
}
return ORTE_VPID_INVALID;
}


@ -1,5 +1,5 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* Copyright (c) 2007-2011 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
@ -83,7 +83,7 @@ static opal_mutex_t lock;
static orte_process_name_t *lifeline=NULL;
static orte_process_name_t local_lifeline;
static bool ack_recvd;
static bool hnp_direct=true;
static int init(void)
@ -222,13 +222,6 @@ static int update_route(orte_process_name_t *target,
return ORTE_SUCCESS;
}
/* if the job family is zero, then this is going to a local slave,
* so the path is direct and there is nothing to do here
*/
if (0 == ORTE_JOB_FAMILY(target->jobid)) {
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
"%s routed_linear_update: %s --> %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -236,6 +229,16 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route)));
/* if I am a daemon and the target is my HNP, then check
* the route - if it isn't direct, then we just flag that
* we have a route to the HNP
*/
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) &&
OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, route)) {
hnp_direct = false;
return ORTE_SUCCESS;
}
/* if this is from a different job family, then I need to
* track how to send messages to it
*/
@ -334,14 +337,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/****** HNP AND DAEMONS ONLY ******/
/* if the job family is zero, then this is going to a local slave,
* so the path is direct
*/
if (0 == ORTE_JOB_FAMILY(target->jobid)) {
ret = target;
goto found;
}
/* IF THIS IS FOR A DIFFERENT JOB FAMILY... */
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
/* if I am a daemon, route this via the HNP */
@ -375,7 +370,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/* THIS CAME FROM OUR OWN JOB FAMILY... */
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
if (orte_static_ports) {
if (!hnp_direct || orte_static_ports) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routing to the HNP through my parent %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -594,15 +589,6 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
"%s routed_linear: init routes w/non-NULL data",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* if this is for a job family of zero, then we know that the enclosed
* procs are local slaves to our daemon. In that case, we can just ignore this
* as our daemon - given that it had to spawn the local slave - already
* knows how to talk to them
*/
if (0 == ORTE_JOB_FAMILY(job)) {
return ORTE_SUCCESS;
}
if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) {
/* if this is for a different job family, then we route via our HNP
* to minimize connection counts to entities such as ompi-server, so
@ -792,8 +778,7 @@ static int update_routing_tree(orte_jobid_t jobid)
static orte_vpid_t get_routing_tree(opal_list_t *children)
{
orte_routed_tree_t *nm;
orte_vpid_t v;
orte_namelist_t *nm;
/* if I am anything other than a daemon or the HNP, this
* is a meaningless command as I am not allowed to route
@ -809,14 +794,10 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
if (NULL != children &&
ORTE_PROC_MY_NAME->vpid < orte_process_info.num_procs-1) {
/* my child is just the vpid+1 daemon */
nm = OBJ_NEW(orte_routed_tree_t);
opal_bitmap_init(&nm->relatives, orte_process_info.num_procs);
nm->vpid = ORTE_PROC_MY_NAME->vpid + 1;
/* my relatives are everyone above that point */
for (v=nm->vpid+1; v < orte_process_info.num_procs; v++) {
opal_bitmap_set_bit(&nm->relatives, v);
}
opal_list_append(children, &nm->super);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
nm->name.vpid = ORTE_PROC_MY_NAME->vpid + 1;
opal_list_append(children, &nm->item);
}
if (ORTE_PROC_IS_HNP) {


@ -1,5 +1,5 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* Copyright (c) 2007-2011 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
@ -86,7 +86,7 @@ static orte_process_name_t local_lifeline;
static int num_children;
static opal_list_t my_children;
static bool ack_recvd;
static bool hnp_direct=true;
static int init(void)
{
@ -235,13 +235,6 @@ static int update_route(orte_process_name_t *target,
return ORTE_SUCCESS;
}
/* if the job family is zero, then this is going to a local slave,
* so the path is direct and there is nothing to do here
*/
if (0 == ORTE_JOB_FAMILY(target->jobid)) {
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
"%s routed_radix_update: %s --> %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -249,6 +242,16 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route)));
/* if I am a daemon and the target is my HNP, then check
* the route - if it isn't direct, then we just flag that
* we have a route to the HNP
*/
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) &&
OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, route)) {
hnp_direct = false;
return ORTE_SUCCESS;
}
/* if this is from a different job family, then I need to
* track how to send messages to it
*/
@ -368,14 +371,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/****** HNP AND DAEMONS ONLY ******/
/* if the job family is zero, then this is going to a local slave,
* so the path is direct
*/
if (0 == ORTE_JOB_FAMILY(target->jobid)) {
ret = target;
goto found;
}
/* IF THIS IS FOR A DIFFERENT JOB FAMILY... */
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
/* if I am a daemon, route this via the HNP */
@ -409,7 +404,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/* THIS CAME FROM OUR OWN JOB FAMILY... */
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
if (orte_static_ports) {
if (!hnp_direct || orte_static_ports) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routing to the HNP through my parent %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -635,15 +630,6 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
"%s routed_radix: init routes w/non-NULL data",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* if this is for a job family of zero, then we know that the enclosed
* procs are local slaves to our daemon. In that case, we can just ignore this
* as our daemon - given that it had to spawn the local slave - already
* knows how to talk to them
*/
if (0 == ORTE_JOB_FAMILY(job)) {
return ORTE_SUCCESS;
}
if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) {
/* if this is for a different job family, then we route via our HNP
* to minimize connection counts to entities such as ompi-server, so
@ -1002,7 +988,8 @@ static int update_routing_tree(orte_jobid_t jobid)
static orte_vpid_t get_routing_tree(opal_list_t *children)
{
opal_list_item_t *item;
orte_routed_tree_t *child, *nm;
orte_routed_tree_t *child;
orte_namelist_t *nm;
/* if I am anything other than a daemon or the HNP, this
* is a meaningless command as I am not allowed to route
@ -1019,10 +1006,10 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
item != opal_list_get_end(&my_children);
item = opal_list_get_next(item)) {
child = (orte_routed_tree_t*)item;
nm = OBJ_NEW(orte_routed_tree_t);
nm->vpid = child->vpid;
opal_bitmap_copy(&nm->relatives, &child->relatives);
opal_list_append(children, &nm->super);
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
nm->name.vpid = child->vpid;
opal_list_append(children, &nm->item);
}
}
/* return my parent's vpid */


@ -207,7 +207,8 @@ typedef int (*orte_routed_module_update_routing_tree_fn_t)(orte_jobid_t jobid);
* Fills the provided list with the direct children of this process
* in the routing tree, and returns the vpid of the parent. Only valid
* when called by a daemon or the HNP. Passing a NULL pointer will result
* in onlly the parent vpid being returned.
* in only the parent vpid being returned. The returned list will be filled
* with orte_namelist_t items.
*/
typedef orte_vpid_t (*orte_routed_module_get_routing_tree_fn_t)(opal_list_t *children);
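Because the list is now populated with orte_namelist_t items rather than orte_routed_tree_t, callers walk and release it as in the relay loop further below. A minimal sketch of that usage, assuming the usual ORTE headers (the function name is hypothetical):

#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "opal/util/output.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/routed/routed.h"

static void show_routing_children(void)
{
    opal_list_t children;
    opal_list_item_t *item;
    orte_namelist_t *nm;
    orte_vpid_t parent;

    OBJ_CONSTRUCT(&children, opal_list_t);
    parent = orte_routed.get_routing_tree(&children);
    opal_output(0, "parent daemon vpid: %d", (int)parent);

    /* deconstruct the list, item by item */
    while (NULL != (item = opal_list_remove_first(&children))) {
        nm = (orte_namelist_t*)item;
        opal_output(0, "child: %s", ORTE_NAME_PRINT(&nm->name));
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&children);
}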


@ -1,37 +0,0 @@
#
# Copyright (c) 2007 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
sources = \
routed_slave.h \
routed_slave.c \
routed_slave_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_routed_slave_DSO
component_noinst =
component_install = mca_routed_slave.la
else
component_noinst = libmca_routed_slave.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_routed_slave_la_SOURCES = $(sources)
mca_routed_slave_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_routed_slave_la_SOURCES = $(sources)
libmca_routed_slave_la_LDFLAGS = -module -avoid-version


@ -1,19 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2011 Los Alamos National Security, LLC.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_routed_slave_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_routed_slave_CONFIG], [
AC_CONFIG_FILES([orte/mca/routed/slave/Makefile])
AS_IF([test "$orte_without_full_support" = 0],
[$1],
[$2])
])


@ -1,352 +0,0 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/threads/condition.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/routed/base/base.h"
#include "routed_slave.h"
static int init(void);
static int finalize(void);
static int delete_route(orte_process_name_t *proc);
static int update_route(orte_process_name_t *target,
orte_process_name_t *route);
static orte_process_name_t get_route(orte_process_name_t *target);
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static int update_routing_tree(orte_jobid_t jobid);
static orte_vpid_t get_routing_tree(opal_list_t *children);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
static size_t num_routes(void);
#if OPAL_ENABLE_FT_CR == 1
static int slave_ft_event(int state);
#endif
orte_routed_module_t orte_routed_slave_module = {
init,
finalize,
delete_route,
update_route,
get_route,
init_routes,
route_lost,
route_is_defined,
set_lifeline,
update_routing_tree,
get_routing_tree,
get_wireup_info,
num_routes,
#if OPAL_ENABLE_FT_CR == 1
slave_ft_event
#else
NULL
#endif
};
/* local globals */
static opal_condition_t cond;
static opal_mutex_t lock;
static orte_process_name_t *lifeline=NULL;
static orte_process_name_t local_lifeline;
static int init(void)
{
/* setup the global condition and lock */
OBJ_CONSTRUCT(&cond, opal_condition_t);
OBJ_CONSTRUCT(&lock, opal_mutex_t);
lifeline = NULL;
return ORTE_SUCCESS;
}
static int finalize(void)
{
/* destruct the global condition and lock */
OBJ_DESTRUCT(&cond);
OBJ_DESTRUCT(&lock);
lifeline = NULL;
return ORTE_SUCCESS;
}
static int delete_route(orte_process_name_t *proc)
{
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
"%s routed_slave_delete_route for %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
/*There is nothing to do here */
return ORTE_SUCCESS;
}
static int update_route(orte_process_name_t *target,
orte_process_name_t *route)
{
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
"%s routed_slave_update: %s --> %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(target),
ORTE_NAME_PRINT(route)));
/*There is nothing to do here */
return ORTE_SUCCESS;
}
static orte_process_name_t get_route(orte_process_name_t *target)
{
orte_process_name_t *ret;
#if ORTE_ENABLE_EPOCH
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID ||
0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) {
#else
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
#endif
ret = ORTE_NAME_INVALID;
goto found;
}
if (0 > ORTE_EPOCH_CMP(target->epoch, orte_ess.proc_get_epoch(target))) {
ret = ORTE_NAME_INVALID;
goto found;
}
/* a slave must always route via its parent daemon */
ret = ORTE_PROC_MY_DAEMON;
found:
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routed_slave_get(%s) --> %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(target),
ORTE_NAME_PRINT(ret)));
return *ret;
}
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
{
int rc;
opal_buffer_t buf;
if (NULL != ndat) {
/* if ndat != NULL, then this is being invoked by the proc to
* init a route to a specified process that is outside of our
* job family. It really doesn't matter to a slave, though, as
* everything has to go through our parent daemon, who must
* already know how to reach the specified proc since the
* inbound message had to go through it!
*/
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
"%s routed_slave: init routes w/non-NULL data",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return ORTE_SUCCESS;
}
/* if ndat=NULL, then we are being called during orte_init. In this
* case, we need to setup a few critical pieces of info
*/
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
"%s routed_slave: init routes for proc job %s\n\thnp_uri %s\n\tdaemon uri %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job),
(NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri,
(NULL == orte_process_info.my_daemon_uri) ? "NULL" : orte_process_info.my_daemon_uri));
if (NULL == orte_process_info.my_daemon_uri) {
/* in this module, we absolutely MUST have this information - if
* we didn't get it, then error out
*/
opal_output(0, "%s ERROR: Failed to identify the local daemon's URI",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
opal_output(0, "%s ERROR: This is a fatal condition when the slave router",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
opal_output(0, "%s ERROR: has been selected - either select the unity router",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
opal_output(0, "%s ERROR: or ensure that the local daemon info is provided",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return ORTE_ERR_FATAL;
}
/* we have to set the HNP's name, even though we won't route messages directly
* to it. This is required to ensure that we -do- send messages to the correct
* HNP name
*/
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
ORTE_PROC_MY_HNP, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* Set the contact info in the RML - this won't actually establish
* the connection, but just tells the RML how to reach the daemon
* if/when we attempt to send to it
*/
if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(orte_process_info.my_daemon_uri))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
/* extract the daemon's name so we can update the routing table */
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri,
ORTE_PROC_MY_DAEMON, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* set our lifeline to the local daemon - we will abort if this connection is lost */
lifeline = ORTE_PROC_MY_DAEMON;
/* send a message back to our daemon letting it know we are alive. This allows the
* daemon to "block" in spawn until we are running
*/
OBJ_CONSTRUCT(&buf, opal_buffer_t);
orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &buf, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0);
OBJ_DESTRUCT(&buf);
/* no answer is expected or coming */
return ORTE_SUCCESS;
}
static int route_lost(const orte_process_name_t *route)
{
/* if we lose the connection to the lifeline and we are NOT already,
* in finalize, tell the OOB to abort.
* NOTE: we cannot call abort from here as the OOB needs to first
* release a thread-lock - otherwise, we will hang!!
*/
if (!orte_finalizing &&
NULL != lifeline &&
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) {
return ORTE_ERR_FATAL;
}
/* we don't care about this one, so return success */
return ORTE_SUCCESS;
}
static bool route_is_defined(const orte_process_name_t *target)
{
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_ALL;
/* only the route to my daemon is defined */
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, target, ORTE_PROC_MY_DAEMON)) {
return false;
}
return true;
}
static int set_lifeline(orte_process_name_t *proc)
{
/* we have to copy the proc data because there is no
* guarantee that it will be preserved
*/
local_lifeline.jobid = proc->jobid;
local_lifeline.vpid = proc->vpid;
ORTE_EPOCH_SET(local_lifeline.epoch,orte_ess.proc_get_epoch(&local_lifeline));
lifeline = &local_lifeline;
return ORTE_SUCCESS;
}
static int update_routing_tree(orte_jobid_t jobid)
{
/* this is a meaningless command for a slave as I am not allowed to route */
return ORTE_ERR_NOT_SUPPORTED;
}
static orte_vpid_t get_routing_tree(opal_list_t *children)
{
/* this is a meaningless command for a slave as I am not allowed to route */
return ORTE_VPID_INVALID;
}
static int get_wireup_info(opal_buffer_t *buf)
{
/* this is a meaningless command for a slave as I am not allowed to route */
return ORTE_ERR_NOT_SUPPORTED;
}
static size_t num_routes(void)
{
return 0;
}
#if OPAL_ENABLE_FT_CR == 1
static int slave_ft_event(int state)
{
int ret, exit_status = ORTE_SUCCESS;
/******** Checkpoint Prep ********/
if(OPAL_CRS_CHECKPOINT == state) {
}
/******** Continue Recovery ********/
else if (OPAL_CRS_CONTINUE == state ) {
}
/******** Restart Recovery ********/
else if (OPAL_CRS_RESTART == state ) {
/*
* Re-exchange the routes
*/
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
exit_status = ret;
goto cleanup;
}
}
else if (OPAL_CRS_TERM == state ) {
/* Nothing */
}
else {
/* Error state = Nothing */
}
cleanup:
return exit_status;
}
#endif


@ -1,26 +0,0 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_ROUTED_SLAVE_H
#define MCA_ROUTED_SLAVE_H
#include "orte_config.h"
#include "orte/mca/routed/routed.h"
BEGIN_C_DECLS
ORTE_MODULE_DECLSPEC extern orte_routed_component_t mca_routed_slave_component;
extern orte_routed_module_t orte_routed_slave_module;
END_C_DECLS
#endif


@ -1,55 +0,0 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/routed/base/base.h"
#include "routed_slave.h"
static int orte_routed_slave_component_query(mca_base_module_t **module, int *priority);
/**
* component definition
*/
orte_routed_component_t mca_routed_slave_component = {
/* First, the mca_base_component_t struct containing meta
information about the component itself */
{
ORTE_ROUTED_BASE_VERSION_2_0_0,
"slave", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
NULL,
NULL,
orte_routed_slave_component_query
},
{
/* This component can be checkpointed */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
static int orte_routed_slave_component_query(mca_base_module_t **module, int *priority)
{
/* allow selection only when specifically requested */
*priority = 0;
*module = (mca_base_module_t *) &orte_routed_slave_module;
return ORTE_SUCCESS;
}


@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* Copyright (c) 2007-2011 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
@ -94,8 +94,7 @@ static void send_relay(opal_buffer_t *buf)
{
opal_list_t recips;
opal_list_item_t *item;
orte_routed_tree_t *nm;
orte_process_name_t target;
orte_namelist_t *nm;
int ret;
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
@ -116,26 +115,24 @@ static void send_relay(opal_buffer_t *buf)
}
/* send the message to each recipient on list, deconstructing it as we go */
target.jobid = ORTE_PROC_MY_NAME->jobid;
while (NULL != (item = opal_list_remove_first(&recips))) {
nm = (orte_routed_tree_t*)item;
nm = (orte_namelist_t*)item;
target.vpid = nm->vpid;
ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target));
ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name));
if (!PROC_IS_RUNNING(&target)) {
if (!PROC_IS_RUNNING(&nm->name)) {
continue;
}
ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target));
ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name));
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
"%s orte:daemon:send_relay sending relay msg to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&target)));
ORTE_NAME_PRINT(&nm->name)));
if (ORTE_SUCCESS != (ret = orte_comm(&target, buf, ORTE_RML_TAG_DAEMON,
orte_daemon_cmd_processor))) {
if (ORTE_SUCCESS != (ret = orte_comm(&nm->name, buf, ORTE_RML_TAG_DAEMON,
orte_daemon_cmd_processor))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}


@ -206,6 +206,15 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
};
static void rml_cbfunc(int status,
struct orte_process_name_t* peer,
struct opal_buffer_t* buffer,
orte_rml_tag_t tag,
void* cbdata)
{
OBJ_RELEASE(buffer);
}
int orte_daemon(int argc, char *argv[])
{
int ret = 0;
@ -539,6 +548,39 @@ int orte_daemon(int argc, char *argv[])
opal_event_add(&pipe_handler, NULL);
}
/* If I have a parent, then save its contact info so
* any messages we send can flow through it.
*/
mca_base_param_reg_string_name("orte", "parent_uri",
"URI for the parent if tree launch is enabled.",
true, false, NULL, &rml_uri);
if (NULL != rml_uri) {
orte_process_name_t parent;
/* set the contact info into the hash table */
if (ORTE_SUCCESS != (ret = orte_rml.set_contact_info(rml_uri))) {
ORTE_ERROR_LOG(ret);
free(rml_uri);
OBJ_RELEASE(buffer);
goto DONE;
}
ret = orte_rml_base_parse_uris(rml_uri, &parent, NULL );
if( ORTE_SUCCESS != ret ) {
ORTE_ERROR_LOG(ret);
free(rml_uri);
OBJ_RELEASE(buffer);
goto DONE;
}
free(rml_uri);
/* tell the routed module that we have a path
* back to the HNP
*/
if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_HNP, &parent))) {
ORTE_ERROR_LOG(ret);
goto DONE;
}
}
/* if we are not the HNP...the only time we will be an HNP
* is if we are launched by a singleton to provide support
* for it
@ -612,62 +654,29 @@ int orte_daemon(int argc, char *argv[])
}
}
/* If I have a parent, then first let him know my URI, and only
* after report back to the HNP.
*/
mca_base_param_reg_string_name("orte", "parent_uri",
"URI for the parent if tree launch is enabled.",
true, false, NULL, &rml_uri);
if (NULL != rml_uri) {
orte_process_name_t parent;
/* set the contact info into the hash table */
if (ORTE_SUCCESS != (ret = orte_rml.set_contact_info(rml_uri))) {
ORTE_ERROR_LOG(ret);
free(rml_uri);
OBJ_RELEASE(buffer);
goto DONE;
}
ret = orte_rml_base_parse_uris(rml_uri, &parent, NULL );
if( ORTE_SUCCESS != ret ) {
ORTE_ERROR_LOG(ret);
free(rml_uri);
OBJ_RELEASE(buffer);
goto DONE;
}
free(rml_uri);
if( 0 > (ret = orte_rml.send_buffer(&parent, buffer,
ORTE_RML_TAG_ORTED_CALLBACK, 0)) ) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
goto DONE;
}
} else {
/* include our node name */
opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING);
/* include our node name */
opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING);
#if OPAL_HAVE_HWLOC
/* add the local topology */
if (NULL != opal_hwloc_topology &&
(1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes)) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
ORTE_ERROR_LOG(ret);
}
/* add the local topology */
if (NULL != opal_hwloc_topology &&
(1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes)) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
ORTE_ERROR_LOG(ret);
}
}
#endif
/* send to the HNP's callback - this will flow up the routing
* tree if static ports are enabled
*/
if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer,
ORTE_RML_TAG_ORTED_CALLBACK, 0))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
goto DONE;
}
}
OBJ_RELEASE(buffer); /* done with this */
/* send to the HNP's callback - this will flow up the routing
* tree if static ports are enabled
*/
if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer,
ORTE_RML_TAG_ORTED_CALLBACK, 0,
rml_cbfunc, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
goto DONE;
}
}
if (orte_debug_daemons_flag) {
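Since the orted callback is now sent with the non-blocking RML API, the buffer must remain valid until the send completes; releasing it is all the rml_cbfunc defined near the top of this file does. A hedged sketch of that pattern in isolation, reusing the peer, tag and callback signature from above (the packed payload and function names are illustrative):

#include "orte_config.h"
#include "orte/constants.h"
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/orte_globals.h"

static void release_cbfunc(int status,
                           struct orte_process_name_t* peer,
                           struct opal_buffer_t* buffer,
                           orte_rml_tag_t tag,
                           void* cbdata)
{
    /* the send has completed (or failed) - the buffer is ours to release */
    OBJ_RELEASE(buffer);
}

static int send_callback_msg(void)
{
    int ret;
    opal_buffer_t *buffer = OBJ_NEW(opal_buffer_t);

    /* pack whatever must be reported - here just the node name */
    opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING);

    /* non-blocking send: do NOT release the buffer here on success -
     * the callback owns it once the RML accepts the request */
    if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer,
                                           ORTE_RML_TAG_ORTED_CALLBACK, 0,
                                           release_cbfunc, NULL))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(buffer);
        return ret;
    }
    return ORTE_SUCCESS;
}

Releasing the buffer in the completion callback lets the daemon continue its startup without blocking on the send while still keeping the packed data valid until the RML is done with it.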


@ -10,6 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -71,9 +73,6 @@ int orte_finalize(void)
if (NULL != orte_launch_agent) {
free(orte_launch_agent);
}
if (NULL != orte_rsh_agent) {
free(orte_rsh_agent);
}
if( NULL != orte_default_hostfile ) {
free(orte_default_hostfile);
}


@ -138,10 +138,6 @@ char *orte_xterm;
/* whether or not to forward SIGTSTP and SIGCONT signals */
bool orte_forward_job_control;
/* rsh support */
char *orte_rsh_agent = NULL;
bool orte_assume_same_shell = true;
/* report launch progress */
bool orte_report_launch_progress = false;


@ -606,10 +606,6 @@ ORTE_DECLSPEC extern char *orte_output_filename;
/* generate new xterm windows to display output from specified ranks */
ORTE_DECLSPEC extern char *orte_xterm;
/* rsh support */
ORTE_DECLSPEC extern char *orte_rsh_agent;
ORTE_DECLSPEC extern bool orte_assume_same_shell;
/* whether or not to report launch progress */
ORTE_DECLSPEC extern bool orte_report_launch_progress;


@ -43,9 +43,6 @@ int orte_register_params(void)
{
int value;
char *strval;
#if !ORTE_DISABLE_FULL_SUPPORT
int tmp;
#endif
/* only go thru this once - mpirun calls it twice, which causes
* any error messages to show up twice
@ -355,21 +352,6 @@ int orte_register_params(void)
(int) false, &value);
orte_forward_job_control = OPAL_INT_TO_BOOL(value);
/* local rsh/ssh launch agent */
tmp = mca_base_param_reg_string_name("orte", "rsh_agent",
"The command used to launch executables on remote nodes (typically either \"ssh\" or \"rsh\")",
false, false, "ssh : rsh", NULL);
mca_base_param_reg_syn_name(tmp, "pls", "rsh_agent", true);
mca_base_param_reg_syn_name(tmp, "plm", "rsh_agent", true);
mca_base_param_lookup_string(tmp, &orte_rsh_agent);
tmp = mca_base_param_reg_int_name("orte", "assume_same_shell",
"If set to 1, assume that the shell on the remote node is the same as the shell on the local node. Otherwise, probe for what the remote shell [default: 1]",
false, false, 1, NULL);
mca_base_param_reg_syn_name(tmp, "plm", "rsh_assume_same_shell", true);
mca_base_param_lookup_int(tmp, &value);
orte_assume_same_shell = OPAL_INT_TO_BOOL(value);
/* whether or not to report launch progress */
mca_base_param_reg_int_name("orte", "report_launch_progress",
"Output a brief periodic report on launch progress [default: no]",


@ -11,6 +11,8 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -24,7 +26,6 @@ EXTRA_DIST += tools/CMakeLists.txt
# orte/Makefile.am
SUBDIRS += \
tools/orte-bootproxy \
tools/orte-checkpoint \
tools/orte-clean \
tools/orte-ps \
@ -37,7 +38,6 @@ SUBDIRS += \
tools/orte-migrate
DIST_SUBDIRS += \
tools/orte-bootproxy \
tools/orte-checkpoint \
tools/orte-clean \
tools/orte-ps \


@ -1,27 +0,0 @@
#
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
if !ORTE_DISABLE_FULL_SUPPORT
if OMPI_INSTALL_BINARIES
dist_bin_SCRIPTS = orte-bootproxy.sh
endif # OMPI_INSTALL_BINARIES
endif # !ORTE_DISABLE_FULL_SUPPORT


@ -1,91 +0,0 @@
#!/bin/sh
#
# Copyright (c) 2009 Los Alamos National Security, LLC. All rights reserved
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
if (( $# < 1 )) ; then
echo "orte-bootproxy.sh: for OMPI internal use only"
exit 1
fi
# take the first arg
var=$1
# if the var is CLEANUP, then we are in cleanup mode
if [ "${var}" = "CLEANUP" ]; then
shift 1
var=$1
if [ -n "${var}" ] && [ "${var}" = "APPS" ]; then
# kill specified apps
shift 1
var=$1
# get the process table
psout=`ps`
# cycle through and look for the specified apps
while [ -n "${var}" ] && [ "${var}" != "FILES" ]; do
testvar=`echo "${psout}" | grep "${var}"`
if [ -n "${testvar}" ]; then
# echo "killall" "${var}"
killall -TERM "${var}"
fi
shift 1
var=$1
done
if [ -n "${var}" ]; then
shift 1
var=$1
# remove specified files
while [ -n "${var}" ]; do
if [ -e "${var}" ]; then
# echo "rm" "${var}"
rm -f "${var}"
fi
shift 1
var=$1
done
fi
elif [ "${var}" = "FILES" ]; then
# remove specified files
shift 1
var=$1
while [ -n "${var}" ]; do
if [ -e "${var}" ]; then
# echo "rm" "${var}"
rm -f "${var}"
fi
shift 1
var=$1
done
fi
# remove any session directories from this user
# sdir="${TMPDIR}""openmpi-sessions-""${USER}""@"`hostname`"_0"
sdir="/tmp/openmpi-sessions-""${USER}""@"`hostname`"_0"
if [ -e "${sdir}" ]; then
# echo "rm" "${sdir}"
rm -rf "${sdir}"
fi
exit 0
fi
# push all MCA params to the environment
while [ "${var:0:5}" = "OMPI_" ]; do
if [ "${var:5:6}" = "PREFIX" ]; then
export LD_LIBRARY_PATH="${var:12}"/lib:$LD_LIBRARY_PATH
export PATH="${var:12}"/bin:$PATH
elif [ "${var:5:4}" = "WDIR" ]; then
cd "${var:10}"
else
export $var
fi
shift 1
var=$1
done
# extract the application to be executed
app=$1
shift 1
#exec the app with the remaining args
#echo "executing" "$app"
exec "$app" "$@"