After more discussion on the phone, it seems easier to not muck around
in special components but rather go down to a /tmp branch. So removing these components and I'll branch next. This commit was SVN r10771.
Этот коммит содержится в:
родитель
d00e6e29e8
Коммит
ef8433a60b
@ -1,3 +0,0 @@
|
||||
rhc
|
||||
jsquyres
|
||||
Ralph
|
@ -1,56 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
AM_CPPFLAGS = $(pls_tbird_CPPFLAGS)
|
||||
|
||||
dist_pkgdata_DATA = help-pls-tbird.txt
|
||||
|
||||
sources = \
|
||||
pls_tbird.h \
|
||||
pls_tbird_component.c \
|
||||
pls_tbird_module.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_pls_tbird_DSO
|
||||
lib =
|
||||
lib_sources =
|
||||
component = mca_pls_tbird.la
|
||||
component_sources = $(sources)
|
||||
else
|
||||
lib = libmca_pls_tbird.la
|
||||
lib_sources = $(sources)
|
||||
component =
|
||||
component_sources =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component)
|
||||
mca_pls_tbird_la_SOURCES = $(component_sources)
|
||||
mca_pls_tbird_la_LDFLAGS = -module -avoid-version $(pls_tbird_LDFLAGS)
|
||||
mca_pls_tbird_la_LIBADD = \
|
||||
$(pls_tbird_LIBS) \
|
||||
$(top_ompi_builddir)/orte/liborte.la \
|
||||
$(top_ompi_builddir)/opal/libopal.la
|
||||
|
||||
noinst_LTLIBRARIES = $(lib)
|
||||
libmca_pls_tbird_la_SOURCES = $(lib_sources)
|
||||
libmca_pls_tbird_la_LDFLAGS = -module -avoid-version $(pls_tbird_LDFLAGS)
|
||||
libmca_pls_tbird_la_LIBADD = $(pls_tbird_LIBS)
|
@ -1,38 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_pls_tbird_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_pls_tbird_CONFIG],[
|
||||
OMPI_CHECK_TM([pls_tbird], [pls_tbird_good=1], [pls_tbird_good=0])
|
||||
|
||||
# if check worked, set wrapper flags if so.
|
||||
# Evaluate succeed / fail
|
||||
AS_IF([test "$pls_tbird_good" = "1"],
|
||||
[pls_tbird_WRAPPER_EXTRA_LDFLAGS="$pls_tbird_LDFLAGS"
|
||||
pls_tbird_WRAPPER_EXTRA_LIBS="$pls_tbird_LIBS"
|
||||
$1],
|
||||
[$2])
|
||||
|
||||
# set build flags to use in makefile
|
||||
AC_SUBST([pls_tbird_CPPFLAGS])
|
||||
AC_SUBST([pls_tbird_LDFLAGS])
|
||||
AC_SUBST([pls_tbird_LIBS])
|
||||
])dnl
|
@ -1,22 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
PARAM_INIT_FILE=src/pls_tbird_component.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
@ -1,44 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
[multiple-prefixes]
|
||||
Multiple different --prefix options were specified to mpirun for the
|
||||
same node. This is a fatal error for the TM (PBS / Torque) process
|
||||
starter in Open MPI.
|
||||
|
||||
The first two prefix values supplied for node %s were:
|
||||
%s
|
||||
and %s
|
||||
#
|
||||
[daemon-not-found]
|
||||
The TM (PBS / Torque) process starter in Open MPI was unable to find
|
||||
its daemon executable (orted) on the node where mpirun was executed.
|
||||
|
||||
This sanity check is performed because the back-end PBS / Torque
|
||||
process launcher does not provide any kind of error to Open MPI if it
|
||||
tries to launch its daemon on a remote node, but the daemon cannot be
|
||||
found. Open MPI's check for the daemon locally is somewhat of a lame
|
||||
workaround / sanity check.
|
||||
|
||||
If you do not understand this error message, please try the following:
|
||||
|
||||
1. Try to add the Open MPI executables to your PATH
|
||||
2. Use the --prefix option to mpirun to indicate where Open MPI can
|
||||
find its executables
|
||||
3. Set the MCA parameter "pls_tbird_want_path_check" to 0
|
||||
4. Talk to your local system administrator
|
@ -1,52 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/*
 * The include guard is specific to the tbird component.  It used to be
 * ORTE_PLS_TM_EXPORT_H, which would silently suppress this header in any
 * translation unit that had already seen a header using that guard.
 */
#ifndef ORTE_PLS_TBIRD_EXPORT_H
#define ORTE_PLS_TBIRD_EXPORT_H

#include "orte_config.h"

#include "opal/mca/mca.h"
#include "orte/mca/pls/pls.h"

#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif

/**
 * Component structure for the tbird process launch subsystem (pls).
 *
 * It extends the generic pls base component with the values the
 * component registers as MCA parameters when it is opened.
 */
struct orte_pls_tbird_component_t {
    /** Generic pls component descriptor; must stay the first member */
    orte_pls_base_component_t super;
    /** Selection priority, from the "priority" MCA parameter */
    int priority;
    /** Non-zero enables debugging output ("debug" MCA parameter) */
    int debug;
    /** Non-zero enables verbose output ("verbose" MCA parameter) */
    int verbose;
    /** Whether to look for the orted executable in PATH before launching
        ("want_path_check" MCA parameter) */
    bool want_path_check;
    /** Command used to start the proxy orted ("orted" MCA parameter) */
    char *orted;
    /** argv-style list released when the component is closed; presumably
        the PATH values that already passed the path check -- TODO confirm
        against pls_tbird_check_path() */
    char **checked_paths;
};
typedef struct orte_pls_tbird_component_t orte_pls_tbird_component_t;

/* Globally exported variables */
OMPI_COMP_EXPORT extern orte_pls_tbird_component_t mca_pls_tbird_component;
extern orte_pls_base_module_1_0_0_t orte_pls_tbird_module;

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* ORTE_PLS_TBIRD_EXPORT_H */
|
@ -1,144 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/mca/pls/pls.h"
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
#include "pls_tbird.h"
|
||||
|
||||
|
||||
/*
|
||||
* Public string showing the pls ompi_tbird component version number
|
||||
*/
|
||||
const char *mca_pls_tbird_component_version_string =
|
||||
"Open MPI tbird pls MCA component version " ORTE_VERSION;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Local function
|
||||
*/
|
||||
static int pls_tbird_open(void);
|
||||
static int pls_tbird_close(void);
|
||||
static struct orte_pls_base_module_1_0_0_t *pls_tbird_init(int *priority);
|
||||
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointers to our public functions in it
|
||||
*/
|
||||
|
||||
orte_pls_tbird_component_t mca_pls_tbird_component = {
|
||||
{
|
||||
/* First, the mca_component_t struct containing meta information
|
||||
about the component itself */
|
||||
|
||||
{
|
||||
/* Indicate that we are a pls v1.0.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
ORTE_PLS_BASE_VERSION_1_0_0,
|
||||
|
||||
/* Component name and version */
|
||||
"tbird",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
pls_tbird_open,
|
||||
pls_tbird_close,
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* Whether the component is checkpointable or not */
|
||||
true
|
||||
},
|
||||
|
||||
/* Initialization / querying functions */
|
||||
pls_tbird_init
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
static int pls_tbird_open(void)
|
||||
{
|
||||
int tbirdp;
|
||||
mca_base_component_t *comp = &mca_pls_tbird_component.super.pls_version;
|
||||
|
||||
mca_base_param_reg_int(comp, "debug", "Enable debugging of the TBIRD pls",
|
||||
false, false, 0, &mca_pls_tbird_component.debug);
|
||||
mca_base_param_reg_int(comp, "verbose", "Enable verbose output of the TBIRD pls",
|
||||
false, false, 0, &mca_pls_tbird_component.verbose);
|
||||
|
||||
mca_base_param_reg_int(comp, "priority", "Default selection priority",
|
||||
false, false, 75, &mca_pls_tbird_component.priority);
|
||||
|
||||
mca_base_param_reg_string(comp, "orted",
|
||||
"Command to use to start proxy orted",
|
||||
false, false, "orted",
|
||||
&mca_pls_tbird_component.orted);
|
||||
mca_base_param_reg_int(comp, "want_path_check",
|
||||
"Whether the launching process should check for the pls_tbird_orted executable in the PATH before launching (the TM API does not give an idication of failure; this is a somewhat-lame workaround; non-zero values enable this check)",
|
||||
false, false, (int) true, &tbirdp);
|
||||
mca_pls_tbird_component.want_path_check = (bool) tbirdp;
|
||||
|
||||
mca_pls_tbird_component.checked_paths = NULL;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int pls_tbird_close(void)
|
||||
{
|
||||
if (NULL != mca_pls_tbird_component.checked_paths) {
|
||||
opal_argv_free(mca_pls_tbird_component.checked_paths);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static struct orte_pls_base_module_1_0_0_t *pls_tbird_init(int *priority)
|
||||
{
|
||||
/* Are we running under a TM job? */
|
||||
|
||||
if (NULL != getenv("PBS_ENVIRONMENT") &&
|
||||
NULL != getenv("PBS_JOBID")) {
|
||||
*priority = mca_pls_tbird_component.priority;
|
||||
return &orte_pls_tbird_module;
|
||||
}
|
||||
|
||||
/* Sadly, no */
|
||||
|
||||
opal_output(orte_pls_base.pls_output,
|
||||
"pls:tbird: NOT available for selection");
|
||||
return NULL;
|
||||
}
|
@ -1,717 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#if HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <signal.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
#ifdef HAVE_SCHED_H
|
||||
#include <sched.h>
|
||||
#endif
|
||||
#include <errno.h>
|
||||
#include <tbird.h>
|
||||
|
||||
#include "opal/install_dirs.h"
|
||||
#include "opal/event/event.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
#include "orte/mca/rmaps/base/rmaps_base_map.h"
|
||||
#include "orte/mca/pls/pls.h"
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/soh/soh_types.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/sds/base/base.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "pls_tbird.h"
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int pls_tbird_launch(orte_jobid_t jobid);
|
||||
static int pls_tbird_terminate_job(orte_jobid_t jobid);
|
||||
static int pls_tbird_terminate_proc(const orte_process_name_t *name);
|
||||
static int pls_tbird_signal_job(orte_jobid_t jobid, int32_t signal);
|
||||
static int pls_tbird_signal_proc(const orte_process_name_t *name, int32_t signal);
|
||||
static int pls_tbird_finalize(void);
|
||||
|
||||
static int pls_tbird_connect(void);
|
||||
static int pls_tbird_disconnect(void);
|
||||
static int pls_tbird_start_proc(char *nodename, int argc, char **argv, char **env);
|
||||
static int pls_tbird_check_path(char *exe, char **env);
|
||||
|
||||
/*
|
||||
* Global variable
|
||||
*/
|
||||
orte_pls_base_module_1_0_0_t orte_pls_tbird_module = {
|
||||
pls_tbird_launch,
|
||||
pls_tbird_terminate_job,
|
||||
pls_tbird_terminate_proc,
|
||||
pls_tbird_signal_job,
|
||||
pls_tbird_signal_proc,
|
||||
pls_tbird_finalize
|
||||
};
|
||||
|
||||
|
||||
extern char **environ;
|
||||
|
||||
|
||||
static int
|
||||
pls_tbird_launch(orte_jobid_t jobid)
|
||||
{
|
||||
opal_list_t mapping;
|
||||
opal_list_item_t *m_item, *n_item;
|
||||
size_t num_nodes;
|
||||
orte_vpid_t vpid;
|
||||
int node_name_index;
|
||||
int proc_name_index;
|
||||
char *jobid_string;
|
||||
char *uri, *param;
|
||||
char **argv;
|
||||
int argc;
|
||||
int rc;
|
||||
bool connected = false;
|
||||
int launched = 0, i;
|
||||
char *bin_base = NULL, *lib_base = NULL;
|
||||
|
||||
/* Query the list of nodes allocated and mapped to this job.
|
||||
* We need the entire mapping for a couple of reasons:
|
||||
* - need the prefix to start with.
|
||||
* - need to know if we are launching on a subset of the allocated nodes
|
||||
*/
|
||||
OBJ_CONSTRUCT(&mapping, opal_list_t);
|
||||
rc = orte_rmaps_base_get_map(jobid, &mapping);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
num_nodes = 0;
|
||||
for(m_item = opal_list_get_first(&mapping);
|
||||
m_item != opal_list_get_end(&mapping);
|
||||
m_item = opal_list_get_next(m_item)) {
|
||||
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*)m_item;
|
||||
num_nodes += opal_list_get_size(&map->nodes);
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a range of vpids for the daemons.
|
||||
*/
|
||||
if (num_nodes == 0) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* need integer value for command line parameter */
|
||||
asprintf(&jobid_string, "%lu", (unsigned long) jobid);
|
||||
|
||||
/* add the daemon command (as specified by user) */
|
||||
argv = opal_argv_split(mca_pls_tbird_component.orted, ' ');
|
||||
argc = opal_argv_count(argv);
|
||||
|
||||
opal_argv_append(&argc, &argv, "--no-daemonize");
|
||||
|
||||
/* check for debug flags */
|
||||
orte_pls_base_proxy_mca_argv(&argc, &argv);
|
||||
|
||||
/* proxy information */
|
||||
opal_argv_append(&argc, &argv, "--bootproxy");
|
||||
opal_argv_append(&argc, &argv, jobid_string);
|
||||
opal_argv_append(&argc, &argv, "--name");
|
||||
proc_name_index = argc;
|
||||
opal_argv_append(&argc, &argv, "");
|
||||
|
||||
/* tell the daemon how many procs are in the daemon's job */
|
||||
opal_argv_append(&argc, &argv, "--num_procs");
|
||||
asprintf(¶m, "%lu", (unsigned long)(vpid + num_nodes));
|
||||
opal_argv_append(&argc, &argv, param);
|
||||
free(param);
|
||||
|
||||
/* tell the daemon the starting vpid of the daemon's job */
|
||||
opal_argv_append(&argc, &argv, "--vpid_start");
|
||||
opal_argv_append(&argc, &argv, "0");
|
||||
|
||||
opal_argv_append(&argc, &argv, "--nodename");
|
||||
node_name_index = argc;
|
||||
opal_argv_append(&argc, &argv, "");
|
||||
|
||||
/* pass along the universe name and location info */
|
||||
opal_argv_append(&argc, &argv, "--universe");
|
||||
asprintf(¶m, "%s@%s:%s", orte_universe_info.uid,
|
||||
orte_universe_info.host, orte_universe_info.name);
|
||||
opal_argv_append(&argc, &argv, param);
|
||||
free(param);
|
||||
|
||||
/* setup ns contact info */
|
||||
opal_argv_append(&argc, &argv, "--nsreplica");
|
||||
if (NULL != orte_process_info.ns_replica_uri) {
|
||||
uri = strdup(orte_process_info.ns_replica_uri);
|
||||
} else {
|
||||
uri = orte_rml.get_uri();
|
||||
}
|
||||
asprintf(¶m, "\"%s\"", uri);
|
||||
opal_argv_append(&argc, &argv, param);
|
||||
free(uri);
|
||||
free(param);
|
||||
|
||||
/* setup gpr contact info */
|
||||
opal_argv_append(&argc, &argv, "--gprreplica");
|
||||
if (NULL != orte_process_info.gpr_replica_uri) {
|
||||
uri = strdup(orte_process_info.gpr_replica_uri);
|
||||
} else {
|
||||
uri = orte_rml.get_uri();
|
||||
}
|
||||
asprintf(¶m, "\"%s\"", uri);
|
||||
opal_argv_append(&argc, &argv, param);
|
||||
free(uri);
|
||||
free(param);
|
||||
|
||||
if (mca_pls_tbird_component.debug) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
if (NULL != param) {
|
||||
opal_output(0, "pls:tbird: final top-level argv:");
|
||||
opal_output(0, "pls:tbird: %s", param);
|
||||
free(param);
|
||||
}
|
||||
}
|
||||
|
||||
rc = pls_tbird_connect();
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
goto cleanup;
|
||||
}
|
||||
connected = true;
|
||||
|
||||
/* Figure out the basenames for the libdir and bindir. There is a
|
||||
lengthy comment about this in pls_rsh_module.c explaining all
|
||||
the rationale for how / why we're doing this. */
|
||||
|
||||
lib_base = opal_basename(OPAL_LIBDIR);
|
||||
bin_base = opal_basename(OPAL_BINDIR);
|
||||
|
||||
/*
|
||||
* iterate through each of the contexts
|
||||
*/
|
||||
for (m_item = opal_list_get_first(&mapping);
|
||||
m_item != opal_list_get_end(&mapping);
|
||||
m_item = opal_list_get_next(m_item)) {
|
||||
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*)m_item;
|
||||
char** env;
|
||||
char* var;
|
||||
|
||||
/* setup environment */
|
||||
env = opal_argv_copy(environ);
|
||||
var = mca_base_param_environ_variable("seed",NULL,NULL);
|
||||
opal_setenv(var, "0", true, &env);
|
||||
|
||||
/* If we have a prefix, then modify the PATH and
|
||||
LD_LIBRARY_PATH environment variables. */
|
||||
if (NULL != map->app->prefix_dir) {
|
||||
int i;
|
||||
char *newenv;
|
||||
|
||||
for (i = 0; NULL != env && NULL != env[i]; ++i) {
|
||||
/* Reset PATH */
|
||||
if (0 == strncmp("PATH=", env[i], 5)) {
|
||||
asprintf(&newenv, "%s/%s:%s",
|
||||
map->app->prefix_dir, bin_base, env[i] + 5);
|
||||
if (mca_pls_tbird_component.debug) {
|
||||
opal_output(0, "pls:tbird: resetting PATH: %s",
|
||||
newenv);
|
||||
}
|
||||
opal_setenv("PATH", newenv, true, &env);
|
||||
free(newenv);
|
||||
}
|
||||
|
||||
/* Reset LD_LIBRARY_PATH */
|
||||
else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) {
|
||||
asprintf(&newenv, "%s/%s:%s",
|
||||
map->app->prefix_dir, lib_base, env[i] + 16);
|
||||
if (mca_pls_tbird_component.debug) {
|
||||
opal_output(0, "pls:tbird: resetting LD_LIBRARY_PATH: %s",
|
||||
newenv);
|
||||
}
|
||||
opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
|
||||
free(newenv);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Do a quick sanity check to ensure that we can find the
|
||||
orted in the PATH */
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = pls_tbird_check_path(argv[0], env))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
opal_show_help("help-pls-tbird.txt", "daemon-not-found",
|
||||
true, argv[0]);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Iterate through each of the nodes and spin
|
||||
* up a daemon.
|
||||
*/
|
||||
for (n_item = opal_list_get_first(&map->nodes);
|
||||
n_item != opal_list_get_end(&map->nodes);
|
||||
n_item = opal_list_get_next(n_item)) {
|
||||
orte_rmaps_base_node_t* rmaps_node = (orte_rmaps_base_node_t*)n_item;
|
||||
orte_ras_node_t* node = rmaps_node->node;
|
||||
orte_process_name_t* name;
|
||||
char* name_string;
|
||||
|
||||
/* already launched on this node */
|
||||
if (0 != node->node_launched++) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* setup node name */
|
||||
argv[node_name_index] = node->node_name;
|
||||
|
||||
/* initialize daemons process name */
|
||||
rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* setup per-node options */
|
||||
if (mca_pls_tbird_component.debug ||
|
||||
mca_pls_tbird_component.verbose) {
|
||||
opal_output(0, "pls:tbird: launching on node %s",
|
||||
node->node_name);
|
||||
}
|
||||
|
||||
/* setup process name */
|
||||
rc = orte_ns.get_proc_name_string(&name_string, name);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
opal_output(0, "pls:tbird: unable to create process name");
|
||||
return rc;
|
||||
}
|
||||
argv[proc_name_index] = name_string;
|
||||
|
||||
/* set the progress engine schedule for this node.
|
||||
* if node_slots is set to zero, then we default to
|
||||
* NOT being oversubscribed
|
||||
*/
|
||||
if (node->node_slots > 0 &&
|
||||
opal_list_get_size(&rmaps_node->node_procs) > node->node_slots) {
|
||||
if (mca_pls_tbird_component.debug) {
|
||||
opal_output(0, "pls:tbird: oversubscribed -- setting mpi_yield_when_idle to 1 (%d %d)",
|
||||
node->node_slots,
|
||||
opal_list_get_size(&rmaps_node->node_procs));
|
||||
}
|
||||
var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
|
||||
opal_setenv(var, "1", true, &env);
|
||||
} else {
|
||||
if (mca_pls_tbird_component.debug) {
|
||||
opal_output(0, "pls:tbird: not oversubscribed -- setting mpi_yield_when_idle to 0");
|
||||
}
|
||||
var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
|
||||
opal_setenv(var, "0", true, &env);
|
||||
}
|
||||
free(var);
|
||||
|
||||
/* save the daemons name on the node */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_proxy_set_node_name(node,jobid,name))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* exec the daemon */
|
||||
if (mca_pls_tbird_component.debug) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
if (NULL != param) {
|
||||
opal_output(0, "pls:tbird: executing: %s", param);
|
||||
free(param);
|
||||
}
|
||||
}
|
||||
|
||||
rc = pls_tbird_start_proc(node->node_name, argc, argv, env);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
opal_output(0, "pls:tbird: start_procs returned error %d", rc);
|
||||
goto cleanup;
|
||||
}
|
||||
launched++;
|
||||
vpid++;
|
||||
free(name);
|
||||
opal_event_loop(OPAL_EVLOOP_NONBLOCK);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* loop through all those that are launched and poll for
|
||||
completion status */
|
||||
|
||||
for(i = 0; i < launched; i++){
|
||||
int ret, local_err;
|
||||
tm_event_t event;
|
||||
ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
|
||||
if (TM_SUCCESS != ret) {
|
||||
errno = local_err;
|
||||
opal_output(0, "pls:tbird: failed to start a proc error %d", ret);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
if (connected) {
|
||||
pls_tbird_disconnect();
|
||||
}
|
||||
|
||||
while (NULL != (m_item = opal_list_remove_first(&mapping))) {
|
||||
OBJ_RELEASE(m_item);
|
||||
}
|
||||
OBJ_DESTRUCT(&mapping);
|
||||
if (NULL != lib_base) {
|
||||
free(lib_base);
|
||||
}
|
||||
if (NULL != bin_base) {
|
||||
free(bin_base);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
pls_tbird_terminate_job(orte_jobid_t jobid)
|
||||
{
|
||||
return orte_pls_base_proxy_terminate_job(jobid);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* TM can't kill individual processes -- PBS will kill the entire job
|
||||
*/
|
||||
static int
|
||||
pls_tbird_terminate_proc(const orte_process_name_t *name)
|
||||
{
|
||||
opal_output(orte_pls_base.pls_output,
|
||||
"pls:tbird:terminate_proc: not supported");
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
pls_tbird_signal_job(orte_jobid_t jobid, int32_t signal)
|
||||
{
|
||||
return orte_pls_base_proxy_signal_job(jobid, signal);
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
pls_tbird_signal_proc(const orte_process_name_t *name, int32_t signal)
|
||||
{
|
||||
return orte_pls_base_proxy_signal_proc(name, signal);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Free stuff
|
||||
*/
|
||||
static int
|
||||
pls_tbird_finalize(void)
|
||||
{
|
||||
/* cleanup any pending recvs */
|
||||
orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_RMGR_CLNT);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
pls_tbird_connect(void)
|
||||
{
|
||||
int ret;
|
||||
struct tm_roots tm_root;
|
||||
int count, progress;
|
||||
|
||||
/* try a couple times to connect - might get busy signals every
|
||||
now and then */
|
||||
for (count = 0 ; count < 10; ++count) {
|
||||
ret = tm_init(NULL, &tbird_root);
|
||||
if (TM_SUCCESS == ret) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
for (progress = 0 ; progress < 10 ; ++progress) {
|
||||
opal_progress();
|
||||
#if HAVE_SCHED_YIELD
|
||||
sched_yield();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_ERR_RESOURCE_BUSY;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
pls_tbird_disconnect(void)
|
||||
{
|
||||
tm_finalize();
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static char **tbird_hostnames = NULL;
|
||||
static tm_node_id *tbird_node_ids = NULL;
|
||||
static int num_tbird_hostnames, num_node_ids;
|
||||
|
||||
|
||||
|
||||
/* we don't call this anymore */
|
||||
/*
|
||||
* For a given TM node ID, get the string hostname corresponding to
|
||||
* it.
|
||||
*/
|
||||
static char*
|
||||
get_tbird_hostname(tbird_node_id node)
|
||||
{
|
||||
char *hostname;
|
||||
char buffer[256];
|
||||
int ret, local_errno;
|
||||
tm_event_t event;
|
||||
char **argv;
|
||||
|
||||
/* Get the info string corresponding to this TM node ID */
|
||||
|
||||
ret = tm_rescinfo(node, buffer, sizeof(buffer) - 1, &event);
|
||||
if (TM_SUCCESS != ret) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Now wait for that event to happen */
|
||||
|
||||
ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
|
||||
if (TM_SUCCESS != ret) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* According to the TM man page, we get back a space-separated
|
||||
string array. The hostname is the second item. Use a cheap
|
||||
trick to get it. */
|
||||
|
||||
buffer[sizeof(buffer) - 1] = '\0';
|
||||
argv = opal_argv_split(buffer, ' ');
|
||||
if (NULL == argv) {
|
||||
return NULL;
|
||||
}
|
||||
hostname = strdup(argv[1]);
|
||||
opal_argv_free(argv);
|
||||
|
||||
/* All done */
|
||||
|
||||
return hostname;
|
||||
}
|
||||
|
||||
|
||||
/* we don't call this anymore!*/
|
||||
static int
|
||||
query_tbird_hostnames(void)
|
||||
{
|
||||
char *h;
|
||||
int i, ret;
|
||||
|
||||
/* Get the list of nodes allocated in this PBS job */
|
||||
|
||||
ret = tm_nodeinfo(&tbird_node_ids, &num_node_ids);
|
||||
if (TM_SUCCESS != ret) {
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* TM "nodes" may actually correspond to PBS "VCPUs", which means
|
||||
there may be multiple "TM nodes" that correspond to the same
|
||||
physical node. This doesn't really affect what we're doing
|
||||
here (we actually ignore the fact that they're duplicates --
|
||||
slightly inefficient, but no big deal); just mentioned for
|
||||
completeness... */
|
||||
|
||||
tm_hostnames = NULL;
|
||||
num_tbird_hostnames = 0;
|
||||
for (i = 0; i < num_node_ids; ++i) {
|
||||
h = get_tbird_hostname(tbird_node_ids[i]);
|
||||
opal_argv_append(&num_tbird_hostnames, &tbird_hostnames, h);
|
||||
free(h);
|
||||
}
|
||||
|
||||
/* All done */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* we don't call this anymore! */
|
||||
static int
|
||||
do_tbird_resolve(char *hostname, tm_node_id *tnodeid)
|
||||
{
|
||||
int i, ret;
|
||||
|
||||
/* Have we already queried TM for all the node info? */
|
||||
if (NULL == tm_hostnames) {
|
||||
ret = query_tbird_hostnames();
|
||||
if (ORTE_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* Find the TM ID of the hostname that we're looking for */
|
||||
for (i = 0; i < num_tbird_hostnames; ++i) {
|
||||
if (0 == strcmp(hostname, tm_hostnames[i])) {
|
||||
*tnodeid = tm_node_ids[i];
|
||||
opal_output(orte_pls_base.pls_output,
|
||||
"pls:tbird:launch: resolved host %s to node ID %d",
|
||||
hostname, tm_node_ids[i]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* All done */
|
||||
if (i < num_tbird_hostnames) {
|
||||
ret = ORTE_SUCCESS;
|
||||
} else {
|
||||
ret = ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
pls_tbird_start_proc(char *nodename, int argc, char **argv, char **env)
|
||||
{
|
||||
int ret;
|
||||
tm_node_id node_id;
|
||||
tm_task_id task_id;
|
||||
tm_event_t event;
|
||||
|
||||
/* get the tbird node id for this node */
|
||||
ret = do_tbird_resolve(nodename, &node_id);
|
||||
if (ORTE_SUCCESS != ret) return ret;
|
||||
|
||||
ret = tm_spawn(argc, argv, env, node_id, &task_id, &event);
|
||||
if (TM_SUCCESS != ret) return ORTE_ERROR;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int pls_tbird_check_path(char *exe, char **env)
|
||||
{
|
||||
static int size = 256;
|
||||
int i;
|
||||
char *file;
|
||||
char *cwd;
|
||||
char *path = NULL;
|
||||
|
||||
/* Do we want this check at all? */
|
||||
|
||||
if (!mca_pls_tbird_component.want_path_check) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Find the path in the supplied environment */
|
||||
|
||||
for (i = 0; NULL != env[i]; ++i) {
|
||||
if (0 == strncmp("PATH=", env[i], 5)) {
|
||||
path = strdup(env[i]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == env[i]) {
|
||||
path = strdup("NULL");
|
||||
}
|
||||
|
||||
/* Check the already-successful paths (i.e., be a little
|
||||
friendlier to the filesystem -- if we find the executable
|
||||
successfully, save it) */
|
||||
|
||||
for (i = 0; NULL != mca_pls_tbird_component.checked_paths &&
|
||||
NULL != mca_pls_tbird_component.checked_paths[i]; ++i) {
|
||||
if (0 == strcmp(path, mca_pls_tbird_component.checked_paths[i])) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
/* We didn't already find it, so check now. First, get the cwd. */
|
||||
|
||||
do {
|
||||
cwd = malloc(size);
|
||||
if (NULL == cwd) {
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
if (NULL == getcwd(cwd, size)) {
|
||||
free(cwd);
|
||||
if (ERANGE == errno) {
|
||||
size *= 2;
|
||||
} else {
|
||||
return ORTE_ERR_IN_ERRNO;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while (1);
|
||||
|
||||
/* Now do the search */
|
||||
|
||||
file = opal_path_findv(exe, X_OK, env, cwd);
|
||||
free(cwd);
|
||||
if (NULL == file) {
|
||||
free(path);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
if (mca_pls_tbird_component.debug) {
|
||||
opal_output(0, "pls:tbird: found %s", file);
|
||||
}
|
||||
free(file);
|
||||
|
||||
/* Success -- so cache it */
|
||||
|
||||
opal_argv_append_nosize(&mca_pls_tbird_component.checked_paths, path);
|
||||
|
||||
/* All done */
|
||||
|
||||
free(path);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,3 +0,0 @@
|
||||
rhc
|
||||
jsquyres
|
||||
Ralph
|
@ -1,51 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Use the top-level Makefile.options
|
||||
|
||||
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_rmgr_tbird_DSO
|
||||
component_noinst =
|
||||
component_install = mca_rmgr_tbird.la
|
||||
else
|
||||
component_noinst = libmca_rmgr_tbird.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
proxy_SOURCES = \
|
||||
rmgr_tbird.c \
|
||||
rmgr_tbird.h \
|
||||
rmgr_tbird_component.c
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_rmgr_tbird_la_SOURCES = $(proxy_SOURCES)
|
||||
mca_rmgr_tbird_la_LIBADD = \
|
||||
$(top_ompi_builddir)/orte/liborte.la \
|
||||
$(top_ompi_builddir)/opal/libopal.la
|
||||
mca_rmgr_tbird_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_rmgr_tbird_la_SOURCES = $(proxy_SOURCES)
|
||||
libmca_rmgr_tbird_la_LIBADD =
|
||||
libmca_rmgr_tbird_la_LDFLAGS = -module -avoid-version
|
@ -1,24 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_INIT_FILE=rmgr_tbird.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
@ -1,557 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
|
||||
#include "opal/util/trace.h"
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rds/base/base.h"
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
|
||||
#include "orte/mca/rmgr/tbird/rmgr_tbird.h"
|
||||
|
||||
|
||||
static int orte_rmgr_tbird_query(void);
|
||||
|
||||
static int orte_rmgr_tbird_create(
|
||||
orte_app_context_t** app_context,
|
||||
size_t num_context,
|
||||
orte_jobid_t* jobid);
|
||||
|
||||
static int orte_rmgr_tbird_allocate(
|
||||
orte_jobid_t jobid);
|
||||
|
||||
static int orte_rmgr_tbird_deallocate(
|
||||
orte_jobid_t jobid);
|
||||
|
||||
static int orte_rmgr_tbird_map(
|
||||
orte_jobid_t jobid);
|
||||
|
||||
static int orte_rmgr_tbird_launch(
|
||||
orte_jobid_t jobid);
|
||||
|
||||
static int orte_rmgr_tbird_terminate_job(
|
||||
orte_jobid_t jobid);
|
||||
|
||||
static int orte_rmgr_tbird_terminate_proc(
|
||||
const orte_process_name_t* proc_name);
|
||||
|
||||
static int orte_rmgr_tbird_signal_job(
|
||||
orte_jobid_t jobid, int32_t signal);
|
||||
|
||||
static int orte_rmgr_tbird_signal_proc(
|
||||
const orte_process_name_t* proc_name,
|
||||
int32_t signal);
|
||||
|
||||
static int orte_rmgr_tbird_spawn(
|
||||
orte_app_context_t** app_context,
|
||||
size_t num_context,
|
||||
orte_jobid_t* jobid,
|
||||
orte_rmgr_cb_fn_t cbfn,
|
||||
orte_proc_state_t cb_conditions);
|
||||
|
||||
static int orte_rmgr_tbird_finalize(void);
|
||||
|
||||
|
||||
orte_rmgr_base_module_t orte_rmgr_tbird_module = {
|
||||
orte_rmgr_tbird_query,
|
||||
orte_rmgr_tbird_create,
|
||||
orte_rmgr_tbird_allocate,
|
||||
orte_rmgr_tbird_deallocate,
|
||||
orte_rmgr_tbird_map,
|
||||
orte_rmgr_tbird_launch,
|
||||
orte_rmgr_tbird_terminate_job,
|
||||
orte_rmgr_tbird_terminate_proc,
|
||||
orte_rmgr_tbird_signal_job,
|
||||
orte_rmgr_tbird_signal_proc,
|
||||
orte_rmgr_tbird_spawn,
|
||||
orte_rmgr_base_proc_stage_gate_init,
|
||||
orte_rmgr_base_proc_stage_gate_mgr,
|
||||
orte_rmgr_tbird_finalize
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Resource discovery
|
||||
*/
|
||||
|
||||
static int orte_rmgr_tbird_query(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_rds_base_query())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Create the job segment and initialize the application context.
|
||||
*/
|
||||
|
||||
static int orte_rmgr_tbird_create(
|
||||
orte_app_context_t** app_context,
|
||||
size_t num_context,
|
||||
orte_jobid_t* jobid)
|
||||
{
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
/* allocate a jobid */
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* create and initialize job segment */ /* JJH C/N mapping before this */
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_rmgr_base_put_app_context(*jobid, app_context,
|
||||
num_context))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Allocate resources for a job by calling the RAS base allocator,
 * passing the address of the component's tbird_ras module pointer.
 */
static int orte_rmgr_tbird_allocate(orte_jobid_t jobid)
{
    int status;

    OPAL_TRACE(1);

    status = orte_ras_base_allocate(jobid,
                                    &mca_rmgr_tbird_component.tbird_ras);
    return status;
}
|
||||
|
||||
/*
 * Release a job's resources through the component's RAS module.
 */
static int orte_rmgr_tbird_deallocate(orte_jobid_t jobid)
{
    int status;

    OPAL_TRACE(1);

    status = mca_rmgr_tbird_component.tbird_ras->deallocate(jobid);
    return status;
}
|
||||
|
||||
/*
 * Map a job's processes through the component's RMAPS module.
 */
static int orte_rmgr_tbird_map(orte_jobid_t jobid)
{
    int status;

    OPAL_TRACE(1);

    status = mca_rmgr_tbird_component.tbird_rmaps->map(jobid);
    return status;
}
|
||||
|
||||
/*
 * Launch a job through the component's PLS module.
 *
 * When the launch fails the error is logged and the job state is set
 * to ORTE_JOB_STATE_ABORTED.  If that state update itself fails, its
 * error code is logged and returned; otherwise the launch error code
 * is returned.
 */
static int orte_rmgr_tbird_launch(orte_jobid_t jobid)
{
    int launch_rc;
    int soh_rc;

    OPAL_TRACE(1);

    launch_rc = mca_rmgr_tbird_component.tbird_pls->launch(jobid);
    if (ORTE_SUCCESS == launch_rc) {
        return launch_rc;
    }

    ORTE_ERROR_LOG(launch_rc);

    soh_rc = orte_soh.set_job_soh(jobid, ORTE_JOB_STATE_ABORTED);
    if (ORTE_SUCCESS != soh_rc) {
        ORTE_ERROR_LOG(soh_rc);
        return soh_rc;
    }

    return launch_rc;
}
|
||||
|
||||
/*
 * Terminate a job.
 *
 * If this process is a singleton and the job being killed is its own,
 * the PLS cannot do anything useful, so the process simply exits with
 * status 1.  In every other case the request is forwarded to the PLS
 * module.
 */
static int orte_rmgr_tbird_terminate_job(orte_jobid_t jobid)
{
    orte_jobid_t own_jobid;
    int lookup_rc;

    OPAL_TRACE(1);

    lookup_rc = orte_ns.get_jobid(&own_jobid, orte_process_info.my_name);
    if (ORTE_SUCCESS == lookup_rc &&
        orte_process_info.singleton &&
        jobid == own_jobid) {
        exit(1);
    }

    return mca_rmgr_tbird_component.tbird_pls->terminate_job(jobid);
}
|
||||
|
||||
static int orte_rmgr_tbird_terminate_proc(const orte_process_name_t* proc_name)
|
||||
{
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if ((0 == orte_ns.compare(ORTE_NS_CMP_ALL, proc_name,
|
||||
orte_process_info.my_name)) &&
|
||||
(orte_process_info.singleton)) {
|
||||
/* if we're trying to get ourselves killed and we're a
|
||||
singleton, calling terminate_proc isn't going to work
|
||||
properly -- there's no pls setup properly for us. Just
|
||||
call exit and be done. */
|
||||
exit(1);
|
||||
}
|
||||
|
||||
return mca_rmgr_tbird_component.tbird_pls->terminate_proc(proc_name);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Deliver a signal to a job.
 *
 * A singleton signalling its own job already has the signal, so this
 * returns ORTE_SUCCESS without calling the PLS.  Otherwise the request
 * is forwarded to the PLS module.
 */
static int orte_rmgr_tbird_signal_job(orte_jobid_t jobid, int32_t signal)
{
    orte_jobid_t own_jobid;
    int lookup_rc;

    OPAL_TRACE(1);

    lookup_rc = orte_ns.get_jobid(&own_jobid, orte_process_info.my_name);
    if (ORTE_SUCCESS == lookup_rc &&
        orte_process_info.singleton &&
        jobid == own_jobid) {
        return ORTE_SUCCESS;
    }

    return mca_rmgr_tbird_component.tbird_pls->signal_job(jobid, signal);
}
|
||||
|
||||
static int orte_rmgr_tbird_signal_proc(const orte_process_name_t* proc_name, int32_t signal)
|
||||
{
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if ((0 == orte_ns.compare(ORTE_NS_CMP_ALL, proc_name,
|
||||
orte_process_info.my_name)) &&
|
||||
(orte_process_info.singleton)) {
|
||||
/** if we're trying to signal ourselves and we're a
|
||||
* singleton, calling signal_proc isn't going to work
|
||||
* properly -- there's no pls setup properly for us. Besides, we
|
||||
* already have the signal!
|
||||
*/
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
return mca_rmgr_tbird_component.tbird_pls->signal_proc(proc_name, signal);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Connect stdin forwarding for a job.
 *
 * Builds a process name for the job (cell 0, vpid 0) and pushes local
 * file descriptor 0 to it with a jobid-wide match.  Failures are
 * logged; nothing is returned to the caller.  The temporary process
 * name is released before returning.
 */
static void orte_rmgr_tbird_wireup_stdin(orte_jobid_t jobid)
{
    int rc;
    orte_process_name_t* name;

    OPAL_TRACE(1);

    if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&name, 0, jobid, 0))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    if (ORTE_SUCCESS != (rc = orte_iof.iof_push(name, ORTE_NS_CMP_JOBID, ORTE_IOF_STDIN, 0))) {
        ORTE_ERROR_LOG(rc);
    }

    /* The name was allocated above solely for this call; release it
       the same way the spawn path releases its name after use. */
    orte_ns.free_name(&name);
}
|
||||
|
||||
|
||||
/*
 * Registry notification handler that translates stage-gate counter
 * keys into process-state callbacks.
 *
 * cbdata carries the user's callback function (smuggled through a
 * void pointer).  The jobid is extracted from the segment name of the
 * first returned value; then, for every non-NULL value and every
 * keyval in it, the user's callback is invoked with the state that
 * corresponds to the keyval's key.  Keys that match none of the known
 * counters are ignored.
 */
static void orte_rmgr_tbird_callback(orte_gpr_notify_data_t *data, void *cbdata)
{
    /* ISO C forbids converting an object pointer to a function
       pointer directly; going through a union does the same thing
       without the compiler warning. */
    union {
        orte_rmgr_cb_fn_t func;
        void * ptr;
    } converter;
    orte_rmgr_cb_fn_t notify;
    orte_gpr_value_t **vals;
    orte_jobid_t jobid;
    size_t slot, seen, n;
    int rc;

    OPAL_TRACE(1);

    converter.ptr = cbdata;
    notify = converter.func;

    /* The subscriptions guarantee at least one value is returned, so
       the first value's segment name can be used to get the jobid. */
    vals = (orte_gpr_value_t**)(data->values)->addr;
    rc = orte_schema.extract_jobid_from_segment_name(&jobid,
                                                     vals[0]->segment);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* Walk the (possibly sparse) value array until data->cnt live
       entries have been seen or the array is exhausted. */
    seen = 0;
    for (slot = 0; seen < data->cnt && slot < (data->values)->size; slot++) {
        orte_gpr_value_t *entry = vals[slot];
        orte_gpr_keyval_t **kvs;

        if (NULL == entry) {
            continue;
        }
        seen++;

        /* determine the state change */
        kvs = entry->keyvals;
        for (n = 0; n < entry->cnt; n++) {
            orte_gpr_keyval_t *kv = kvs[n];

            if (0 == strcmp(kv->key, ORTE_PROC_NUM_AT_INIT)) {
                (*notify)(jobid, ORTE_PROC_STATE_INIT);
            } else if (0 == strcmp(kv->key, ORTE_PROC_NUM_LAUNCHED)) {
                (*notify)(jobid, ORTE_PROC_STATE_LAUNCHED);
            } else if (0 == strcmp(kv->key, ORTE_PROC_NUM_RUNNING)) {
                (*notify)(jobid, ORTE_PROC_STATE_RUNNING);
            } else if (0 == strcmp(kv->key, ORTE_PROC_NUM_AT_STG1)) {
                (*notify)(jobid, ORTE_PROC_STATE_AT_STG1);
            } else if (0 == strcmp(kv->key, ORTE_PROC_NUM_AT_STG2)) {
                (*notify)(jobid, ORTE_PROC_STATE_AT_STG2);
            } else if (0 == strcmp(kv->key, ORTE_PROC_NUM_AT_STG3)) {
                (*notify)(jobid, ORTE_PROC_STATE_AT_STG3);
            } else if (0 == strcmp(kv->key, ORTE_PROC_NUM_FINALIZED)) {
                (*notify)(jobid, ORTE_PROC_STATE_FINALIZED);
            } else if (0 == strcmp(kv->key, ORTE_PROC_NUM_TERMINATED)) {
                (*notify)(jobid, ORTE_PROC_STATE_TERMINATED);
            } else if (0 == strcmp(kv->key, ORTE_PROC_NUM_ABORTED)) {
                (*notify)(jobid, ORTE_PROC_STATE_ABORTED);
            }
        }
    }
}
|
||||
|
||||
|
||||
/**
|
||||
* define a callback point for completing the wireup of the stdin for io forwarding
|
||||
*/
|
||||
static void orte_rmgr_tbird_wireup_callback(orte_gpr_notify_data_t *data, void *cbdata)
{
    orte_gpr_value_t **vals;
    orte_jobid_t jobid;
    int status;

    OPAL_TRACE(1);

    /* The subscriptions guarantee at least one returned value; its
     * segment name identifies the job whose stdin must be wired up.
     * cbdata is not used by this handler.
     */
    vals = (orte_gpr_value_t**)(data->values)->addr;
    status = orte_schema.extract_jobid_from_segment_name(&jobid,
                                                         vals[0]->segment);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return;
    }

    orte_rmgr_tbird_wireup_stdin(jobid);
}
|
||||
|
||||
/*
|
||||
* Shortcut for the multiple steps involved in spawning a new job.
|
||||
*/
|
||||
|
||||
|
||||
static int orte_rmgr_tbird_spawn(
|
||||
orte_app_context_t** app_context,
|
||||
size_t num_context,
|
||||
orte_jobid_t* jobid,
|
||||
orte_rmgr_cb_fn_t cbfunc,
|
||||
orte_proc_state_t cb_conditions)
|
||||
{
|
||||
int rc;
|
||||
orte_process_name_t* name;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
/*
|
||||
* Perform resource discovery.
|
||||
*/
|
||||
if (mca_rmgr_tbird_component.tbird_rds == false &&
|
||||
ORTE_SUCCESS != (rc = orte_rds_base_query())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
} else {
|
||||
mca_rmgr_tbird_component.tbird_rds = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize job segment and allocate resources
|
||||
*/ /* JJH Insert C/N mapping stuff here */
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_rmgr_tbird_create(app_context,num_context,jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_rmgr_tbird_allocate(*jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_rmgr_tbird_map(*jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* setup I/O forwarding
|
||||
*/
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&name, 0, *jobid, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_iof.iof_pull(name, ORTE_NS_CMP_JOBID, ORTE_IOF_STDOUT, 1))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_iof.iof_pull(name, ORTE_NS_CMP_JOBID, ORTE_IOF_STDERR, 2))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* setup the launch system's stage gate counters and subscriptions */
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_rmgr_base_proc_stage_gate_init(*jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/** setup the subscription so we can complete the wireup when all processes reach LAUNCHED */
|
||||
rc = orte_rmgr_base_proc_stage_gate_subscribe(*jobid, orte_rmgr_tbird_wireup_callback, NULL, ORTE_PROC_STATE_LAUNCHED);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* setup callback
|
||||
*/
|
||||
|
||||
if(NULL != cbfunc) {
|
||||
union {
|
||||
orte_rmgr_cb_fn_t func;
|
||||
void * ptr;
|
||||
} cbfunc_union;
|
||||
void *cbdata;
|
||||
|
||||
/* stupid ISO C forbids conversion of object pointer to function
|
||||
pointer. So we do this, which is the same thing, but without
|
||||
the warning from GCC */
|
||||
cbfunc_union.func = cbfunc;
|
||||
cbdata = cbfunc_union.ptr;
|
||||
|
||||
rc = orte_rmgr_base_proc_stage_gate_subscribe(*jobid, orte_rmgr_tbird_callback, cbdata, cb_conditions);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* launch the job
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rmgr_tbird_launch(*jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
orte_ns.free_name(&name);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int orte_rmgr_tbird_finalize(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
/**
|
||||
* Finalize Process Launch Subsystem (PLS)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_finalize())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finalize Resource Mapping Subsystem (RMAPS)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_finalize())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finalize Resource Allocation Subsystem (RAS)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_finalize())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finalize Resource Discovery Subsystem (RDS)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rds_base_finalize())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Cancel pending receive. */
|
||||
|
||||
orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_RMGR_SVC);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -1,60 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Universal Resource Manager (tbird)
|
||||
*/
|
||||
#ifndef ORTE_RMGR_tbird_H
|
||||
#define ORTE_RMGR_tbird_H
|
||||
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* tbird component structure -- add some stuff beyond what is in the
|
||||
* normal rmgr component.
|
||||
*/
|
||||
struct orte_rmgr_tbird_component_t {
|
||||
/** Base rmgr component */
|
||||
orte_rmgr_base_component_t super;
|
||||
/** Has RDS query been called */
|
||||
bool tbird_rds;
|
||||
/** Selected ras module */
|
||||
orte_ras_base_module_t *tbird_ras;
|
||||
/** Selected rmaps module */
|
||||
orte_rmaps_base_module_t *tbird_rmaps;
|
||||
/** Selected pls module */
|
||||
orte_pls_base_module_t *tbird_pls;
|
||||
};
|
||||
/** Convenience typedef */
|
||||
typedef struct orte_rmgr_tbird_component_t orte_rmgr_tbird_component_t;
|
||||
|
||||
/** Global tbird component */
|
||||
OMPI_COMP_EXPORT extern orte_rmgr_tbird_component_t mca_rmgr_tbird_component;
|
||||
/** Global tbird module */
|
||||
OMPI_COMP_EXPORT extern orte_rmgr_base_module_t orte_rmgr_tbird_module;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
@ -1,251 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "orte/dss/dss_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/rds/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "rmgr_tbird.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_rmgr_tbird_open(void);
|
||||
static int orte_rmgr_tbird_close(void);
|
||||
static orte_rmgr_base_module_t* orte_rmgr_tbird_init(int *priority);
|
||||
|
||||
|
||||
orte_rmgr_tbird_component_t mca_rmgr_tbird_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta
|
||||
information about the component itself */
|
||||
|
||||
{
|
||||
/* Indicate that we are an rmgr v1.0.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
|
||||
ORTE_RMGR_BASE_VERSION_1_0_0,
|
||||
|
||||
"tbird", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_rmgr_tbird_open, /* component open */
|
||||
orte_rmgr_tbird_close /* component close */
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* Whether the component is checkpointable or not */
|
||||
false
|
||||
},
|
||||
|
||||
orte_rmgr_tbird_init
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_rmgr_tbird_open(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/**
|
||||
* Open Resource Discovery Subsystem (RDS)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rds_base_open())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Open Resource Allocation Subsystem (RAS)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_open())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Open Resource Mapping Subsystem (RMAPS)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_open())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Open Process Launch Subsystem (PLS)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_open())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static void orte_rmgr_tbird_recv(
|
||||
int status,
|
||||
orte_process_name_t* peer,
|
||||
orte_buffer_t* req,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
int rc;
|
||||
orte_buffer_t rsp;
|
||||
OBJ_CONSTRUCT(&rsp, orte_buffer_t);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_rmgr_base_cmd_dispatch(req,&rsp))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
rc = orte_rml.send_buffer(peer, &rsp, ORTE_RML_TAG_RMGR_CLNT, 0);
|
||||
if (rc < 0) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OBJ_DESTRUCT(&rsp);
|
||||
}
|
||||
|
||||
|
||||
static orte_rmgr_base_module_t *orte_rmgr_tbird_init(int* priority)
|
||||
{
|
||||
int rc;
|
||||
char* pls = NULL;
|
||||
if(orte_process_info.seed == false) {
|
||||
/* if we are bootproxy - need to be selected */
|
||||
int id = mca_base_param_register_int("rmgr","bootproxy","jobid",NULL,0);
|
||||
int jobid = 0;
|
||||
mca_base_param_lookup_int(id,&jobid);
|
||||
if(jobid == 0) {
|
||||
return NULL;
|
||||
}
|
||||
/* use fork pls for bootproxy */
|
||||
id = mca_base_param_register_string("rmgr","bootproxy","pls",NULL,"fork");
|
||||
mca_base_param_lookup_string(id,&pls);
|
||||
}
|
||||
|
||||
/**
|
||||
* Select RDS components.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rds_base_select())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return NULL;
|
||||
}
|
||||
mca_rmgr_tbird_component.tbird_rds = false;
|
||||
|
||||
/**
|
||||
* Find available RAS components
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_find_available())) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* Select RMAPS component
|
||||
*/
|
||||
if (NULL == (mca_rmgr_tbird_component.tbird_rmaps = orte_rmaps_base_select(NULL))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* Select PLS component
|
||||
*/
|
||||
if (NULL == (mca_rmgr_tbird_component.tbird_pls = orte_pls_base_select(pls))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Post non-blocking receive */
|
||||
|
||||
if (0 > (rc = orte_rml.recv_buffer_nb(
|
||||
ORTE_RML_NAME_ANY,
|
||||
ORTE_RML_TAG_RMGR_SVC,
|
||||
ORTE_RML_PERSISTENT,
|
||||
orte_rmgr_tbird_recv,
|
||||
NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*priority = 100;
|
||||
return &orte_rmgr_tbird_module;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
static int orte_rmgr_tbird_close(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/**
|
||||
* Close Process Launch Subsystem (PLS)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_close())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close Resource Mapping Subsystem (RMAPS)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_close())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close Resource Allocation Subsystem (RAS)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_close())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close Resource Discovery Subsystem (RDS)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rds_base_close())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
Загрузка…
x
Ссылка в новой задаче
Block a user