1
1

After more discussion on the phone, it seems easier to not muck around

in special components but rather go down to a /tmp branch.  So
removing these components and I'll branch next.

This commit was SVN r10771.
Этот коммит содержится в:
Jeff Squyres 2006-07-12 22:12:29 +00:00
родитель d00e6e29e8
Коммит ef8433a60b
16 изменённых файлов: 0 добавлений и 2022 удалений

Просмотреть файл

Просмотреть файл

@ -1,3 +0,0 @@
rhc
jsquyres
Ralph

Просмотреть файл

@ -1,56 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Preprocessor flags for this component (pls_tbird_CPPFLAGS is substituted
# by the component's configure macro).
AM_CPPFLAGS = $(pls_tbird_CPPFLAGS)
# Help-message catalog distributed and installed as package data.
dist_pkgdata_DATA = help-pls-tbird.txt
# Source files shared by both build flavors (DSO and static) below.
sources = \
pls_tbird.h \
pls_tbird_component.c \
pls_tbird_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of "component" / "lib" is non-empty, so only one of the
# two libtool libraries declared further down is actually built.
if OMPI_BUILD_pls_tbird_DSO
lib =
lib_sources =
component = mca_pls_tbird.la
component_sources = $(sources)
else
lib = libmca_pls_tbird.la
lib_sources = $(sources)
component =
component_sources =
endif
# DSO flavor: installed into $(libdir)/openmpi.
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component)
mca_pls_tbird_la_SOURCES = $(component_sources)
mca_pls_tbird_la_LDFLAGS = -module -avoid-version $(pls_tbird_LDFLAGS)
mca_pls_tbird_la_LIBADD = \
$(pls_tbird_LIBS) \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
# Static flavor: a non-installed convenience library.
noinst_LTLIBRARIES = $(lib)
libmca_pls_tbird_la_SOURCES = $(lib_sources)
libmca_pls_tbird_la_LDFLAGS = -module -avoid-version $(pls_tbird_LDFLAGS)
libmca_pls_tbird_la_LIBADD = $(pls_tbird_LIBS)

Просмотреть файл

@ -1,38 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_pls_tbird_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Runs OMPI_CHECK_TM with the pls_tbird prefix and records the outcome in
# pls_tbird_good (1 when the check succeeds, 0 otherwise).  On success the
# wrapper LDFLAGS / LIBS are copied from the pls_tbird values and the first
# argument is expanded; otherwise the second argument is expanded.
AC_DEFUN([MCA_pls_tbird_CONFIG],[
OMPI_CHECK_TM([pls_tbird], [pls_tbird_good=1], [pls_tbird_good=0])
# If the check worked, set the wrapper flags.
# Evaluate succeed / fail
AS_IF([test "$pls_tbird_good" = "1"],
[pls_tbird_WRAPPER_EXTRA_LDFLAGS="$pls_tbird_LDFLAGS"
pls_tbird_WRAPPER_EXTRA_LIBS="$pls_tbird_LIBS"
$1],
[$2])
# Set the build flags that the component Makefile uses
AC_SUBST([pls_tbird_CPPFLAGS])
AC_SUBST([pls_tbird_LDFLAGS])
AC_SUBST([pls_tbird_LIBS])
])dnl

Просмотреть файл

@ -1,22 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# File named as this component's init file.
# NOTE(review): the component Makefile.am lists pls_tbird_component.c
# without a src/ prefix -- confirm that this path is correct.
PARAM_INIT_FILE=src/pls_tbird_component.c
# Files named as this component's config files.
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,44 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
[multiple-prefixes]
Multiple different --prefix options were specified to mpirun for the
same node. This is a fatal error for the TM (PBS / Torque) process
starter in Open MPI.
The first two prefix values supplied for node %s were:
%s
and %s
#
[daemon-not-found]
The TM (PBS / Torque) process starter in Open MPI was unable to find
its daemon executable (orted) on the node where mpirun was executed.
This sanity check is performed because the back-end PBS / Torque
process launcher does not provide any kind of error to Open MPI if it
tries to launch its daemon on a remote node, but the daemon cannot be
found. Open MPI's check for the daemon locally is somewhat of a lame
workaround / sanity check.
If you do not understand this error message, please try the following:
1. Try to add the Open MPI executables to your PATH
2. Use the --prefix option to mpirun to indicate where Open MPI can
find its executables
3. Set the MCA parameter "pls_tbird_want_path_check" to 0
4. Talk to your local system administrator

Просмотреть файл

@ -1,52 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
 * Public declarations for the "tbird" pls component.
 *
 * The include guard is named after this component (tbird) so that it
 * cannot collide with a guard of the same name in another component's
 * header.
 */
#ifndef ORTE_PLS_TBIRD_EXPORT_H
#define ORTE_PLS_TBIRD_EXPORT_H

#include "orte_config.h"

#include "opal/mca/mca.h"
#include "orte/mca/pls/pls.h"

#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif

/*
 * Component structure: the base pls component plus the values of the
 * MCA parameters registered when the component is opened.
 */
struct orte_pls_tbird_component_t {
    /* Base pls component; must remain the first member */
    orte_pls_base_component_t super;
    /* Selection priority reported by the component init function */
    int priority;
    /* Non-zero enables debugging output */
    int debug;
    /* Non-zero enables verbose output */
    int verbose;
    /* Whether to look for the orted executable in the PATH before
       launching */
    bool want_path_check;
    /* Command used to start the proxy orted */
    char *orted;
    /* NULL-terminated list of "PATH=..." strings in which the orted
       has already been found */
    char **checked_paths;
};
typedef struct orte_pls_tbird_component_t orte_pls_tbird_component_t;

/* Globally exported variables */
OMPI_COMP_EXPORT extern orte_pls_tbird_component_t mca_pls_tbird_component;
extern orte_pls_base_module_1_0_0_t orte_pls_tbird_module;

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

#endif /* ORTE_PLS_TBIRD_EXPORT_H */

Просмотреть файл

@ -1,144 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "orte/orte_constants.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/pls/base/base.h"
#include "pls_tbird.h"
/*
 * Public string showing the tbird pls component version number
 * (the ORTE_VERSION literal is appended at compile time)
 */
const char *mca_pls_tbird_component_version_string =
"Open MPI tbird pls MCA component version " ORTE_VERSION;
/*
 * Local functions: the component's open, close, and init entry points,
 * all referenced from the mca_pls_tbird_component instance
 */
static int pls_tbird_open(void);
static int pls_tbird_close(void);
static struct orte_pls_base_module_1_0_0_t *pls_tbird_init(int *priority);
/*
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it.
 *
 * Only the leading "super" member is initialized here; the remaining
 * members (priority, debug, verbose, want_path_check, orted,
 * checked_paths) are filled in by pls_tbird_open().
 */
orte_pls_tbird_component_t mca_pls_tbird_component = {
{
/* First, the mca_component_t struct containing meta information
about the component itself */
{
/* Indicate that we are a pls v1.0.0 component (which also
implies a specific MCA version) */
ORTE_PLS_BASE_VERSION_1_0_0,
/* Component name and version */
"tbird",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
pls_tbird_open,
pls_tbird_close,
},
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
true
},
/* Initialization / querying functions */
pls_tbird_init
}
};
/*
 * Component open function: register this component's MCA parameters
 * (debug, verbose, priority, orted, want_path_check), store their
 * values in mca_pls_tbird_component, and reset the cache of
 * already-checked paths.
 *
 * Always returns ORTE_SUCCESS.
 */
static int pls_tbird_open(void)
{
    /* Seeded with the registered default so that want_path_check is
       well defined even if the registration call does not store a
       value */
    int want_check = (int) true;
    mca_base_component_t *comp = &mca_pls_tbird_component.super.pls_version;

    mca_base_param_reg_int(comp, "debug", "Enable debugging of the TBIRD pls",
                           false, false, 0, &mca_pls_tbird_component.debug);
    mca_base_param_reg_int(comp, "verbose", "Enable verbose output of the TBIRD pls",
                           false, false, 0, &mca_pls_tbird_component.verbose);
    mca_base_param_reg_int(comp, "priority", "Default selection priority",
                           false, false, 75, &mca_pls_tbird_component.priority);
    mca_base_param_reg_string(comp, "orted",
                              "Command to use to start proxy orted",
                              false, false, "orted",
                              &mca_pls_tbird_component.orted);

    /* The parameter is registered as an int and converted to a bool
       here; any non-zero value enables the check */
    mca_base_param_reg_int(comp, "want_path_check",
                           "Whether the launching process should check for the pls_tbird_orted executable in the PATH before launching (the TM API does not give an indication of failure; this is a somewhat-lame workaround; non-zero values enable this check)",
                           false, false, (int) true, &want_check);
    mca_pls_tbird_component.want_path_check = (0 != want_check);

    mca_pls_tbird_component.checked_paths = NULL;

    return ORTE_SUCCESS;
}
/*
 * Component close function: release the cache of already-checked
 * paths.
 *
 * The pointer is reset after it is freed so that a second close (or
 * any later reader of checked_paths) cannot touch freed memory.
 *
 * Always returns ORTE_SUCCESS.
 */
static int pls_tbird_close(void)
{
    if (NULL != mca_pls_tbird_component.checked_paths) {
        opal_argv_free(mca_pls_tbird_component.checked_paths);
        mca_pls_tbird_component.checked_paths = NULL;
    }
    return ORTE_SUCCESS;
}
/*
 * Component init / query function.
 *
 * The component offers itself for selection only when both the
 * PBS_ENVIRONMENT and PBS_JOBID environment variables are set.  In
 * that case *priority receives the configured priority and the tbird
 * module is returned; otherwise a message is written to the pls
 * output stream and NULL is returned (*priority is left untouched).
 */
static struct orte_pls_base_module_1_0_0_t *pls_tbird_init(int *priority)
{
    /* Not running under a TM job?  Then we cannot be selected. */
    if (NULL == getenv("PBS_ENVIRONMENT") ||
        NULL == getenv("PBS_JOBID")) {
        opal_output(orte_pls_base.pls_output,
                    "pls:tbird: NOT available for selection");
        return NULL;
    }

    *priority = mca_pls_tbird_component.priority;
    return &orte_pls_tbird_module;
}

Просмотреть файл

@ -1,717 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#if HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>
#ifdef HAVE_SCHED_H
#include <sched.h>
#endif
#include <errno.h>
#include <tbird.h>
#include "opal/install_dirs.h"
#include "opal/event/event.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/show_help.h"
#include "opal/util/path.h"
#include "opal/util/basename.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/runtime/opal_progress.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/sds/base/base.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/ns/ns.h"
#include "pls_tbird.h"
/*
 * Local functions
 */
/* Entry points exported through orte_pls_tbird_module */
static int pls_tbird_launch(orte_jobid_t jobid);
static int pls_tbird_terminate_job(orte_jobid_t jobid);
static int pls_tbird_terminate_proc(const orte_process_name_t *name);
static int pls_tbird_signal_job(orte_jobid_t jobid, int32_t signal);
static int pls_tbird_signal_proc(const orte_process_name_t *name, int32_t signal);
static int pls_tbird_finalize(void);
/* Helpers used by pls_tbird_launch() */
static int pls_tbird_connect(void);
static int pls_tbird_disconnect(void);
static int pls_tbird_start_proc(char *nodename, int argc, char **argv, char **env);
static int pls_tbird_check_path(char *exe, char **env);
/*
 * Global variable: the pls v1.0.0 module returned by the component's
 * init function.  The initializer is positional, so the order of the
 * entries must follow the member order of orte_pls_base_module_1_0_0_t.
 */
orte_pls_base_module_1_0_0_t orte_pls_tbird_module = {
pls_tbird_launch,
pls_tbird_terminate_job,
pls_tbird_terminate_proc,
pls_tbird_signal_job,
pls_tbird_signal_proc,
pls_tbird_finalize
};
extern char **environ;
/*
 * Launch one daemon (orted) on every not-yet-launched node that is
 * mapped to the given job.
 *
 * Steps:
 *  - fetch the job's mapping and count the mapped nodes;
 *  - reserve a vpid range for the daemons;
 *  - build the daemon command line once, leaving two placeholder
 *    slots (process name, node name) that are rewritten per node;
 *  - connect to TM, then for every map build an environment and spawn
 *    a daemon on each node of that map;
 *  - poll TM once per spawned daemon.
 *
 * Every exit goes through the cleanup label, which disconnects from
 * TM (if connected) and releases the mapping, the argv / env arrays
 * and all temporary strings.
 *
 * Returns ORTE_SUCCESS, or the first error encountered.
 */
static int
pls_tbird_launch(orte_jobid_t jobid)
{
    opal_list_t mapping;
    opal_list_item_t *m_item, *n_item;
    size_t num_nodes;
    orte_vpid_t vpid;
    int node_name_index;
    int proc_name_index;
    char *jobid_string = NULL;
    char *uri, *param;
    char **argv = NULL;
    char **env = NULL;
    orte_process_name_t *name = NULL;
    int argc;
    int rc;
    bool connected = false;
    int launched = 0, i;
    char *bin_base = NULL, *lib_base = NULL;

    /* Query the list of nodes allocated and mapped to this job.
     * We need the entire mapping for a couple of reasons:
     *  - need the prefix to start with.
     *  - need to know if we are launching on a subset of the allocated nodes
     */
    OBJ_CONSTRUCT(&mapping, opal_list_t);
    rc = orte_rmaps_base_get_map(jobid, &mapping);
    if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }

    num_nodes = 0;
    for (m_item = opal_list_get_first(&mapping);
         m_item != opal_list_get_end(&mapping);
         m_item = opal_list_get_next(m_item)) {
        orte_rmaps_base_map_t *map = (orte_rmaps_base_map_t *) m_item;
        num_nodes += opal_list_get_size(&map->nodes);
    }

    /*
     * Allocate a range of vpids for the daemons.
     */
    if (num_nodes == 0) {
        /* Go through cleanup so that the mapping is released */
        rc = ORTE_ERR_BAD_PARAM;
        goto cleanup;
    }
    rc = orte_ns.reserve_range(0, num_nodes, &vpid);
    if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }

    /* need integer value for command line parameter */
    asprintf(&jobid_string, "%lu", (unsigned long) jobid);

    /* add the daemon command (as specified by user) */
    argv = opal_argv_split(mca_pls_tbird_component.orted, ' ');
    argc = opal_argv_count(argv);

    opal_argv_append(&argc, &argv, "--no-daemonize");

    /* check for debug flags */
    orte_pls_base_proxy_mca_argv(&argc, &argv);

    /* proxy information */
    opal_argv_append(&argc, &argv, "--bootproxy");
    opal_argv_append(&argc, &argv, jobid_string);
    opal_argv_append(&argc, &argv, "--name");
    /* placeholder slot, rewritten for every node below */
    proc_name_index = argc;
    opal_argv_append(&argc, &argv, "");

    /* tell the daemon how many procs are in the daemon's job */
    opal_argv_append(&argc, &argv, "--num_procs");
    asprintf(&param, "%lu", (unsigned long)(vpid + num_nodes));
    opal_argv_append(&argc, &argv, param);
    free(param);

    /* tell the daemon the starting vpid of the daemon's job */
    opal_argv_append(&argc, &argv, "--vpid_start");
    opal_argv_append(&argc, &argv, "0");

    opal_argv_append(&argc, &argv, "--nodename");
    /* placeholder slot, rewritten for every node below */
    node_name_index = argc;
    opal_argv_append(&argc, &argv, "");

    /* pass along the universe name and location info */
    opal_argv_append(&argc, &argv, "--universe");
    asprintf(&param, "%s@%s:%s", orte_universe_info.uid,
             orte_universe_info.host, orte_universe_info.name);
    opal_argv_append(&argc, &argv, param);
    free(param);

    /* setup ns contact info */
    opal_argv_append(&argc, &argv, "--nsreplica");
    if (NULL != orte_process_info.ns_replica_uri) {
        uri = strdup(orte_process_info.ns_replica_uri);
    } else {
        uri = orte_rml.get_uri();
    }
    asprintf(&param, "\"%s\"", uri);
    opal_argv_append(&argc, &argv, param);
    free(uri);
    free(param);

    /* setup gpr contact info */
    opal_argv_append(&argc, &argv, "--gprreplica");
    if (NULL != orte_process_info.gpr_replica_uri) {
        uri = strdup(orte_process_info.gpr_replica_uri);
    } else {
        uri = orte_rml.get_uri();
    }
    asprintf(&param, "\"%s\"", uri);
    opal_argv_append(&argc, &argv, param);
    free(uri);
    free(param);

    if (mca_pls_tbird_component.debug) {
        param = opal_argv_join(argv, ' ');
        if (NULL != param) {
            opal_output(0, "pls:tbird: final top-level argv:");
            opal_output(0, "pls:tbird:     %s", param);
            free(param);
        }
    }

    rc = pls_tbird_connect();
    if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }
    connected = true;

    /* Figure out the basenames for the libdir and bindir.  There is a
       lengthy comment about this in pls_rsh_module.c explaining all
       the rationale for how / why we're doing this. */
    lib_base = opal_basename(OPAL_LIBDIR);
    bin_base = opal_basename(OPAL_BINDIR);

    /*
     * iterate through each of the contexts
     */
    for (m_item = opal_list_get_first(&mapping);
         m_item != opal_list_get_end(&mapping);
         m_item = opal_list_get_next(m_item)) {
        orte_rmaps_base_map_t *map = (orte_rmaps_base_map_t *) m_item;
        char *var;

        /* setup environment (one private copy per map; released at the
           end of this iteration or in cleanup) */
        env = opal_argv_copy(environ);
        var = mca_base_param_environ_variable("seed", NULL, NULL);
        opal_setenv(var, "0", true, &env);
        free(var);

        /* If we have a prefix, then modify the PATH and
           LD_LIBRARY_PATH environment variables.  */
        if (NULL != map->app->prefix_dir) {
            char *newenv;

            for (i = 0; NULL != env && NULL != env[i]; ++i) {
                /* Reset PATH */
                if (0 == strncmp("PATH=", env[i], 5)) {
                    asprintf(&newenv, "%s/%s:%s",
                             map->app->prefix_dir, bin_base, env[i] + 5);
                    if (mca_pls_tbird_component.debug) {
                        opal_output(0, "pls:tbird: resetting PATH: %s",
                                    newenv);
                    }
                    opal_setenv("PATH", newenv, true, &env);
                    free(newenv);
                }

                /* Reset LD_LIBRARY_PATH */
                else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) {
                    asprintf(&newenv, "%s/%s:%s",
                             map->app->prefix_dir, lib_base, env[i] + 16);
                    if (mca_pls_tbird_component.debug) {
                        opal_output(0, "pls:tbird: resetting LD_LIBRARY_PATH: %s",
                                    newenv);
                    }
                    opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
                    free(newenv);
                }
            }
        }

        /* Do a quick sanity check to ensure that we can find the
           orted in the PATH */
        if (ORTE_SUCCESS !=
            (rc = pls_tbird_check_path(argv[0], env))) {
            ORTE_ERROR_LOG(rc);
            opal_show_help("help-pls-tbird.txt", "daemon-not-found",
                           true, argv[0]);
            goto cleanup;
        }

        /* Iterate through each of the nodes and spin
         * up a daemon.
         */
        for (n_item = opal_list_get_first(&map->nodes);
             n_item != opal_list_get_end(&map->nodes);
             n_item = opal_list_get_next(n_item)) {
            orte_rmaps_base_node_t *rmaps_node = (orte_rmaps_base_node_t *) n_item;
            orte_ras_node_t *node = rmaps_node->node;
            char *name_string;
            char *node_copy;

            /* already launched on this node */
            if (0 != node->node_launched++) {
                continue;
            }

            /* setup node name: store a private copy so that every argv
               entry stays owned by argv and can be freed with it */
            node_copy = strdup(node->node_name);
            if (NULL == node_copy) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            free(argv[node_name_index]);
            argv[node_name_index] = node_copy;

            /* initialize daemons process name */
            rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }

            /* setup per-node options */
            if (mca_pls_tbird_component.debug ||
                mca_pls_tbird_component.verbose) {
                opal_output(0, "pls:tbird: launching on node %s",
                            node->node_name);
            }

            /* setup process name */
            rc = orte_ns.get_proc_name_string(&name_string, name);
            if (ORTE_SUCCESS != rc) {
                opal_output(0, "pls:tbird: unable to create process name");
                goto cleanup;
            }
            /* argv takes ownership of name_string (presumably
               heap-allocated by the name service -- TODO confirm);
               the previous slot contents are released first */
            free(argv[proc_name_index]);
            argv[proc_name_index] = name_string;

            /* set the progress engine schedule for this node.
             * if node_slots is set to zero, then we default to
             * NOT being oversubscribed
             */
            if (node->node_slots > 0 &&
                opal_list_get_size(&rmaps_node->node_procs) > node->node_slots) {
                if (mca_pls_tbird_component.debug) {
                    /* casts make the arguments match the %d conversions */
                    opal_output(0, "pls:tbird: oversubscribed -- setting mpi_yield_when_idle to 1 (%d %d)",
                                (int) node->node_slots,
                                (int) opal_list_get_size(&rmaps_node->node_procs));
                }
                var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
                opal_setenv(var, "1", true, &env);
            } else {
                if (mca_pls_tbird_component.debug) {
                    opal_output(0, "pls:tbird: not oversubscribed -- setting mpi_yield_when_idle to 0");
                }
                var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
                opal_setenv(var, "0", true, &env);
            }
            free(var);

            /* save the daemons name on the node */
            if (ORTE_SUCCESS != (rc = orte_pls_base_proxy_set_node_name(node, jobid, name))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }

            /* exec the daemon */
            if (mca_pls_tbird_component.debug) {
                param = opal_argv_join(argv, ' ');
                if (NULL != param) {
                    opal_output(0, "pls:tbird: executing: %s", param);
                    free(param);
                }
            }

            rc = pls_tbird_start_proc(node->node_name, argc, argv, env);
            if (ORTE_SUCCESS != rc) {
                opal_output(0, "pls:tbird: start_procs returned error %d", rc);
                goto cleanup;
            }

            launched++;
            vpid++;
            free(name);
            name = NULL;
            opal_event_loop(OPAL_EVLOOP_NONBLOCK);
        }

        /* done with this map's environment */
        if (NULL != env) {
            opal_argv_free(env);
            env = NULL;
        }
    }

    /* loop through all those that are launched and poll for
       completion status */
    for (i = 0; i < launched; i++) {
        int ret, local_err;
        tm_event_t event;

        ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
        if (TM_SUCCESS != ret) {
            errno = local_err;
            opal_output(0, "pls:tbird: failed to start a proc error %d", ret);
            /* report the failure instead of returning the stale
               ORTE_SUCCESS left over from the last spawn */
            rc = ORTE_ERROR;
            goto cleanup;
        }
    }

 cleanup:
    if (connected) {
        pls_tbird_disconnect();
    }

    while (NULL != (m_item = opal_list_remove_first(&mapping))) {
        OBJ_RELEASE(m_item);
    }
    OBJ_DESTRUCT(&mapping);

    if (NULL != name) {
        free(name);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != jobid_string) {
        free(jobid_string);
    }
    if (NULL != lib_base) {
        free(lib_base);
    }
    if (NULL != bin_base) {
        free(bin_base);
    }

    return rc;
}
/*
 * Terminate a whole job by delegating to the pls base proxy
 * implementation; its return code is passed through unchanged.
 */
static int
pls_tbird_terminate_job(orte_jobid_t jobid)
{
    int rc;

    rc = orte_pls_base_proxy_terminate_job(jobid);
    return rc;
}
/*
 * Terminating a single process is not supported: TM can't kill
 * individual processes -- PBS will kill the entire job.  A note is
 * written to the pls output stream and ORTE_ERR_NOT_SUPPORTED is
 * returned; the name argument is not used.
 */
static int
pls_tbird_terminate_proc(const orte_process_name_t *name)
{
    const int rc = ORTE_ERR_NOT_SUPPORTED;

    opal_output(orte_pls_base.pls_output,
                "pls:tbird:terminate_proc: not supported");
    return rc;
}
/*
 * Deliver a signal to a whole job by delegating to the pls base
 * proxy implementation; its return code is passed through unchanged.
 */
static int
pls_tbird_signal_job(orte_jobid_t jobid, int32_t signal)
{
    int rc;

    rc = orte_pls_base_proxy_signal_job(jobid, signal);
    return rc;
}
/*
 * Deliver a signal to a single process by delegating to the pls base
 * proxy implementation; its return code is passed through unchanged.
 */
static int
pls_tbird_signal_proc(const orte_process_name_t *name, int32_t signal)
{
    int rc;

    rc = orte_pls_base_proxy_signal_proc(name, signal);
    return rc;
}
/*
 * Module finalize: cancel any receive still posted from any peer on
 * the ORTE_RML_TAG_RMGR_CLNT tag.  The result of the cancel is not
 * inspected; ORTE_SUCCESS is always returned.
 */
static int
pls_tbird_finalize(void)
{
    const int rc = ORTE_SUCCESS;

    /* drop whatever receives are still pending */
    orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_RMGR_CLNT);

    return rc;
}
/*
 * Connect to TM.
 *
 * tm_init() is attempted up to 10 times.  After each failed attempt
 * opal_progress() is driven 10 times (yielding the CPU in between
 * when sched_yield() is available) before the next attempt.
 *
 * Returns ORTE_SUCCESS once tm_init() succeeds, or
 * ORTE_ERR_RESOURCE_BUSY if every attempt failed.
 */
static int
pls_tbird_connect(void)
{
    int ret;
    struct tm_roots tm_root;
    int count, progress;

    /* try a couple times to connect - might get busy signals every
       now and then */
    for (count = 0; count < 10; ++count) {
        /* pass the tm_roots structure declared above (the previous
           code referred to an undeclared identifier here) */
        ret = tm_init(NULL, &tm_root);
        if (TM_SUCCESS == ret) {
            return ORTE_SUCCESS;
        }

        for (progress = 0; progress < 10; ++progress) {
            opal_progress();
#if HAVE_SCHED_YIELD
            sched_yield();
#endif
        }
    }

    return ORTE_ERR_RESOURCE_BUSY;
}
/*
 * Disconnect from TM by calling tm_finalize().  Its result is not
 * inspected; ORTE_SUCCESS is always returned.
 */
static int
pls_tbird_disconnect(void)
{
    const int rc = ORTE_SUCCESS;

    tm_finalize();

    return rc;
}
/* Cached TM node information, filled in by query_tbird_hostnames(). */
/* Hostnames of the TM nodes (argv-style array) */
static char **tbird_hostnames = NULL;
/* TM node IDs as returned by tm_nodeinfo() */
static tm_node_id *tbird_node_ids = NULL;
/* Number of entries in tbird_hostnames and in tbird_node_ids, respectively */
static int num_tbird_hostnames, num_node_ids;
/* we don't call this anymore */
/*
* For a given TM node ID, get the string hostname corresponding to
* it.
*/
static char*
get_tbird_hostname(tbird_node_id node)
{
char *hostname;
char buffer[256];
int ret, local_errno;
tm_event_t event;
char **argv;
/* Get the info string corresponding to this TM node ID */
ret = tm_rescinfo(node, buffer, sizeof(buffer) - 1, &event);
if (TM_SUCCESS != ret) {
return NULL;
}
/* Now wait for that event to happen */
ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
if (TM_SUCCESS != ret) {
return NULL;
}
/* According to the TM man page, we get back a space-separated
string array. The hostname is the second item. Use a cheap
trick to get it. */
buffer[sizeof(buffer) - 1] = '\0';
argv = opal_argv_split(buffer, ' ');
if (NULL == argv) {
return NULL;
}
hostname = strdup(argv[1]);
opal_argv_free(argv);
/* All done */
return hostname;
}
/*
 * Ask TM for the node IDs allocated to this PBS job and resolve each
 * one to a hostname, filling the file-level tbird_node_ids /
 * tbird_hostnames caches.  Called by do_tbird_resolve().
 *
 * Returns ORTE_SUCCESS, or ORTE_ERR_NOT_FOUND if the node list or any
 * hostname cannot be obtained (the hostname cache is left empty in
 * that case).
 */
static int
query_tbird_hostnames(void)
{
    char *h;
    int i, ret;

    /* Get the list of nodes allocated in this PBS job */
    ret = tm_nodeinfo(&tbird_node_ids, &num_node_ids);
    if (TM_SUCCESS != ret) {
        return ORTE_ERR_NOT_FOUND;
    }

    /* TM "nodes" may actually correspond to PBS "VCPUs", which means
       there may be multiple "TM nodes" that correspond to the same
       physical node.  This doesn't really affect what we're doing
       here (we actually ignore the fact that they're duplicates --
       slightly inefficient, but no big deal); just mentioned for
       completeness... */
    tbird_hostnames = NULL;
    num_tbird_hostnames = 0;
    for (i = 0; i < num_node_ids; ++i) {
        h = get_tbird_hostname(tbird_node_ids[i]);
        if (NULL == h) {
            /* do_tbird_resolve() uses the same index for both arrays,
               so an entry cannot simply be skipped: give up and leave
               the cache empty */
            if (NULL != tbird_hostnames) {
                opal_argv_free(tbird_hostnames);
                tbird_hostnames = NULL;
            }
            num_tbird_hostnames = 0;
            return ORTE_ERR_NOT_FOUND;
        }
        opal_argv_append(&num_tbird_hostnames, &tbird_hostnames, h);
        free(h);
    }

    /* All done */
    return ORTE_SUCCESS;
}
/*
 * Translate a hostname into its TM node ID.  Called by
 * pls_tbird_start_proc().
 *
 * The hostname / node ID caches are filled on first use through
 * query_tbird_hostnames().  On success *tnodeid receives the ID of
 * the first cached hostname that compares equal to the argument.
 *
 * Returns ORTE_SUCCESS, the error from query_tbird_hostnames(), or
 * ORTE_ERR_NOT_FOUND if the hostname is not in the cache.
 */
static int
do_tbird_resolve(char *hostname, tm_node_id *tnodeid)
{
    int i, ret;

    /* Have we already queried TM for all the node info? */
    if (NULL == tbird_hostnames) {
        ret = query_tbird_hostnames();
        if (ORTE_SUCCESS != ret) {
            return ret;
        }
    }

    /* Find the TM ID of the hostname that we're looking for */
    for (i = 0; i < num_tbird_hostnames; ++i) {
        if (0 == strcmp(hostname, tbird_hostnames[i])) {
            *tnodeid = tbird_node_ids[i];
            opal_output(orte_pls_base.pls_output,
                        "pls:tbird:launch: resolved host %s to node ID %d",
                        hostname, tbird_node_ids[i]);
            break;
        }
    }

    /* All done */
    if (i < num_tbird_hostnames) {
        ret = ORTE_SUCCESS;
    } else {
        ret = ORTE_ERR_NOT_FOUND;
    }

    return ret;
}
/*
 * Spawn one process through TM on the named node.
 *
 * The node name is first resolved to a TM node ID; a resolution
 * failure is returned as-is.  A tm_spawn() failure is reported as
 * ORTE_ERROR.  The task ID and event produced by tm_spawn() are not
 * used by this function.
 */
static int
pls_tbird_start_proc(char *nodename, int argc, char **argv, char **env)
{
    tm_node_id target;
    tm_task_id spawned_task;
    tm_event_t spawn_event;
    int rc;

    /* get the tbird node id for this node */
    rc = do_tbird_resolve(nodename, &target);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    rc = tm_spawn(argc, argv, env, target, &spawned_task, &spawn_event);
    if (TM_SUCCESS != rc) {
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}
/*
 * Sanity check: can the executable "exe" be found using the PATH of
 * the supplied environment?
 *
 * The check is skipped (ORTE_SUCCESS) when the want_path_check
 * parameter is disabled.  "PATH=..." strings for which the executable
 * was already found are cached in the component's checked_paths list
 * so that the filesystem is searched only once per distinct PATH; an
 * environment without a PATH entry is cached under the string "NULL".
 *
 * Returns ORTE_SUCCESS if found, ORTE_ERR_NOT_FOUND if not,
 * ORTE_ERR_OUT_OF_RESOURCE on allocation failure, or
 * ORTE_ERR_IN_ERRNO if getcwd() fails (errno holds the cause).
 */
static int pls_tbird_check_path(char *exe, char **env)
{
    /* Remembered across calls so that a cwd buffer size that was
       large enough once is used right away next time */
    static int size = 256;
    int i;
    int rc = ORTE_SUCCESS;
    char *file;
    char *cwd = NULL;
    char *path = NULL;

    /* Do we want this check at all? */
    if (!mca_pls_tbird_component.want_path_check) {
        return ORTE_SUCCESS;
    }

    /* Find the path in the supplied environment */
    for (i = 0; NULL != env[i]; ++i) {
        if (0 == strncmp("PATH=", env[i], 5)) {
            break;
        }
    }
    path = strdup(NULL != env[i] ? env[i] : "NULL");
    if (NULL == path) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* Check the already-successful paths (i.e., be a little
       friendlier to the filesystem -- if we find the executable
       successfully, save it) */
    for (i = 0; NULL != mca_pls_tbird_component.checked_paths &&
             NULL != mca_pls_tbird_component.checked_paths[i]; ++i) {
        if (0 == strcmp(path, mca_pls_tbird_component.checked_paths[i])) {
            goto done;
        }
    }

    /* We didn't already find it, so check now.  First, get the cwd,
       doubling the buffer for as long as getcwd() reports ERANGE. */
    do {
        cwd = malloc(size);
        if (NULL == cwd) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto done;
        }
        if (NULL != getcwd(cwd, size)) {
            break;
        } else {
            /* free() may modify errno, so capture it first */
            int saved_errno = errno;

            free(cwd);
            cwd = NULL;
            if (ERANGE == saved_errno) {
                size *= 2;
            } else {
                errno = saved_errno;
                rc = ORTE_ERR_IN_ERRNO;
                goto done;
            }
        }
    } while (1);

    /* Now do the search */
    file = opal_path_findv(exe, X_OK, env, cwd);
    free(cwd);
    if (NULL == file) {
        rc = ORTE_ERR_NOT_FOUND;
        goto done;
    }
    if (mca_pls_tbird_component.debug) {
        opal_output(0, "pls:tbird: found %s", file);
    }
    free(file);

    /* Success -- so cache it (our own copy of the string is released
       below, exactly as on every other exit path) */
    opal_argv_append_nosize(&mca_pls_tbird_component.checked_paths, path);

 done:
    /* All done */
    free(path);
    return rc;
}

Просмотреть файл

Просмотреть файл

@ -1,3 +0,0 @@
rhc
jsquyres
Ralph

Просмотреть файл

@ -1,51 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Use the top-level Makefile.options
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty, so
# only one of the two libtool libraries declared below is built.
if OMPI_BUILD_rmgr_tbird_DSO
component_noinst =
component_install = mca_rmgr_tbird.la
else
component_noinst = libmca_rmgr_tbird.la
component_install =
endif
# Source files shared by both build flavors.
proxy_SOURCES = \
rmgr_tbird.c \
rmgr_tbird.h \
rmgr_tbird_component.c
# DSO flavor: installed into $(libdir)/openmpi.
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_rmgr_tbird_la_SOURCES = $(proxy_SOURCES)
mca_rmgr_tbird_la_LIBADD = \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
mca_rmgr_tbird_la_LDFLAGS = -module -avoid-version
# Static flavor: a non-installed convenience library.
noinst_LTLIBRARIES = $(component_noinst)
libmca_rmgr_tbird_la_SOURCES = $(proxy_SOURCES)
libmca_rmgr_tbird_la_LIBADD =
libmca_rmgr_tbird_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,24 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module: the file named as the component's init file
# and the files named as its config files.
PARAM_INIT_FILE=rmgr_tbird.c
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,557 +0,0 @@
/*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/util/trace.h"
#include "orte/orte_constants.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rds/base/base.h"
#include "orte/mca/ras/base/base.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/rmgr/tbird/rmgr_tbird.h"
/*
 * Local functions: the entry points collected in
 * orte_rmgr_tbird_module below.
 */
static int orte_rmgr_tbird_query(void);
static int orte_rmgr_tbird_create(
orte_app_context_t** app_context,
size_t num_context,
orte_jobid_t* jobid);
static int orte_rmgr_tbird_allocate(
orte_jobid_t jobid);
static int orte_rmgr_tbird_deallocate(
orte_jobid_t jobid);
static int orte_rmgr_tbird_map(
orte_jobid_t jobid);
static int orte_rmgr_tbird_launch(
orte_jobid_t jobid);
static int orte_rmgr_tbird_terminate_job(
orte_jobid_t jobid);
static int orte_rmgr_tbird_terminate_proc(
const orte_process_name_t* proc_name);
static int orte_rmgr_tbird_signal_job(
orte_jobid_t jobid, int32_t signal);
static int orte_rmgr_tbird_signal_proc(
const orte_process_name_t* proc_name,
int32_t signal);
static int orte_rmgr_tbird_spawn(
orte_app_context_t** app_context,
size_t num_context,
orte_jobid_t* jobid,
orte_rmgr_cb_fn_t cbfn,
orte_proc_state_t cb_conditions);
static int orte_rmgr_tbird_finalize(void);
/*
 * Function table exported as the tbird rmgr module.
 *
 * The initializer is positional, so the order of the entries has to
 * follow the member order of orte_rmgr_base_module_t (declared outside
 * this file).  The two stage-gate entries are not implemented here;
 * they point at the shared orte_rmgr_base_* functions.
 */
orte_rmgr_base_module_t orte_rmgr_tbird_module = {
orte_rmgr_tbird_query,
orte_rmgr_tbird_create,
orte_rmgr_tbird_allocate,
orte_rmgr_tbird_deallocate,
orte_rmgr_tbird_map,
orte_rmgr_tbird_launch,
orte_rmgr_tbird_terminate_job,
orte_rmgr_tbird_terminate_proc,
orte_rmgr_tbird_signal_job,
orte_rmgr_tbird_signal_proc,
orte_rmgr_tbird_spawn,
orte_rmgr_base_proc_stage_gate_init,
orte_rmgr_base_proc_stage_gate_mgr,
orte_rmgr_tbird_finalize
};
/*
 * Resource discovery: run the RDS base query and hand its status back
 * to the caller.  A failure is logged before it is returned.
 */
static int orte_rmgr_tbird_query(void)
{
    int status;

    OPAL_TRACE(1);

    status = orte_rds_base_query();
    if (ORTE_SUCCESS == status) {
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(status);
    return status;
}
/*
 * Create a new job: obtain a jobid from the name service, then create
 * and initialize the job segment by storing the application contexts
 * under that jobid.
 *
 * The new jobid is written to *jobid.  Returns ORTE_SUCCESS, or the
 * (already logged) error code of the step that failed.
 */
static int orte_rmgr_tbird_create(
    orte_app_context_t** app_context,
    size_t num_context,
    orte_jobid_t* jobid)
{
    int status;

    OPAL_TRACE(1);

    /* allocate a jobid */
    status = orte_ns.create_jobid(jobid);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* create and initialize job segment */ /* JJH C/N mapping before this */
    status = orte_rmgr_base_put_app_context(*jobid, app_context, num_context);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    return ORTE_SUCCESS;
}
/*
 * Allocate resources for a job by delegating to the RAS base.  The
 * address of the component's tbird_ras slot is passed along with the
 * jobid; the status of the base call is returned unchanged.
 */
static int orte_rmgr_tbird_allocate(orte_jobid_t jobid)
{
    int status;

    OPAL_TRACE(1);

    status = orte_ras_base_allocate(jobid,
                                    &mca_rmgr_tbird_component.tbird_ras);
    return status;
}
/*
 * Release the resources of a job through the RAS module stored in the
 * component and return that module's status.
 *
 * NOTE(review): tbird_ras is dereferenced without a NULL check.  The
 * component init function does not assign it; presumably it is filled
 * in by orte_ras_base_allocate() -- confirm that allocate always runs
 * before deallocate.
 */
static int orte_rmgr_tbird_deallocate(orte_jobid_t jobid)
{
    int status;

    OPAL_TRACE(1);

    status = mca_rmgr_tbird_component.tbird_ras->deallocate(jobid);
    return status;
}
/*
 * Map the job by delegating to the RMAPS module selected for this
 * component; the module's status is returned as-is.
 */
static int orte_rmgr_tbird_map(orte_jobid_t jobid)
{
    int status;

    OPAL_TRACE(1);

    status = mca_rmgr_tbird_component.tbird_rmaps->map(jobid);
    return status;
}
/*
 * Launch a job through the selected PLS module.
 *
 * If the launch fails, the failure is logged and the job state is set
 * to ORTE_JOB_STATE_ABORTED.  The launch error is returned, unless
 * recording the aborted state fails as well, in which case that second
 * error is logged and returned instead.
 */
static int orte_rmgr_tbird_launch(orte_jobid_t jobid)
{
    int launch_rc;
    int soh_rc;

    OPAL_TRACE(1);

    launch_rc = mca_rmgr_tbird_component.tbird_pls->launch(jobid);
    if (ORTE_SUCCESS == launch_rc) {
        return launch_rc;
    }

    ORTE_ERROR_LOG(launch_rc);

    /* record that the job did not come up */
    soh_rc = orte_soh.set_job_soh(jobid, ORTE_JOB_STATE_ABORTED);
    if (ORTE_SUCCESS != soh_rc) {
        ORTE_ERROR_LOG(soh_rc);
        return soh_rc;
    }

    return launch_rc;
}
/*
 * Terminate a job.
 *
 * Special case: when this process is a singleton and the job to kill
 * is its own, the PLS cannot do anything useful, so the process simply
 * exits with status 1.  Otherwise the request goes to the selected PLS
 * module and its status is returned.
 */
static int orte_rmgr_tbird_terminate_job(orte_jobid_t jobid)
{
    orte_jobid_t own_jobid;

    OPAL_TRACE(1);

    /* the self-check only applies when our own jobid could be looked up */
    if (ORTE_SUCCESS == orte_ns.get_jobid(&own_jobid,
                                          orte_process_info.my_name) &&
        orte_process_info.singleton &&
        jobid == own_jobid) {
        exit(1);
    }

    return mca_rmgr_tbird_component.tbird_pls->terminate_job(jobid);
}
/*
 * Terminate a single process.
 *
 * If the target is this very process and it is a singleton, there is
 * no PLS set up properly for it, so the process just exits with
 * status 1.  In every other case the selected PLS module handles the
 * request and its status is returned.
 */
static int orte_rmgr_tbird_terminate_proc(const orte_process_name_t* proc_name)
{
    int targets_self;

    OPAL_TRACE(1);

    targets_self = (0 == orte_ns.compare(ORTE_NS_CMP_ALL, proc_name,
                                         orte_process_info.my_name));
    if (targets_self && orte_process_info.singleton) {
        exit(1);
    }

    return mca_rmgr_tbird_component.tbird_pls->terminate_proc(proc_name);
}
/*
 * Deliver a signal to a job.
 *
 * A singleton signalling its own job already has the signal, and the
 * PLS could not do anything for it, so ORTE_SUCCESS is returned right
 * away.  Otherwise the selected PLS module delivers the signal and its
 * status is returned.
 */
static int orte_rmgr_tbird_signal_job(orte_jobid_t jobid, int32_t signal)
{
    orte_jobid_t own_jobid;

    OPAL_TRACE(1);

    /* the self-check only applies when our own jobid could be looked up */
    if (ORTE_SUCCESS == orte_ns.get_jobid(&own_jobid,
                                          orte_process_info.my_name) &&
        orte_process_info.singleton &&
        jobid == own_jobid) {
        return ORTE_SUCCESS;
    }

    return mca_rmgr_tbird_component.tbird_pls->signal_job(jobid, signal);
}
/*
 * Deliver a signal to a single process.
 *
 * A singleton signalling itself has no PLS set up properly and already
 * has the signal, so ORTE_SUCCESS is returned without doing anything.
 * Otherwise the selected PLS module delivers the signal and its status
 * is returned.
 */
static int orte_rmgr_tbird_signal_proc(const orte_process_name_t* proc_name, int32_t signal)
{
    int targets_self;

    OPAL_TRACE(1);

    targets_self = (0 == orte_ns.compare(ORTE_NS_CMP_ALL, proc_name,
                                         orte_process_info.my_name));
    if (targets_self && orte_process_info.singleton) {
        return ORTE_SUCCESS;
    }

    return mca_rmgr_tbird_component.tbird_pls->signal_proc(proc_name, signal);
}
/*
 * Connect stdin forwarding for a job.
 *
 * A process name is created for the job (cell 0, vpid 0) and used to
 * push ORTE_IOF_STDIN, matched on the jobid, from file descriptor 0.
 * Errors are logged; nothing is reported to the caller.
 */
static void orte_rmgr_tbird_wireup_stdin(orte_jobid_t jobid)
{
    int rc;
    orte_process_name_t* name;

    OPAL_TRACE(1);

    if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&name, 0, jobid, 0))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    if (ORTE_SUCCESS != (rc = orte_iof.iof_push(name, ORTE_NS_CMP_JOBID, ORTE_IOF_STDIN, 0))) {
        ORTE_ERROR_LOG(rc);
    }

    /* The name was created above and is owned by this function; release
       it on both the success and the failure path of iof_push, the same
       way the spawn function releases its name after setting up I/O. */
    orte_ns.free_name(&name);
}
/*
 * Registry notification handler used for user callbacks registered via
 * spawn.
 *
 * cbdata carries the user's orte_rmgr_cb_fn_t (smuggled through a
 * void*).  The jobid is taken from the segment name of the first
 * returned value; then every keyval of every non-NULL value is
 * inspected, and for each key that names one of the process-state
 * counters the user callback is invoked with the jobid and the
 * corresponding process state.
 */
static void orte_rmgr_tbird_callback(orte_gpr_notify_data_t *data, void *cbdata)
{
    /* ISO C forbids converting an object pointer to a function pointer
       directly; going through a union does the same thing without the
       GCC warning */
    union {
        orte_rmgr_cb_fn_t func;
        void * ptr;
    } converter;
    orte_rmgr_cb_fn_t notify;
    orte_gpr_value_t **values, *value;
    orte_gpr_keyval_t **keyvals;
    orte_jobid_t jobid;
    size_t idx, kv, seen;
    int rc;

    OPAL_TRACE(1);

    converter.ptr = cbdata;
    notify = converter.func;

    /* the subscriptions guarantee that at least one value is returned,
     * so the jobid can be read from the segment name of the first one
     */
    values = (orte_gpr_value_t**)(data->values)->addr;
    rc = orte_schema.extract_jobid_from_segment_name(&jobid,
                                                     values[0]->segment);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* "seen" counts the non-NULL entries visited so far; stop once all
       data->cnt of them have been handled or the array is exhausted */
    for (idx = 0, seen = 0;
         seen < data->cnt && idx < (data->values)->size;
         idx++) {
        value = values[idx];
        if (NULL == value) {
            continue;
        }
        seen++;

        /* determine the state change */
        keyvals = value->keyvals;
        for (kv = 0; kv < value->cnt; kv++) {
            const char *key = keyvals[kv]->key;

            if (0 == strcmp(key, ORTE_PROC_NUM_AT_INIT)) {
                (*notify)(jobid, ORTE_PROC_STATE_INIT);
            } else if (0 == strcmp(key, ORTE_PROC_NUM_LAUNCHED)) {
                (*notify)(jobid, ORTE_PROC_STATE_LAUNCHED);
            } else if (0 == strcmp(key, ORTE_PROC_NUM_RUNNING)) {
                (*notify)(jobid, ORTE_PROC_STATE_RUNNING);
            } else if (0 == strcmp(key, ORTE_PROC_NUM_AT_STG1)) {
                (*notify)(jobid, ORTE_PROC_STATE_AT_STG1);
            } else if (0 == strcmp(key, ORTE_PROC_NUM_AT_STG2)) {
                (*notify)(jobid, ORTE_PROC_STATE_AT_STG2);
            } else if (0 == strcmp(key, ORTE_PROC_NUM_AT_STG3)) {
                (*notify)(jobid, ORTE_PROC_STATE_AT_STG3);
            } else if (0 == strcmp(key, ORTE_PROC_NUM_FINALIZED)) {
                (*notify)(jobid, ORTE_PROC_STATE_FINALIZED);
            } else if (0 == strcmp(key, ORTE_PROC_NUM_TERMINATED)) {
                (*notify)(jobid, ORTE_PROC_STATE_TERMINATED);
            } else if (0 == strcmp(key, ORTE_PROC_NUM_ABORTED)) {
                (*notify)(jobid, ORTE_PROC_STATE_ABORTED);
            }
        }
    }
}
/**
 * Registry notification handler that completes the stdin wireup for
 * I/O forwarding.  It reads the jobid from the segment name of the
 * first returned value and wires up stdin for that job; cbdata is not
 * used.
 */
static void orte_rmgr_tbird_wireup_callback(orte_gpr_notify_data_t *data, void *cbdata)
{
    orte_gpr_value_t **values;
    orte_jobid_t jobid;
    int rc;

    OPAL_TRACE(1);

    /* the subscriptions guarantee that at least one value is returned,
     * so the first entry can be used to identify the job
     */
    values = (orte_gpr_value_t**)(data->values)->addr;

    rc = orte_schema.extract_jobid_from_segment_name(&jobid, values[0]->segment);
    if (ORTE_SUCCESS == rc) {
        orte_rmgr_tbird_wireup_stdin(jobid);
        return;
    }

    ORTE_ERROR_LOG(rc);
}
/*
 * Shortcut for the multiple steps involved in spawning a new job.
 *
 * Steps, in order: resource discovery (only until it has succeeded
 * once), job creation, allocation, mapping, stdout/stderr forwarding
 * setup, stage-gate initialization, subscription for the stdin wireup
 * at ORTE_PROC_STATE_LAUNCHED, optional subscription of the caller's
 * callback, and finally the launch itself.
 *
 * app_context / num_context  application contexts describing the job
 * jobid                      receives the jobid of the new job
 * cbfunc                     optional callback (may be NULL) invoked on
 *                            the process states in cb_conditions
 * cb_conditions              states for which cbfunc is subscribed
 *
 * Returns ORTE_SUCCESS, or the (already logged) error code of the first
 * step that failed.  The temporary process name used for the I/O
 * forwarding setup is released on every exit path once it has been
 * created.
 */
static int orte_rmgr_tbird_spawn(
    orte_app_context_t** app_context,
    size_t num_context,
    orte_jobid_t* jobid,
    orte_rmgr_cb_fn_t cbfunc,
    orte_proc_state_t cb_conditions)
{
    int rc;
    orte_process_name_t* name = NULL;

    OPAL_TRACE(1);

    /*
     * Perform resource discovery.  The query is skipped once it has
     * succeeded; the flag is only set when no failure occurred.
     */
    if (mca_rmgr_tbird_component.tbird_rds == false &&
        ORTE_SUCCESS != (rc = orte_rds_base_query())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    } else {
        mca_rmgr_tbird_component.tbird_rds = true;
    }

    /*
     * Initialize job segment and allocate resources
     */ /* JJH Insert C/N mapping stuff here */
    if (ORTE_SUCCESS !=
        (rc = orte_rmgr_tbird_create(app_context,num_context,jobid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_rmgr_tbird_allocate(*jobid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_rmgr_tbird_map(*jobid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /*
     * setup I/O forwarding
     */
    if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&name, 0, *jobid, 0))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* From here on "name" is owned by this function, so every failure
       leaves through the cleanup label instead of returning directly. */
    if (ORTE_SUCCESS != (rc = orte_iof.iof_pull(name, ORTE_NS_CMP_JOBID, ORTE_IOF_STDOUT, 1))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    if (ORTE_SUCCESS != (rc = orte_iof.iof_pull(name, ORTE_NS_CMP_JOBID, ORTE_IOF_STDERR, 2))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* setup the launch system's stage gate counters and subscriptions */
    if (ORTE_SUCCESS !=
        (rc = orte_rmgr_base_proc_stage_gate_init(*jobid))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /** setup the subscription so we can complete the wireup when all processes reach LAUNCHED */
    rc = orte_rmgr_base_proc_stage_gate_subscribe(*jobid, orte_rmgr_tbird_wireup_callback, NULL, ORTE_PROC_STATE_LAUNCHED);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /*
     * setup callback
     */
    if (NULL != cbfunc) {
        union {
            orte_rmgr_cb_fn_t func;
            void * ptr;
        } cbfunc_union;
        void *cbdata;

        /* ISO C forbids conversion of a function pointer to an object
           pointer.  Going through a union does the same thing, but
           without the warning from GCC */
        cbfunc_union.func = cbfunc;
        cbdata = cbfunc_union.ptr;

        rc = orte_rmgr_base_proc_stage_gate_subscribe(*jobid, orte_rmgr_tbird_callback, cbdata, cb_conditions);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    }

    /*
     * launch the job
     */
    if (ORTE_SUCCESS != (rc = orte_rmgr_tbird_launch(*jobid))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    rc = ORTE_SUCCESS;

cleanup:
    orte_ns.free_name(&name);
    return rc;
}
/*
 * Finalize the module: shut down the PLS, RMAPS, RAS and RDS base
 * subsystems in that order, then cancel the pending receive on the
 * ORTE_RML_TAG_RMGR_SVC tag.
 *
 * The first subsystem that reports an error stops the sequence; its
 * error is logged and returned, and the remaining steps (including the
 * receive cancellation) are not performed.
 */
static int orte_rmgr_tbird_finalize(void)
{
    int status;

    OPAL_TRACE(1);

    /* Process Launch Subsystem (PLS) */
    status = orte_pls_base_finalize();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* Resource Mapping Subsystem (RMAPS) */
    status = orte_rmaps_base_finalize();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* Resource Allocation Subsystem (RAS) */
    status = orte_ras_base_finalize();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* Resource Discovery Subsystem (RDS) */
    status = orte_rds_base_finalize();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* Cancel pending receive. */
    orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_RMGR_SVC);

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,60 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Universal Resource Manager (tbird)
*/
#ifndef ORTE_RMGR_tbird_H
#define ORTE_RMGR_tbird_H
#include "orte/mca/rmgr/rmgr.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**
 * tbird component structure.  It extends the plain rmgr component with
 * the state this component keeps about the other subsystems it drives.
 * The convenience typedef of the same name is declared together with
 * the struct.
 */
typedef struct orte_rmgr_tbird_component_t {
    /** Base rmgr component; first member of the structure */
    orte_rmgr_base_component_t super;
    /** True once the RDS query has been run */
    bool tbird_rds;
    /** RAS module in use */
    orte_ras_base_module_t *tbird_ras;
    /** RMAPS module in use */
    orte_rmaps_base_module_t *tbird_rmaps;
    /** PLS module in use */
    orte_pls_base_module_t *tbird_pls;
} orte_rmgr_tbird_component_t;

/** The single tbird component instance */
OMPI_COMP_EXPORT extern orte_rmgr_tbird_component_t mca_rmgr_tbird_component;

/** The tbird module (rmgr function table) */
OMPI_COMP_EXPORT extern orte_rmgr_base_module_t orte_rmgr_tbird_module;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -1,251 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/util/proc_info.h"
#include "opal/util/output.h"
#include "orte/dss/dss_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rds/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/ras/base/base.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/rml/rml.h"
#include "rmgr_tbird.h"
/*
 * Local functions: the component's open, close and init entry points.
 * They are declared here so the component structure below can
 * reference them before they are defined.
 */
static int orte_rmgr_tbird_open(void);
static int orte_rmgr_tbird_close(void);
static orte_rmgr_base_module_t* orte_rmgr_tbird_init(int *priority);
/*
 * The tbird component instance.  Only the embedded base component
 * ("super") is initialized explicitly; the tbird-specific members that
 * follow it are left to default (zero) initialization.
 */
orte_rmgr_tbird_component_t mca_rmgr_tbird_component = {
{
/* First, the mca_base_component_t struct containing meta
information about the component itself */
{
/* Indicate that we are a rmgr v1.0.0 component (which also
implies a specific MCA version) */
ORTE_RMGR_BASE_VERSION_1_0_0,
"tbird", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_rmgr_tbird_open, /* component open */
orte_rmgr_tbird_close /* component close */
},
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
false
},
/* component init function */
orte_rmgr_tbird_init
}
};
/**
 * Component open function: opens the RDS, RAS, RMAPS and PLS base
 * subsystems, in that order.
 *
 * The first subsystem that fails to open stops the sequence; its error
 * is logged and returned.  ORTE_SUCCESS is returned when all four
 * opened.
 */
static int orte_rmgr_tbird_open(void)
{
    int status;

    /* Resource Discovery Subsystem (RDS) */
    status = orte_rds_base_open();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* Resource Allocation Subsystem (RAS) */
    status = orte_ras_base_open();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* Resource Mapping Subsystem (RMAPS) */
    status = orte_rmaps_base_open();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* Process Launch Subsystem (PLS) */
    status = orte_pls_base_open();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    return ORTE_SUCCESS;
}
/*
 * Receive callback for requests arriving on the rmgr service tag.
 *
 * The request buffer is handed to the rmgr base command dispatcher,
 * which fills a response buffer; on success that response is sent back
 * to the peer on ORTE_RML_TAG_RMGR_CLNT.  Dispatch and send failures
 * are logged.  The response buffer is destructed in every case.  The
 * status, tag and cbdata arguments are not used.
 */
static void orte_rmgr_tbird_recv(
    int status,
    orte_process_name_t* peer,
    orte_buffer_t* req,
    orte_rml_tag_t tag,
    void* cbdata)
{
    orte_buffer_t reply;
    int rc;

    OBJ_CONSTRUCT(&reply, orte_buffer_t);

    rc = orte_rmgr_base_cmd_dispatch(req, &reply);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    } else {
        /* only a negative return from the send is treated as an error */
        rc = orte_rml.send_buffer(peer, &reply, ORTE_RML_TAG_RMGR_CLNT, 0);
        if (rc < 0) {
            ORTE_ERROR_LOG(rc);
        }
    }

    OBJ_DESTRUCT(&reply);
}
/*
 * Component init function: decides whether this component can be used
 * and, if so, prepares it.
 *
 * On a non-seed process the component is only usable when the
 * "rmgr_bootproxy_jobid" parameter is non-zero; in that case the PLS
 * to select is taken from "rmgr_bootproxy_pls" (default "fork").
 * Afterwards the RDS components are selected, the available RAS
 * components are located, one RMAPS and one PLS module are selected
 * and stored in the component, and a persistent non-blocking receive
 * is posted on ORTE_RML_TAG_RMGR_SVC.
 *
 * priority   receives 100 when the component is usable
 *
 * Returns the tbird module, or NULL when the component is not to be
 * used or any setup step failed (failures are logged).
 */
static orte_rmgr_base_module_t *orte_rmgr_tbird_init(int* priority)
{
    int rc;
    char* pls = NULL;

    if (orte_process_info.seed == false) {
        /* if we are bootproxy - need to be selected */
        int id = mca_base_param_register_int("rmgr","bootproxy","jobid",NULL,0);
        int jobid = 0;
        mca_base_param_lookup_int(id,&jobid);
        if (jobid == 0) {
            return NULL;
        }

        /* use fork pls for bootproxy */
        id = mca_base_param_register_string("rmgr","bootproxy","pls",NULL,"fork");
        /* NOTE(review): the string stored in "pls" is never released in
           this function -- confirm who owns it after the lookup and
           after orte_pls_base_select() has used it. */
        mca_base_param_lookup_string(id,&pls);
    }

    /**
     * Select RDS components.
     */
    if (ORTE_SUCCESS != (rc = orte_rds_base_select())) {
        ORTE_ERROR_LOG(rc);
        return NULL;
    }
    /* no RDS query has been run yet; spawn performs it on first use */
    mca_rmgr_tbird_component.tbird_rds = false;

    /**
     * Find available RAS components
     */
    if (ORTE_SUCCESS != (rc = orte_ras_base_find_available())) {
        /* log the code that was actually returned rather than a fixed
           ORTE_ERR_NOT_FOUND, so the real cause is not lost */
        ORTE_ERROR_LOG(rc);
        return NULL;
    }

    /**
     * Select RMAPS component
     */
    if (NULL == (mca_rmgr_tbird_component.tbird_rmaps = orte_rmaps_base_select(NULL))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return NULL;
    }

    /**
     * Select PLS component
     */
    if (NULL == (mca_rmgr_tbird_component.tbird_pls = orte_pls_base_select(pls))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return NULL;
    }

    /* Post non-blocking receive */
    if (0 > (rc = orte_rml.recv_buffer_nb(
                 ORTE_RML_NAME_ANY,
                 ORTE_RML_TAG_RMGR_SVC,
                 ORTE_RML_PERSISTENT,
                 orte_rmgr_tbird_recv,
                 NULL))) {
        ORTE_ERROR_LOG(rc);
        return NULL;
    }

    *priority = 100;
    return &orte_rmgr_tbird_module;
}
/**
 * Component close function: closes the PLS, RMAPS, RAS and RDS base
 * subsystems, i.e. in the reverse of the order used when opening them.
 *
 * The first subsystem that fails to close stops the sequence; its
 * error is logged and returned.  ORTE_SUCCESS is returned when all
 * four closed.
 */
static int orte_rmgr_tbird_close(void)
{
    int status;

    /* Process Launch Subsystem (PLS) */
    status = orte_pls_base_close();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* Resource Mapping Subsystem (RMAPS) */
    status = orte_rmaps_base_close();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* Resource Allocation Subsystem (RAS) */
    status = orte_ras_base_close();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* Resource Discovery Subsystem (RDS) */
    status = orte_rds_base_close();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    return ORTE_SUCCESS;
}