diff --git a/orte/mca/pls/tbird/.ompi_ignore b/orte/mca/pls/tbird/.ompi_ignore deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/orte/mca/pls/tbird/.ompi_unignore b/orte/mca/pls/tbird/.ompi_unignore deleted file mode 100644 index 7681be23d1..0000000000 --- a/orte/mca/pls/tbird/.ompi_unignore +++ /dev/null @@ -1,3 +0,0 @@ -rhc -jsquyres -Ralph diff --git a/orte/mca/pls/tbird/Makefile.am b/orte/mca/pls/tbird/Makefile.am deleted file mode 100644 index 3b8dd7e649..0000000000 --- a/orte/mca/pls/tbird/Makefile.am +++ /dev/null @@ -1,56 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -AM_CPPFLAGS = $(pls_tbird_CPPFLAGS) - -dist_pkgdata_DATA = help-pls-tbird.txt - -sources = \ - pls_tbird.h \ - pls_tbird_component.c \ - pls_tbird_module.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if OMPI_BUILD_pls_tbird_DSO -lib = -lib_sources = -component = mca_pls_tbird.la -component_sources = $(sources) -else -lib = libmca_pls_tbird.la -lib_sources = $(sources) -component = -component_sources = -endif - -mcacomponentdir = $(libdir)/openmpi -mcacomponent_LTLIBRARIES = $(component) -mca_pls_tbird_la_SOURCES = $(component_sources) -mca_pls_tbird_la_LDFLAGS = -module -avoid-version $(pls_tbird_LDFLAGS) -mca_pls_tbird_la_LIBADD = \ - $(pls_tbird_LIBS) \ - $(top_ompi_builddir)/orte/liborte.la \ - $(top_ompi_builddir)/opal/libopal.la - -noinst_LTLIBRARIES = $(lib) -libmca_pls_tbird_la_SOURCES = $(lib_sources) -libmca_pls_tbird_la_LDFLAGS = -module -avoid-version $(pls_tbird_LDFLAGS) -libmca_pls_tbird_la_LIBADD = $(pls_tbird_LIBS) diff --git a/orte/mca/pls/tbird/configure.m4 b/orte/mca/pls/tbird/configure.m4 deleted file mode 100644 index efc08a5f96..0000000000 --- a/orte/mca/pls/tbird/configure.m4 +++ /dev/null @@ -1,38 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_pls_tbird_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_pls_tbird_CONFIG],[ - OMPI_CHECK_TM([pls_tbird], [pls_tbird_good=1], [pls_tbird_good=0]) - - # if check worked, set wrapper flags if so. - # Evaluate succeed / fail - AS_IF([test "$pls_tbird_good" = "1"], - [pls_tbird_WRAPPER_EXTRA_LDFLAGS="$pls_tbird_LDFLAGS" - pls_tbird_WRAPPER_EXTRA_LIBS="$pls_tbird_LIBS" - $1], - [$2]) - - # set build flags to use in makefile - AC_SUBST([pls_tbird_CPPFLAGS]) - AC_SUBST([pls_tbird_LDFLAGS]) - AC_SUBST([pls_tbird_LIBS]) -])dnl diff --git a/orte/mca/pls/tbird/configure.params b/orte/mca/pls/tbird/configure.params deleted file mode 100644 index 39549177b2..0000000000 --- a/orte/mca/pls/tbird/configure.params +++ /dev/null @@ -1,22 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -PARAM_INIT_FILE=src/pls_tbird_component.c -PARAM_CONFIG_FILES="Makefile" diff --git a/orte/mca/pls/tbird/help-pls-tbird.txt b/orte/mca/pls/tbird/help-pls-tbird.txt deleted file mode 100644 index d543ab7b5b..0000000000 --- a/orte/mca/pls/tbird/help-pls-tbird.txt +++ /dev/null @@ -1,44 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -[multiple-prefixes] -Multiple different --prefix options were specified to mpirun for the -same node. This is a fatal error for the TM (PBS / Torque) process -starter in Open MPI. - -The first two prefix values supplied for node %s were: - %s -and %s -# -[daemon-not-found] -The TM (PBS / Torqus) process starter in Open MPI was unable to find -its daemon executable (orted) on the node where mpirun was executed. - -This sanity check is performed because the back-end PBS / Torque -process launcher does not provide any kind of error to Open MPI if it -tries to launch its daemon on a remote node, but the daemon cannot be -found. Open MPI's check for the daemon locally is somewhat of a lame -workaround / sanity check. - -If you do not understand this error mesage, please try the following: - -1. Try to add the Open MPI executables to your PATH -2. Use the --prefix option to mpirun to indicate where Open MPI can - find its executables -3. Set the MCA parameter "pls_tm_want_path_check" to 0 -4. Talk to your local system administration diff --git a/orte/mca/pls/tbird/pls_tbird.h b/orte/mca/pls/tbird/pls_tbird.h deleted file mode 100644 index 8b8b854505..0000000000 --- a/orte/mca/pls/tbird/pls_tbird.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef ORTE_PLS_TM_EXPORT_H -#define ORTE_PLS_TM_EXPORT_H - -#include "orte_config.h" - -#include "opal/mca/mca.h" -#include "orte/mca/pls/pls.h" - -#if defined(c_plusplus) || defined(__cplusplus) -extern "C" { -#endif - - struct orte_pls_tbird_component_t { - orte_pls_base_component_t super; - int priority; - int debug; - int verbose; - bool want_path_check; - char *orted; - char **checked_paths; - }; - typedef struct orte_pls_tbird_component_t orte_pls_tbird_component_t; - - /* Globally exported variables */ - OMPI_COMP_EXPORT extern orte_pls_tbird_component_t mca_pls_tbird_component; - extern orte_pls_base_module_1_0_0_t orte_pls_tbird_module; - - - -#if defined(c_plusplus) || defined(__cplusplus) -} -#endif -#endif /* ORTE_PLS_TM_EXPORT_H */ diff --git a/orte/mca/pls/tbird/pls_tbird_component.c b/orte/mca/pls/tbird/pls_tbird_component.c deleted file mode 100644 index 0aed6ad52d..0000000000 --- a/orte/mca/pls/tbird/pls_tbird_component.c +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * These symbols are in a file by themselves to provide nice linker - * semantics. Since linkers generally pull in symbols by object - * files, keeping these symbols as the only symbols in this file - * prevents utility programs such as "ompi_info" from having to import - * entire components just to query their version and parameters. - */ - -#include "orte_config.h" - -#include "opal/mca/base/mca_base_param.h" -#include "opal/util/output.h" -#include "opal/util/argv.h" -#include "orte/orte_constants.h" -#include "orte/mca/pls/pls.h" -#include "orte/mca/pls/base/base.h" -#include "pls_tbird.h" - - -/* - * Public string showing the pls ompi_tbird component version number - */ -const char *mca_pls_tbird_component_version_string = - "Open MPI tbird pls MCA component version " ORTE_VERSION; - - - -/* - * Local function - */ -static int pls_tbird_open(void); -static int pls_tbird_close(void); -static struct orte_pls_base_module_1_0_0_t *pls_tbird_init(int *priority); - - -/* - * Instantiate the public struct with all of our public information - * and pointers to our public functions in it - */ - -orte_pls_tbird_component_t mca_pls_tbird_component = { - { - /* First, the mca_component_t struct containing meta information - about the component itself */ - - { - /* Indicate that we are a pls v1.0.0 component (which also - implies a specific MCA version) */ - ORTE_PLS_BASE_VERSION_1_0_0, - - /* Component name and version */ - "tbird", - ORTE_MAJOR_VERSION, - ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION, - - /* Component open and close functions */ - pls_tbird_open, - pls_tbird_close, - }, - - /* Next the MCA v1.0.0 component meta data */ - { - /* Whether the component is checkpointable or not */ - true - }, - - /* Initialization / querying functions */ - pls_tbird_init - } -}; - - -static int pls_tbird_open(void) -{ - int tbirdp; - mca_base_component_t *comp = &mca_pls_tbird_component.super.pls_version; - - mca_base_param_reg_int(comp, "debug", "Enable debugging of the TBIRD pls", - false, false, 0, &mca_pls_tbird_component.debug); - mca_base_param_reg_int(comp, "verbose", "Enable verbose output of the TBIRD pls", - false, false, 0, &mca_pls_tbird_component.verbose); - - mca_base_param_reg_int(comp, "priority", "Default selection priority", - false, false, 75, &mca_pls_tbird_component.priority); - - mca_base_param_reg_string(comp, "orted", - "Command to use to start proxy orted", - false, false, "orted", - &mca_pls_tbird_component.orted); - mca_base_param_reg_int(comp, "want_path_check", - "Whether the launching process should check for the pls_tbird_orted executable in the PATH before launching (the TM API does not give an idication of failure; this is a somewhat-lame workaround; non-zero values enable this check)", - false, false, (int) true, &tbirdp); - mca_pls_tbird_component.want_path_check = (bool) tbirdp; - - mca_pls_tbird_component.checked_paths = NULL; - - return ORTE_SUCCESS; -} - - -static int pls_tbird_close(void) -{ - if (NULL != mca_pls_tbird_component.checked_paths) { - opal_argv_free(mca_pls_tbird_component.checked_paths); - } - - return ORTE_SUCCESS; -} - - -static struct orte_pls_base_module_1_0_0_t *pls_tbird_init(int *priority) -{ - /* Are we running under a TM job? */ - - if (NULL != getenv("PBS_ENVIRONMENT") && - NULL != getenv("PBS_JOBID")) { - *priority = mca_pls_tbird_component.priority; - return &orte_pls_tbird_module; - } - - /* Sadly, no */ - - opal_output(orte_pls_base.pls_output, - "pls:tbird: NOT available for selection"); - return NULL; -} diff --git a/orte/mca/pls/tbird/pls_tbird_module.c b/orte/mca/pls/tbird/pls_tbird_module.c deleted file mode 100644 index c60a8d925d..0000000000 --- a/orte/mca/pls/tbird/pls_tbird_module.c +++ /dev/null @@ -1,717 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * These symbols are in a file by themselves to provide nice linker - * semantics. Since linkers generally pull in symbols by object - * files, keeping these symbols as the only symbols in this file - * prevents utility programs such as "ompi_info" from having to import - * entire components just to query their version and parameters. - */ - -#include "orte_config.h" - -#if HAVE_UNISTD_H -#include -#endif -#include -#include -#include -#ifdef HAVE_SCHED_H -#include -#endif -#include -#include - -#include "opal/install_dirs.h" -#include "opal/event/event.h" -#include "opal/util/argv.h" -#include "opal/util/output.h" -#include "opal/util/opal_environ.h" -#include "opal/util/show_help.h" -#include "opal/util/path.h" -#include "opal/util/basename.h" -#include "opal/mca/base/mca_base_param.h" -#include "opal/runtime/opal_progress.h" -#include "orte/orte_constants.h" -#include "orte/orte_types.h" -#include "orte/runtime/runtime.h" -#include "orte/runtime/orte_wait.h" -#include "orte/mca/rmgr/base/base.h" -#include "orte/mca/rmaps/base/rmaps_base_map.h" -#include "orte/mca/pls/pls.h" -#include "orte/mca/pls/base/base.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/soh/soh_types.h" -#include "orte/mca/gpr/gpr.h" -#include "orte/mca/sds/base/base.h" -#include "orte/mca/soh/soh.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/ns/ns.h" -#include "pls_tbird.h" - - - -/* - * Local functions - */ -static int pls_tbird_launch(orte_jobid_t jobid); -static int pls_tbird_terminate_job(orte_jobid_t jobid); -static int pls_tbird_terminate_proc(const orte_process_name_t *name); -static int pls_tbird_signal_job(orte_jobid_t jobid, int32_t signal); -static int pls_tbird_signal_proc(const orte_process_name_t *name, int32_t signal); -static int pls_tbird_finalize(void); - -static int pls_tbird_connect(void); -static int pls_tbird_disconnect(void); -static int pls_tbird_start_proc(char *nodename, int argc, char **argv, char **env); -static int pls_tbird_check_path(char *exe, char **env); - -/* - * Global variable - */ -orte_pls_base_module_1_0_0_t orte_pls_tbird_module = { - pls_tbird_launch, - pls_tbird_terminate_job, - pls_tbird_terminate_proc, - pls_tbird_signal_job, - pls_tbird_signal_proc, - pls_tbird_finalize -}; - - -extern char **environ; - - -static int -pls_tbird_launch(orte_jobid_t jobid) -{ - opal_list_t mapping; - opal_list_item_t *m_item, *n_item; - size_t num_nodes; - orte_vpid_t vpid; - int node_name_index; - int proc_name_index; - char *jobid_string; - char *uri, *param; - char **argv; - int argc; - int rc; - bool connected = false; - int launched = 0, i; - char *bin_base = NULL, *lib_base = NULL; - - /* Query the list of nodes allocated and mapped to this job. - * We need the entire mapping for a couple of reasons: - * - need the prefix to start with. - * - need to know if we are launching on a subset of the allocated nodes - */ - OBJ_CONSTRUCT(&mapping, opal_list_t); - rc = orte_rmaps_base_get_map(jobid, &mapping); - if (ORTE_SUCCESS != rc) { - goto cleanup; - } - - num_nodes = 0; - for(m_item = opal_list_get_first(&mapping); - m_item != opal_list_get_end(&mapping); - m_item = opal_list_get_next(m_item)) { - orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*)m_item; - num_nodes += opal_list_get_size(&map->nodes); - } - - /* - * Allocate a range of vpids for the daemons. - */ - if (num_nodes == 0) { - return ORTE_ERR_BAD_PARAM; - } - rc = orte_ns.reserve_range(0, num_nodes, &vpid); - if (ORTE_SUCCESS != rc) { - goto cleanup; - } - - /* need integer value for command line parameter */ - asprintf(&jobid_string, "%lu", (unsigned long) jobid); - - /* add the daemon command (as specified by user) */ - argv = opal_argv_split(mca_pls_tbird_component.orted, ' '); - argc = opal_argv_count(argv); - - opal_argv_append(&argc, &argv, "--no-daemonize"); - - /* check for debug flags */ - orte_pls_base_proxy_mca_argv(&argc, &argv); - - /* proxy information */ - opal_argv_append(&argc, &argv, "--bootproxy"); - opal_argv_append(&argc, &argv, jobid_string); - opal_argv_append(&argc, &argv, "--name"); - proc_name_index = argc; - opal_argv_append(&argc, &argv, ""); - - /* tell the daemon how many procs are in the daemon's job */ - opal_argv_append(&argc, &argv, "--num_procs"); - asprintf(¶m, "%lu", (unsigned long)(vpid + num_nodes)); - opal_argv_append(&argc, &argv, param); - free(param); - - /* tell the daemon the starting vpid of the daemon's job */ - opal_argv_append(&argc, &argv, "--vpid_start"); - opal_argv_append(&argc, &argv, "0"); - - opal_argv_append(&argc, &argv, "--nodename"); - node_name_index = argc; - opal_argv_append(&argc, &argv, ""); - - /* pass along the universe name and location info */ - opal_argv_append(&argc, &argv, "--universe"); - asprintf(¶m, "%s@%s:%s", orte_universe_info.uid, - orte_universe_info.host, orte_universe_info.name); - opal_argv_append(&argc, &argv, param); - free(param); - - /* setup ns contact info */ - opal_argv_append(&argc, &argv, "--nsreplica"); - if (NULL != orte_process_info.ns_replica_uri) { - uri = strdup(orte_process_info.ns_replica_uri); - } else { - uri = orte_rml.get_uri(); - } - asprintf(¶m, "\"%s\"", uri); - opal_argv_append(&argc, &argv, param); - free(uri); - free(param); - - /* setup gpr contact info */ - opal_argv_append(&argc, &argv, "--gprreplica"); - if (NULL != orte_process_info.gpr_replica_uri) { - uri = strdup(orte_process_info.gpr_replica_uri); - } else { - uri = orte_rml.get_uri(); - } - asprintf(¶m, "\"%s\"", uri); - opal_argv_append(&argc, &argv, param); - free(uri); - free(param); - - if (mca_pls_tbird_component.debug) { - param = opal_argv_join(argv, ' '); - if (NULL != param) { - opal_output(0, "pls:tbird: final top-level argv:"); - opal_output(0, "pls:tbird: %s", param); - free(param); - } - } - - rc = pls_tbird_connect(); - if (ORTE_SUCCESS != rc) { - goto cleanup; - } - connected = true; - - /* Figure out the basenames for the libdir and bindir. There is a - lengthy comment about this in pls_rsh_module.c explaining all - the rationale for how / why we're doing this. */ - - lib_base = opal_basename(OPAL_LIBDIR); - bin_base = opal_basename(OPAL_BINDIR); - - /* - * iterate through each of the contexts - */ - for (m_item = opal_list_get_first(&mapping); - m_item != opal_list_get_end(&mapping); - m_item = opal_list_get_next(m_item)) { - orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*)m_item; - char** env; - char* var; - - /* setup environment */ - env = opal_argv_copy(environ); - var = mca_base_param_environ_variable("seed",NULL,NULL); - opal_setenv(var, "0", true, &env); - - /* If we have a prefix, then modify the PATH and - LD_LIBRARY_PATH environment variables. */ - if (NULL != map->app->prefix_dir) { - int i; - char *newenv; - - for (i = 0; NULL != env && NULL != env[i]; ++i) { - /* Reset PATH */ - if (0 == strncmp("PATH=", env[i], 5)) { - asprintf(&newenv, "%s/%s:%s", - map->app->prefix_dir, bin_base, env[i] + 5); - if (mca_pls_tbird_component.debug) { - opal_output(0, "pls:tbird: resetting PATH: %s", - newenv); - } - opal_setenv("PATH", newenv, true, &env); - free(newenv); - } - - /* Reset LD_LIBRARY_PATH */ - else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) { - asprintf(&newenv, "%s/%s:%s", - map->app->prefix_dir, lib_base, env[i] + 16); - if (mca_pls_tbird_component.debug) { - opal_output(0, "pls:tbird: resetting LD_LIBRARY_PATH: %s", - newenv); - } - opal_setenv("LD_LIBRARY_PATH", newenv, true, &env); - free(newenv); - } - } - } - - /* Do a quick sanity check to ensure that we can find the - orted in the PATH */ - - if (ORTE_SUCCESS != - (rc = pls_tbird_check_path(argv[0], env))) { - ORTE_ERROR_LOG(rc); - opal_show_help("help-pls-tbird.txt", "daemon-not-found", - true, argv[0]); - goto cleanup; - } - - /* Iterate through each of the nodes and spin - * up a daemon. - */ - for (n_item = opal_list_get_first(&map->nodes); - n_item != opal_list_get_end(&map->nodes); - n_item = opal_list_get_next(n_item)) { - orte_rmaps_base_node_t* rmaps_node = (orte_rmaps_base_node_t*)n_item; - orte_ras_node_t* node = rmaps_node->node; - orte_process_name_t* name; - char* name_string; - - /* already launched on this node */ - if (0 != node->node_launched++) { - continue; - } - - /* setup node name */ - argv[node_name_index] = node->node_name; - - /* initialize daemons process name */ - rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* setup per-node options */ - if (mca_pls_tbird_component.debug || - mca_pls_tbird_component.verbose) { - opal_output(0, "pls:tbird: launching on node %s", - node->node_name); - } - - /* setup process name */ - rc = orte_ns.get_proc_name_string(&name_string, name); - if (ORTE_SUCCESS != rc) { - opal_output(0, "pls:tbird: unable to create process name"); - return rc; - } - argv[proc_name_index] = name_string; - - /* set the progress engine schedule for this node. - * if node_slots is set to zero, then we default to - * NOT being oversubscribed - */ - if (node->node_slots > 0 && - opal_list_get_size(&rmaps_node->node_procs) > node->node_slots) { - if (mca_pls_tbird_component.debug) { - opal_output(0, "pls:tbird: oversubscribed -- setting mpi_yield_when_idle to 1 (%d %d)", - node->node_slots, - opal_list_get_size(&rmaps_node->node_procs)); - } - var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle"); - opal_setenv(var, "1", true, &env); - } else { - if (mca_pls_tbird_component.debug) { - opal_output(0, "pls:tbird: not oversubscribed -- setting mpi_yield_when_idle to 0"); - } - var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle"); - opal_setenv(var, "0", true, &env); - } - free(var); - - /* save the daemons name on the node */ - if (ORTE_SUCCESS != (rc = orte_pls_base_proxy_set_node_name(node,jobid,name))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* exec the daemon */ - if (mca_pls_tbird_component.debug) { - param = opal_argv_join(argv, ' '); - if (NULL != param) { - opal_output(0, "pls:tbird: executing: %s", param); - free(param); - } - } - - rc = pls_tbird_start_proc(node->node_name, argc, argv, env); - if (ORTE_SUCCESS != rc) { - opal_output(0, "pls:tbird: start_procs returned error %d", rc); - goto cleanup; - } - launched++; - vpid++; - free(name); - opal_event_loop(OPAL_EVLOOP_NONBLOCK); - } - - } - - /* loop through all those that are launched and poll for - completion status */ - - for(i = 0; i < launched; i++){ - int ret, local_err; - tm_event_t event; - ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_err); - if (TM_SUCCESS != ret) { - errno = local_err; - opal_output(0, "pls:tbird: failed to start a proc error %d", ret); - goto cleanup; - } - } - - cleanup: - if (connected) { - pls_tbird_disconnect(); - } - - while (NULL != (m_item = opal_list_remove_first(&mapping))) { - OBJ_RELEASE(m_item); - } - OBJ_DESTRUCT(&mapping); - if (NULL != lib_base) { - free(lib_base); - } - if (NULL != bin_base) { - free(bin_base); - } - - return rc; -} - - -static int -pls_tbird_terminate_job(orte_jobid_t jobid) -{ - return orte_pls_base_proxy_terminate_job(jobid); -} - - -/* - * TM can't kill individual processes -- PBS will kill the entire job - */ -static int -pls_tbird_terminate_proc(const orte_process_name_t *name) -{ - opal_output(orte_pls_base.pls_output, - "pls:tbird:terminate_proc: not supported"); - return ORTE_ERR_NOT_SUPPORTED; -} - - -static int -pls_tbird_signal_job(orte_jobid_t jobid, int32_t signal) -{ - return orte_pls_base_proxy_signal_job(jobid, signal); -} - - -static int -pls_tbird_signal_proc(const orte_process_name_t *name, int32_t signal) -{ - return orte_pls_base_proxy_signal_proc(name, signal); -} - - -/* - * Free stuff - */ -static int -pls_tbird_finalize(void) -{ - /* cleanup any pending recvs */ - orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_RMGR_CLNT); - - return ORTE_SUCCESS; -} - - -static int -pls_tbird_connect(void) -{ - int ret; - struct tm_roots tm_root; - int count, progress; - - /* try a couple times to connect - might get busy signals every - now and then */ - for (count = 0 ; count < 10; ++count) { - ret = tm_init(NULL, &tbird_root); - if (TM_SUCCESS == ret) { - return ORTE_SUCCESS; - } - - for (progress = 0 ; progress < 10 ; ++progress) { - opal_progress(); -#if HAVE_SCHED_YIELD - sched_yield(); -#endif - } - } - - return ORTE_ERR_RESOURCE_BUSY; -} - - -static int -pls_tbird_disconnect(void) -{ - tm_finalize(); - - return ORTE_SUCCESS; -} - -static char **tbird_hostnames = NULL; -static tm_node_id *tbird_node_ids = NULL; -static int num_tbird_hostnames, num_node_ids; - - - -/* we don't call this anymore */ -/* - * For a given TM node ID, get the string hostname corresponding to - * it. - */ -static char* -get_tbird_hostname(tbird_node_id node) -{ - char *hostname; - char buffer[256]; - int ret, local_errno; - tm_event_t event; - char **argv; - - /* Get the info string corresponding to this TM node ID */ - - ret = tm_rescinfo(node, buffer, sizeof(buffer) - 1, &event); - if (TM_SUCCESS != ret) { - return NULL; - } - - /* Now wait for that event to happen */ - - ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_errno); - if (TM_SUCCESS != ret) { - return NULL; - } - - /* According to the TM man page, we get back a space-separated - string array. The hostname is the second item. Use a cheap - trick to get it. */ - - buffer[sizeof(buffer) - 1] = '\0'; - argv = opal_argv_split(buffer, ' '); - if (NULL == argv) { - return NULL; - } - hostname = strdup(argv[1]); - opal_argv_free(argv); - - /* All done */ - - return hostname; -} - - -/* we don't call this anymore!*/ -static int -query_tbird_hostnames(void) -{ - char *h; - int i, ret; - - /* Get the list of nodes allocated in this PBS job */ - - ret = tm_nodeinfo(&tbird_node_ids, &num_node_ids); - if (TM_SUCCESS != ret) { - return ORTE_ERR_NOT_FOUND; - } - - /* TM "nodes" may actually correspond to PBS "VCPUs", which means - there may be multiple "TM nodes" that correspond to the same - physical node. This doesn't really affect what we're doing - here (we actually ignore the fact that they're duplicates -- - slightly inefficient, but no big deal); just mentioned for - completeness... */ - - tm_hostnames = NULL; - num_tbird_hostnames = 0; - for (i = 0; i < num_node_ids; ++i) { - h = get_tbird_hostname(tbird_node_ids[i]); - opal_argv_append(&num_tbird_hostnames, &tbird_hostnames, h); - free(h); - } - - /* All done */ - - return ORTE_SUCCESS; -} - -/* we don't call this anymore! */ -static int -do_tbird_resolve(char *hostname, tm_node_id *tnodeid) -{ - int i, ret; - - /* Have we already queried TM for all the node info? */ - if (NULL == tm_hostnames) { - ret = query_tbird_hostnames(); - if (ORTE_SUCCESS != ret) { - return ret; - } - } - - /* Find the TM ID of the hostname that we're looking for */ - for (i = 0; i < num_tbird_hostnames; ++i) { - if (0 == strcmp(hostname, tm_hostnames[i])) { - *tnodeid = tm_node_ids[i]; - opal_output(orte_pls_base.pls_output, - "pls:tbird:launch: resolved host %s to node ID %d", - hostname, tm_node_ids[i]); - break; - } - } - - /* All done */ - if (i < num_tbird_hostnames) { - ret = ORTE_SUCCESS; - } else { - ret = ORTE_ERR_NOT_FOUND; - } - - return ret; -} - - -static int -pls_tbird_start_proc(char *nodename, int argc, char **argv, char **env) -{ - int ret; - tm_node_id node_id; - tm_task_id task_id; - tm_event_t event; - - /* get the tbird node id for this node */ - ret = do_tbird_resolve(nodename, &node_id); - if (ORTE_SUCCESS != ret) return ret; - - ret = tm_spawn(argc, argv, env, node_id, &task_id, &event); - if (TM_SUCCESS != ret) return ORTE_ERROR; - - return ORTE_SUCCESS; -} - - -static int pls_tbird_check_path(char *exe, char **env) -{ - static int size = 256; - int i; - char *file; - char *cwd; - char *path = NULL; - - /* Do we want this check at all? */ - - if (!mca_pls_tbird_component.want_path_check) { - return ORTE_SUCCESS; - } - - /* Find the path in the supplied environment */ - - for (i = 0; NULL != env[i]; ++i) { - if (0 == strncmp("PATH=", env[i], 5)) { - path = strdup(env[i]); - break; - } - } - if (NULL == env[i]) { - path = strdup("NULL"); - } - - /* Check the already-successful paths (i.e., be a little - friendlier to the filesystem -- if we find the executable - successfully, save it) */ - - for (i = 0; NULL != mca_pls_tbird_component.checked_paths && - NULL != mca_pls_tbird_component.checked_paths[i]; ++i) { - if (0 == strcmp(path, mca_pls_tbird_component.checked_paths[i])) { - return ORTE_SUCCESS; - } - } - - /* We didn't already find it, so check now. First, get the cwd. */ - - do { - cwd = malloc(size); - if (NULL == cwd) { - return ORTE_ERR_OUT_OF_RESOURCE; - } - if (NULL == getcwd(cwd, size)) { - free(cwd); - if (ERANGE == errno) { - size *= 2; - } else { - return ORTE_ERR_IN_ERRNO; - } - } else { - break; - } - } while (1); - - /* Now do the search */ - - file = opal_path_findv(exe, X_OK, env, cwd); - free(cwd); - if (NULL == file) { - free(path); - return ORTE_ERR_NOT_FOUND; - } - if (mca_pls_tbird_component.debug) { - opal_output(0, "pls:tbird: found %s", file); - } - free(file); - - /* Success -- so cache it */ - - opal_argv_append_nosize(&mca_pls_tbird_component.checked_paths, path); - - /* All done */ - - free(path); - return ORTE_SUCCESS; -} diff --git a/orte/mca/rmgr/tbird/.ompi_ignore b/orte/mca/rmgr/tbird/.ompi_ignore deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/orte/mca/rmgr/tbird/.ompi_unignore b/orte/mca/rmgr/tbird/.ompi_unignore deleted file mode 100644 index 7681be23d1..0000000000 --- a/orte/mca/rmgr/tbird/.ompi_unignore +++ /dev/null @@ -1,3 +0,0 @@ -rhc -jsquyres -Ralph diff --git a/orte/mca/rmgr/tbird/Makefile.am b/orte/mca/rmgr/tbird/Makefile.am deleted file mode 100644 index 479cf7c464..0000000000 --- a/orte/mca/rmgr/tbird/Makefile.am +++ /dev/null @@ -1,51 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# Use the top-level Makefile.options - - - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if OMPI_BUILD_rmgr_tbird_DSO -component_noinst = -component_install = mca_rmgr_tbird.la -else -component_noinst = libmca_rmgr_tbird.la -component_install = -endif - -proxy_SOURCES = \ - rmgr_tbird.c \ - rmgr_tbird.h \ - rmgr_tbird_component.c - -mcacomponentdir = $(libdir)/openmpi -mcacomponent_LTLIBRARIES = $(component_install) -mca_rmgr_tbird_la_SOURCES = $(proxy_SOURCES) -mca_rmgr_tbird_la_LIBADD = \ - $(top_ompi_builddir)/orte/liborte.la \ - $(top_ompi_builddir)/opal/libopal.la -mca_rmgr_tbird_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_rmgr_tbird_la_SOURCES = $(proxy_SOURCES) -libmca_rmgr_tbird_la_LIBADD = -libmca_rmgr_tbird_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/rmgr/tbird/configure.params b/orte/mca/rmgr/tbird/configure.params deleted file mode 100644 index 0a16796629..0000000000 --- a/orte/mca/rmgr/tbird/configure.params +++ /dev/null @@ -1,24 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# Specific to this module - -PARAM_INIT_FILE=rmgr_tbird.c -PARAM_CONFIG_FILES="Makefile" diff --git a/orte/mca/rmgr/tbird/rmgr_tbird.c b/orte/mca/rmgr/tbird/rmgr_tbird.c deleted file mode 100644 index afbee5dcf0..0000000000 --- a/orte/mca/rmgr/tbird/rmgr_tbird.c +++ /dev/null @@ -1,557 +0,0 @@ -/* - * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -#include "orte_config.h" -#ifdef HAVE_SYS_TIME_H -#include -#endif /* HAVE_SYS_TIME_H */ -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif /* HAVE_STRING_H */ - -#include "opal/util/trace.h" - -#include "orte/orte_constants.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/rds/base/base.h" -#include "orte/mca/ras/base/base.h" -#include "orte/mca/rmaps/base/base.h" -#include "orte/mca/rmgr/base/base.h" -#include "orte/mca/pls/base/base.h" -#include "orte/mca/gpr/gpr.h" -#include "orte/mca/iof/iof.h" -#include "orte/mca/ns/ns.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/soh/soh.h" - -#include "orte/mca/rmgr/tbird/rmgr_tbird.h" - - -static int orte_rmgr_tbird_query(void); - -static int orte_rmgr_tbird_create( - orte_app_context_t** app_context, - size_t num_context, - orte_jobid_t* jobid); - -static int orte_rmgr_tbird_allocate( - orte_jobid_t jobid); - -static int orte_rmgr_tbird_deallocate( - orte_jobid_t jobid); - -static int orte_rmgr_tbird_map( - orte_jobid_t jobid); - -static int orte_rmgr_tbird_launch( - orte_jobid_t jobid); - -static int orte_rmgr_tbird_terminate_job( - orte_jobid_t jobid); - -static int orte_rmgr_tbird_terminate_proc( - const orte_process_name_t* proc_name); - -static int orte_rmgr_tbird_signal_job( - orte_jobid_t jobid, int32_t signal); - -static int orte_rmgr_tbird_signal_proc( - const orte_process_name_t* proc_name, - int32_t signal); - -static int orte_rmgr_tbird_spawn( - orte_app_context_t** app_context, - size_t num_context, - orte_jobid_t* jobid, - orte_rmgr_cb_fn_t cbfn, - orte_proc_state_t cb_conditions); - -static int orte_rmgr_tbird_finalize(void); - - -orte_rmgr_base_module_t orte_rmgr_tbird_module = { - orte_rmgr_tbird_query, - orte_rmgr_tbird_create, - orte_rmgr_tbird_allocate, - orte_rmgr_tbird_deallocate, - orte_rmgr_tbird_map, - orte_rmgr_tbird_launch, - orte_rmgr_tbird_terminate_job, - orte_rmgr_tbird_terminate_proc, - orte_rmgr_tbird_signal_job, - orte_rmgr_tbird_signal_proc, - orte_rmgr_tbird_spawn, - orte_rmgr_base_proc_stage_gate_init, - orte_rmgr_base_proc_stage_gate_mgr, - orte_rmgr_tbird_finalize -}; - - -/* - * Resource discovery - */ - -static int orte_rmgr_tbird_query(void) -{ - int rc; - - OPAL_TRACE(1); - - if(ORTE_SUCCESS != (rc = orte_rds_base_query())) { - ORTE_ERROR_LOG(rc); - return rc; - } - return ORTE_SUCCESS; -} - - -/* - * Create the job segment and initialize the application context. - */ - -static int orte_rmgr_tbird_create( - orte_app_context_t** app_context, - size_t num_context, - orte_jobid_t* jobid) -{ - int rc; - - OPAL_TRACE(1); - - /* allocate a jobid */ - if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(jobid))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* create and initialize job segment */ /* JJH C/N mapping before this */ - if (ORTE_SUCCESS != - (rc = orte_rmgr_base_put_app_context(*jobid, app_context, - num_context))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -} - - -static int orte_rmgr_tbird_allocate(orte_jobid_t jobid) -{ - OPAL_TRACE(1); - - return orte_ras_base_allocate(jobid, &mca_rmgr_tbird_component.tbird_ras); -} - -static int orte_rmgr_tbird_deallocate(orte_jobid_t jobid) -{ - OPAL_TRACE(1); - - return mca_rmgr_tbird_component.tbird_ras->deallocate(jobid); -} - -static int orte_rmgr_tbird_map(orte_jobid_t jobid) -{ - OPAL_TRACE(1); - - return mca_rmgr_tbird_component.tbird_rmaps->map(jobid); -} - -static int orte_rmgr_tbird_launch(orte_jobid_t jobid) -{ - int ret, ret2; - - OPAL_TRACE(1); - - if (ORTE_SUCCESS != - (ret = mca_rmgr_tbird_component.tbird_pls->launch(jobid))) { - ORTE_ERROR_LOG(ret); - ret2 = orte_soh.set_job_soh(jobid, ORTE_JOB_STATE_ABORTED); - if (ORTE_SUCCESS != ret2) { - ORTE_ERROR_LOG(ret2); - return ret2; - } - } - - return ret; -} - -static int orte_rmgr_tbird_terminate_job(orte_jobid_t jobid) -{ - int ret; - orte_jobid_t my_jobid; - - OPAL_TRACE(1); - - ret = orte_ns.get_jobid(&my_jobid, orte_process_info.my_name); - if (ORTE_SUCCESS == ret) { - /* if our jobid is the one we're trying to kill AND we're a - singleton, then calling the tbird_pls isn't going to be able - to do anything. Just call exit. */ - if (orte_process_info.singleton && jobid == my_jobid) { - exit(1); - } - } - - return mca_rmgr_tbird_component.tbird_pls->terminate_job(jobid); -} - -static int orte_rmgr_tbird_terminate_proc(const orte_process_name_t* proc_name) -{ - OPAL_TRACE(1); - - if ((0 == orte_ns.compare(ORTE_NS_CMP_ALL, proc_name, - orte_process_info.my_name)) && - (orte_process_info.singleton)) { - /* if we're trying to get ourselves killed and we're a - singleton, calling terminate_proc isn't going to work - properly -- there's no pls setup properly for us. Just - call exit and be done. */ - exit(1); - } - - return mca_rmgr_tbird_component.tbird_pls->terminate_proc(proc_name); -} - - -static int orte_rmgr_tbird_signal_job(orte_jobid_t jobid, int32_t signal) -{ - int ret; - orte_jobid_t my_jobid; - - OPAL_TRACE(1); - - ret = orte_ns.get_jobid(&my_jobid, orte_process_info.my_name); - if (ORTE_SUCCESS == ret) { - /** if our jobid is the one we're trying to signal AND we're a - * singleton, then calling the tbird_pls isn't going to be able - * to do anything - we already have the signal! */ - if (orte_process_info.singleton && jobid == my_jobid) { - return ORTE_SUCCESS; - } - } - - return mca_rmgr_tbird_component.tbird_pls->signal_job(jobid, signal); -} - -static int orte_rmgr_tbird_signal_proc(const orte_process_name_t* proc_name, int32_t signal) -{ - OPAL_TRACE(1); - - if ((0 == orte_ns.compare(ORTE_NS_CMP_ALL, proc_name, - orte_process_info.my_name)) && - (orte_process_info.singleton)) { - /** if we're trying to signal ourselves and we're a - * singleton, calling signal_proc isn't going to work - * properly -- there's no pls setup properly for us. Besides, we - * already have the signal! - */ - return ORTE_SUCCESS; - } - - return mca_rmgr_tbird_component.tbird_pls->signal_proc(proc_name, signal); -} - - -static void orte_rmgr_tbird_wireup_stdin(orte_jobid_t jobid) -{ - int rc; - orte_process_name_t* name; - - OPAL_TRACE(1); - - if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&name, 0, jobid, 0))) { - ORTE_ERROR_LOG(rc); - return; - } - if (ORTE_SUCCESS != (rc = orte_iof.iof_push(name, ORTE_NS_CMP_JOBID, ORTE_IOF_STDIN, 0))) { - ORTE_ERROR_LOG(rc); - } -} - - -static void orte_rmgr_tbird_callback(orte_gpr_notify_data_t *data, void *cbdata) -{ - orte_rmgr_cb_fn_t cbfunc; - union { - orte_rmgr_cb_fn_t func; - void * ptr; - } cbfunc_union; - orte_gpr_value_t **values, *value; - orte_gpr_keyval_t** keyvals; - orte_jobid_t jobid; - size_t i, j, k; - int rc; - - OPAL_TRACE(1); - - /* stupid ISO C forbids conversion of object pointer to function - pointer. So we do this, which is the same thing, but without - the warning from GCC */ - cbfunc_union.ptr = cbdata; - cbfunc = cbfunc_union.func; - - /* we made sure in the subscriptions that at least one - * value is always returned - * get the jobid from the segment name in the first value - */ - values = (orte_gpr_value_t**)(data->values)->addr; - if (ORTE_SUCCESS != (rc = - orte_schema.extract_jobid_from_segment_name(&jobid, - values[0]->segment))) { - ORTE_ERROR_LOG(rc); - return; - } - - for(i = 0, k=0; k < data->cnt && - i < (data->values)->size; i++) { - if (NULL != values[i]) { - k++; - value = values[i]; - /* determine the state change */ - keyvals = value->keyvals; - for(j=0; jcnt; j++) { - orte_gpr_keyval_t* keyval = keyvals[j]; - if(strcmp(keyval->key, ORTE_PROC_NUM_AT_INIT) == 0) { - (*cbfunc)(jobid,ORTE_PROC_STATE_INIT); - continue; - } - if(strcmp(keyval->key, ORTE_PROC_NUM_LAUNCHED) == 0) { - (*cbfunc)(jobid,ORTE_PROC_STATE_LAUNCHED); - continue; - } - if(strcmp(keyval->key, ORTE_PROC_NUM_RUNNING) == 0) { - (*cbfunc)(jobid,ORTE_PROC_STATE_RUNNING); - continue; - } - if(strcmp(keyval->key, ORTE_PROC_NUM_AT_STG1) == 0) { - (*cbfunc)(jobid,ORTE_PROC_STATE_AT_STG1); - continue; - } - if(strcmp(keyval->key, ORTE_PROC_NUM_AT_STG2) == 0) { - (*cbfunc)(jobid,ORTE_PROC_STATE_AT_STG2); - continue; - } - if(strcmp(keyval->key, ORTE_PROC_NUM_AT_STG3) == 0) { - (*cbfunc)(jobid,ORTE_PROC_STATE_AT_STG3); - continue; - } - if(strcmp(keyval->key, ORTE_PROC_NUM_FINALIZED) == 0) { - (*cbfunc)(jobid,ORTE_PROC_STATE_FINALIZED); - continue; - } - if(strcmp(keyval->key, ORTE_PROC_NUM_TERMINATED) == 0) { - (*cbfunc)(jobid,ORTE_PROC_STATE_TERMINATED); - continue; - } - if(strcmp(keyval->key, ORTE_PROC_NUM_ABORTED) == 0) { - (*cbfunc)(jobid,ORTE_PROC_STATE_ABORTED); - continue; - } - } - } - } -} - - -/** - * define a callback point for completing the wireup of the stdin for io forwarding - */ -static void orte_rmgr_tbird_wireup_callback(orte_gpr_notify_data_t *data, void *cbdata) -{ - orte_gpr_value_t **values; - orte_jobid_t jobid; - int rc; - - OPAL_TRACE(1); - - /* we made sure in the subscriptions that at least one - * value is always returned - * get the jobid from the segment name in the first value - */ - values = (orte_gpr_value_t**)(data->values)->addr; - if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_segment_name(&jobid, values[0]->segment))) { - ORTE_ERROR_LOG(rc); - return; - } - orte_rmgr_tbird_wireup_stdin(jobid); -} - -/* - * Shortcut for the multiple steps involved in spawning a new job. - */ - - -static int orte_rmgr_tbird_spawn( - orte_app_context_t** app_context, - size_t num_context, - orte_jobid_t* jobid, - orte_rmgr_cb_fn_t cbfunc, - orte_proc_state_t cb_conditions) -{ - int rc; - orte_process_name_t* name; - - OPAL_TRACE(1); - - /* - * Perform resource discovery. - */ - if (mca_rmgr_tbird_component.tbird_rds == false && - ORTE_SUCCESS != (rc = orte_rds_base_query())) { - ORTE_ERROR_LOG(rc); - return rc; - } else { - mca_rmgr_tbird_component.tbird_rds = true; - } - - /* - * Initialize job segment and allocate resources - */ /* JJH Insert C/N mapping stuff here */ - if (ORTE_SUCCESS != - (rc = orte_rmgr_tbird_create(app_context,num_context,jobid))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - if (ORTE_SUCCESS != (rc = orte_rmgr_tbird_allocate(*jobid))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - if (ORTE_SUCCESS != (rc = orte_rmgr_tbird_map(*jobid))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* - * setup I/O forwarding - */ - - if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&name, 0, *jobid, 0))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (ORTE_SUCCESS != (rc = orte_iof.iof_pull(name, ORTE_NS_CMP_JOBID, ORTE_IOF_STDOUT, 1))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (ORTE_SUCCESS != (rc = orte_iof.iof_pull(name, ORTE_NS_CMP_JOBID, ORTE_IOF_STDERR, 2))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* setup the launch system's stage gate counters and subscriptions */ - if (ORTE_SUCCESS != - (rc = orte_rmgr_base_proc_stage_gate_init(*jobid))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /** setup the subscription so we can complete the wireup when all processes reach LAUNCHED */ - rc = orte_rmgr_base_proc_stage_gate_subscribe(*jobid, orte_rmgr_tbird_wireup_callback, NULL, ORTE_PROC_STATE_LAUNCHED); - if(ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* - * setup callback - */ - - if(NULL != cbfunc) { - union { - orte_rmgr_cb_fn_t func; - void * ptr; - } cbfunc_union; - void *cbdata; - - /* stupid ISO C forbids conversion of object pointer to function - pointer. So we do this, which is the same thing, but without - the warning from GCC */ - cbfunc_union.func = cbfunc; - cbdata = cbfunc_union.ptr; - - rc = orte_rmgr_base_proc_stage_gate_subscribe(*jobid, orte_rmgr_tbird_callback, cbdata, cb_conditions); - if(ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - - /* - * launch the job - */ - if (ORTE_SUCCESS != (rc = orte_rmgr_tbird_launch(*jobid))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - orte_ns.free_name(&name); - return ORTE_SUCCESS; -} - - -static int orte_rmgr_tbird_finalize(void) -{ - int rc; - - OPAL_TRACE(1); - - /** - * Finalize Process Launch Subsystem (PLS) - */ - if (ORTE_SUCCESS != (rc = orte_pls_base_finalize())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /** - * Finalize Resource Mapping Subsystem (RMAPS) - */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_finalize())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /** - * Finalize Resource Allocation Subsystem (RAS) - */ - if (ORTE_SUCCESS != (rc = orte_ras_base_finalize())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /** - * Finalize Resource Discovery Subsystem (RDS) - */ - if (ORTE_SUCCESS != (rc = orte_rds_base_finalize())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* Cancel pending receive. */ - - orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_RMGR_SVC); - - return ORTE_SUCCESS; -} - diff --git a/orte/mca/rmgr/tbird/rmgr_tbird.h b/orte/mca/rmgr/tbird/rmgr_tbird.h deleted file mode 100644 index 37493a43d3..0000000000 --- a/orte/mca/rmgr/tbird/rmgr_tbird.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * - * Universal Resource Manager (tbird) - */ -#ifndef ORTE_RMGR_tbird_H -#define ORTE_RMGR_tbird_H - -#include "orte/mca/rmgr/rmgr.h" - -#if defined(c_plusplus) || defined(__cplusplus) -extern "C" { -#endif - -/** -* tbird component structure -- add some stuff beyond what is in the -* normal rmgr component. -*/ -struct orte_rmgr_tbird_component_t { - /** Base rmgr component */ - orte_rmgr_base_component_t super; - /** Has RDS query been called */ - bool tbird_rds; - /** Selected ras module */ - orte_ras_base_module_t *tbird_ras; - /** Selected rmaps module */ - orte_rmaps_base_module_t *tbird_rmaps; - /** Selected pls module */ - orte_pls_base_module_t *tbird_pls; -}; -/** Convenience typedef */ -typedef struct orte_rmgr_tbird_component_t orte_rmgr_tbird_component_t; - -/** Global tbird component */ -OMPI_COMP_EXPORT extern orte_rmgr_tbird_component_t mca_rmgr_tbird_component; -/** Global tbird module */ -OMPI_COMP_EXPORT extern orte_rmgr_base_module_t orte_rmgr_tbird_module; - -#if defined(c_plusplus) || defined(__cplusplus) -} -#endif - -#endif diff --git a/orte/mca/rmgr/tbird/rmgr_tbird_component.c b/orte/mca/rmgr/tbird/rmgr_tbird_component.c deleted file mode 100644 index ec1f5a6620..0000000000 --- a/orte/mca/rmgr/tbird/rmgr_tbird_component.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/orte_constants.h" -#include "orte/util/proc_info.h" -#include "opal/util/output.h" -#include "orte/dss/dss_types.h" -#include "orte/mca/errmgr/errmgr.h" - -#include "orte/mca/rds/base/base.h" -#include "opal/mca/base/mca_base_param.h" -#include "orte/mca/ras/base/base.h" -#include "orte/mca/rmaps/base/base.h" -#include "orte/mca/pls/base/base.h" -#include "orte/mca/rmgr/base/base.h" -#include "orte/mca/rml/rml.h" -#include "rmgr_tbird.h" - -/* - * Local functions - */ - -static int orte_rmgr_tbird_open(void); -static int orte_rmgr_tbird_close(void); -static orte_rmgr_base_module_t* orte_rmgr_tbird_init(int *priority); - - -orte_rmgr_tbird_component_t mca_rmgr_tbird_component = { - { - /* First, the mca_base_component_t struct containing meta - information about the component itself */ - - { - /* Indicate that we are a iof v1.0.0 component (which also - implies a specific MCA version) */ - - ORTE_RMGR_BASE_VERSION_1_0_0, - - "tbird", /* MCA component name */ - ORTE_MAJOR_VERSION, /* MCA component major version */ - ORTE_MINOR_VERSION, /* MCA component minor version */ - ORTE_RELEASE_VERSION, /* MCA component release version */ - orte_rmgr_tbird_open, /* component open */ - orte_rmgr_tbird_close /* component close */ - }, - - /* Next the MCA v1.0.0 component meta data */ - { - /* Whether the component is checkpointable or not */ - false - }, - - orte_rmgr_tbird_init - } -}; - - -/** - * component open/close/init function - */ -static int orte_rmgr_tbird_open(void) -{ - int rc; - - /** - * Open Resource Discovery Subsystem (RDS) - */ - if (ORTE_SUCCESS != (rc = orte_rds_base_open())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /** - * Open Resource Allocation Subsystem (RAS) - */ - if (ORTE_SUCCESS != (rc = orte_ras_base_open())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /** - * Open Resource Mapping Subsystem (RMAPS) - */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_open())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /** - * Open Process Launch Subsystem (PLS) - */ - if (ORTE_SUCCESS != (rc = orte_pls_base_open())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -} - - -static void orte_rmgr_tbird_recv( - int status, - orte_process_name_t* peer, - orte_buffer_t* req, - orte_rml_tag_t tag, - void* cbdata) -{ - int rc; - orte_buffer_t rsp; - OBJ_CONSTRUCT(&rsp, orte_buffer_t); - - if (ORTE_SUCCESS != (rc = orte_rmgr_base_cmd_dispatch(req,&rsp))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - rc = orte_rml.send_buffer(peer, &rsp, ORTE_RML_TAG_RMGR_CLNT, 0); - if (rc < 0) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - -cleanup: - OBJ_DESTRUCT(&rsp); -} - - -static orte_rmgr_base_module_t *orte_rmgr_tbird_init(int* priority) -{ - int rc; - char* pls = NULL; - if(orte_process_info.seed == false) { - /* if we are bootproxy - need to be selected */ - int id = mca_base_param_register_int("rmgr","bootproxy","jobid",NULL,0); - int jobid = 0; - mca_base_param_lookup_int(id,&jobid); - if(jobid == 0) { - return NULL; - } - /* use fork pls for bootproxy */ - id = mca_base_param_register_string("rmgr","bootproxy","pls",NULL,"fork"); - mca_base_param_lookup_string(id,&pls); - } - - /** - * Select RDS components. - */ - if (ORTE_SUCCESS != (rc = orte_rds_base_select())) { - ORTE_ERROR_LOG(rc); - return NULL; - } - mca_rmgr_tbird_component.tbird_rds = false; - - /** - * Find available RAS components - */ - if (ORTE_SUCCESS != (rc = orte_ras_base_find_available())) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return NULL; - } - - /** - * Select RMAPS component - */ - if (NULL == (mca_rmgr_tbird_component.tbird_rmaps = orte_rmaps_base_select(NULL))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return NULL; - } - - /** - * Select PLS component - */ - if (NULL == (mca_rmgr_tbird_component.tbird_pls = orte_pls_base_select(pls))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return NULL; - } - - /* Post non-blocking receive */ - - if (0 > (rc = orte_rml.recv_buffer_nb( - ORTE_RML_NAME_ANY, - ORTE_RML_TAG_RMGR_SVC, - ORTE_RML_PERSISTENT, - orte_rmgr_tbird_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - return NULL; - } - - *priority = 100; - return &orte_rmgr_tbird_module; -} - - -/** - * Close all subsystems. - */ -static int orte_rmgr_tbird_close(void) -{ - int rc; - - /** - * Close Process Launch Subsystem (PLS) - */ - if (ORTE_SUCCESS != (rc = orte_pls_base_close())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /** - * Close Resource Mapping Subsystem (RMAPS) - */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_close())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /** - * Close Resource Allocation Subsystem (RAS) - */ - if (ORTE_SUCCESS != (rc = orte_ras_base_close())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /** - * Close Resource Discovery Subsystem (RDS) - */ - if (ORTE_SUCCESS != (rc = orte_rds_base_close())) { - ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -}