1
1

Add ORTE ALPS support (Cray XT CNL)

This commit was SVN r17482.
Этот коммит содержится в:
Galen Shipman 2008-02-17 19:29:06 +00:00
родитель cec3d96a94
Коммит 18d1d3b408
23 изменённых файлов: 1838 добавлений и 1 удалений

49
orte/mca/pls/alps/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,49 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008 UT-Battelle, LLC
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Files that make up the ALPS pls component. The same list feeds both the
# DSO target and the static-library target below.
sources = \
pls_alps.h \
pls_alps_component.c \
pls_alps_module.c
# Help-message file shipped and installed as package data.
dist_pkgdata_DATA = help-pls-alps.txt
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on the OMPI_BUILD_pls_alps_DSO conditional.
if OMPI_BUILD_pls_alps_DSO
component_noinst =
component_install = mca_pls_alps.la
else
component_noinst = libmca_pls_alps.la
component_install =
endif
# DSO build: installed into $(pkglibdir) and linked against the ORTE and
# OPAL libraries.
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_pls_alps_la_SOURCES = $(sources)
mca_pls_alps_la_LDFLAGS = -module -avoid-version
mca_pls_alps_la_LIBADD = \
$(top_ompi_builddir)/orte/libopen-rte.la \
$(top_ompi_builddir)/opal/libopen-pal.la
# Static build: a non-installed convenience library built from the same
# sources.
noinst_LTLIBRARIES = $(component_noinst)
libmca_pls_alps_la_SOURCES =$(sources)
libmca_pls_alps_la_LDFLAGS = -module -avoid-version

25
orte/mca/pls/alps/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,25 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008 UT-Battelle, LLC
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_pls_alps_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Configure hook for the ALPS pls component. It delegates to
# OMPI_CHECK_ALPS with the pls_alps prefix and forwards both
# caller-supplied actions unchanged.
AC_DEFUN([MCA_pls_alps_CONFIG],[
OMPI_CHECK_ALPS([pls_alps], [$1], [$2])
])dnl

23
orte/mca/pls/alps/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,23 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2008 UT-Battelle, LLC
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# NOTE(review): presumably the list of files configure generates for this
# component (only "Makefile" here) -- confirm against the MCA build framework.
PARAM_CONFIG_FILES="Makefile"

42
orte/mca/pls/alps/help-pls-alps.txt Обычный файл
Просмотреть файл

@ -0,0 +1,42 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008 UT-Battelle, LLC
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
[multiple-prefixes]
The ALPS process starter for Open MPI does not support multiple
different --prefix options to mpirun. You can specify at most one
unique value for the --prefix option (in any of the application
contexts); it will be applied to all the application contexts of your
parallel job.
Put simply, you must have Open MPI installed in the same location on
all of your ALPS nodes.
Multiple different --prefix options were specified to mpirun. This is
a fatal error for the ALPS process starter in Open MPI.
The first two prefix values supplied were:
%s
and %s
#
[no-hosts-in-list]
The ALPS process starter for Open MPI didn't find any hosts in
the map for this application. This can be caused by a lack of
an allocation, or by an error in the Open MPI code. Please check
to ensure you have an ALPS allocation. If you do, then please pass
the error to the Open MPI user's mailing list for assistance.

54
orte/mca/pls/alps/pls_alps.h Обычный файл
Просмотреть файл

@ -0,0 +1,54 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef ORTE_PLS_ALPS_EXPORT_H
#define ORTE_PLS_ALPS_EXPORT_H
#include "orte_config.h"
#include "opal/mca/mca.h"
#include "orte/mca/pls/pls.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
 * ALPS pls component descriptor: the generic pls component structure
 * followed by the values of this component's MCA parameters (filled in
 * by pls_alps_open() in pls_alps_component.c).
 */
struct orte_pls_alps_component_t {
/* base pls component; must come first */
orte_pls_base_component_t super;
/* selection priority ("priority" parameter) */
int priority;
/* non-zero enables debug output ("debug" parameter, falling back to the
   ORTE-wide debug parameter) */
int debug;
/* true when the ORTE "timing" parameter is non-zero */
bool timing;
/* command used to start the orted daemon ("orted" parameter);
   heap string released in pls_alps_close() */
char *orted;
/* extra user-supplied launcher arguments ("args" parameter), may be NULL;
   heap string released in pls_alps_close() */
char *custom_args;
};
typedef struct orte_pls_alps_component_t orte_pls_alps_component_t;
/*
 * Globally exported variables:
 *  - mca_pls_alps_component: the component descriptor, defined in
 *    pls_alps_component.c
 *  - orte_pls_alps_module: the module function table, defined in
 *    pls_alps_module.c
 */
ORTE_MODULE_DECLSPEC extern orte_pls_alps_component_t
mca_pls_alps_component;
ORTE_DECLSPEC extern orte_pls_base_module_t
orte_pls_alps_module;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* ORTE_PLS_ALPS_EXPORT_H */

167
orte/mca/pls/alps/pls_alps_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,167 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/orte_constants.h"
#include "orte/util/proc_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/pls/base/pls_private.h"
#include "pls_alps.h"
/*
 * Public string showing the alps pls component version number
 * (a fixed prefix concatenated with ORTE_VERSION at compile time)
 */
const char *mca_pls_alps_component_version_string =
"Open MPI alps pls MCA component version " ORTE_VERSION;
/*
 * Local functions: the component's open/close hooks and its
 * init/query entry point, all referenced from the
 * mca_pls_alps_component initializer below.
 */
static int pls_alps_open(void);
static int pls_alps_close(void);
static orte_pls_base_module_t *pls_alps_init(int *priority);
/*
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it.  The initializer is
 * positional, so the order of the entries below must not change.
 */
orte_pls_alps_component_t mca_pls_alps_component = {
{
/* First, the mca_component_t struct containing meta
   information about the component itself */
{
/* Indicate that we are a pls v1.3.0 component (which also
   implies a specific MCA version) */
ORTE_PLS_BASE_VERSION_1_3_0,
/* Component name and version */
"alps",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
pls_alps_open,
pls_alps_close
},
/* Next the MCA v1.0.0 component meta data */
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Initialization / querying functions */
pls_alps_init
}
/* The remaining orte_pls_alps_component_t members (priority, debug,
   timing, orted, custom_args) are not listed, so they start out
   zero-initialized; pls_alps_open() assigns each of them */
};
/*
 * Component open hook: register this component's MCA parameters and
 * cache their values in mca_pls_alps_component.
 *
 * Parameters registered against the component: "debug", "priority",
 * "orted" and "args".  The ORTE-wide "debug" and "timing" parameters
 * are consulted as well.
 *
 * Always returns ORTE_SUCCESS.
 */
static int pls_alps_open(void)
{
    mca_base_component_t *comp = &mca_pls_alps_component.super.pls_version;
    /* start from a known value so "timing" is well defined even if the
       registration call below does not store anything */
    int value = 0;

    mca_base_param_reg_int(comp, "debug", "Enable debugging of alps pls",
                           false, false, 0,
                           &mca_pls_alps_component.debug);
    /* component-level debugging is off: fall back to the ORTE-wide flag */
    if (mca_pls_alps_component.debug == 0) {
        mca_base_param_reg_int_name("orte", "debug",
                                    "Whether or not to enable debugging output for all ORTE components (0 or 1)",
                                    false, false, 0,
                                    &mca_pls_alps_component.debug);
    }

    mca_base_param_reg_int(comp, "priority", "Default selection priority",
                           false, false, 75,
                           &mca_pls_alps_component.priority);

    mca_base_param_reg_string(comp, "orted",
                              "Command to use to start proxy orted",
                              false, false, "orted",
                              &mca_pls_alps_component.orted);

    mca_base_param_reg_int_name("orte", "timing",
                                "Request that critical timing loops be measured",
                                false, false, 0, &value);
    mca_pls_alps_component.timing = (0 != value);

    /* these arguments are appended to the aprun command line built in
       pls_alps_launch_job() */
    mca_base_param_reg_string(comp, "args",
                              "Custom arguments to aprun",
                              false, false, NULL,
                              &mca_pls_alps_component.custom_args);

    return ORTE_SUCCESS;
}
/*
 * Component init/query hook.
 *
 * When this process is the seed (HNP), stores the configured selection
 * priority in *priority and returns the ALPS module.  Otherwise returns
 * NULL and leaves *priority untouched, so the component is not selected.
 */
static orte_pls_base_module_t *pls_alps_init(int *priority)
{
    if (orte_process_info.seed) {
        *priority = mca_pls_alps_component.priority;
        return &orte_pls_alps_module;
    }

    /* not an HNP: decline selection */
    return NULL;
}
/*
 * Component close hook: release the string parameters obtained in
 * pls_alps_open().
 *
 * The pointers are reset to NULL after being freed so that the global
 * component structure never holds dangling pointers and a repeated
 * close is harmless.  Always returns ORTE_SUCCESS.
 */
static int pls_alps_close(void)
{
    /* free(NULL) is a no-op, so no guards are needed */
    free(mca_pls_alps_component.orted);
    mca_pls_alps_component.orted = NULL;

    free(mca_pls_alps_component.custom_args);
    mca_pls_alps_component.custom_args = NULL;

    return ORTE_SUCCESS;
}

630
orte/mca/pls/alps/pls_alps_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,630 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <signal.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#include "opal/mca/installdirs/installdirs.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/path.h"
#include "opal/util/show_help.h"
#include "opal/util/basename.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/runtime/params.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/pls/base/pls_private.h"
#include "pls_alps.h"
/*
 * Local functions.  The first seven are the entries of the
 * orte_pls_alps_module function table below; pls_alps_start_proc()
 * is the helper that forks and execs the launcher command.
 */
static int pls_alps_launch_job(orte_jobid_t jobid);
static int pls_alps_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_alps_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int pls_alps_terminate_proc(const orte_process_name_t *name);
static int pls_alps_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int pls_alps_signal_proc(const orte_process_name_t *name, int32_t signal);
static int pls_alps_finalize(void);
static int pls_alps_start_proc(int argc, char **argv, char **env,
char *prefix);
/*
 * Global variable: the ALPS pls module function table returned by
 * pls_alps_init().  The initializer is positional, so the order of the
 * entries must match the field order of orte_pls_base_module_1_3_0_t.
 */
orte_pls_base_module_1_3_0_t orte_pls_alps_module = {
pls_alps_launch_job,
pls_alps_terminate_job,
pls_alps_terminate_orteds,
pls_alps_terminate_proc,
pls_alps_signal_job,
pls_alps_signal_proc,
pls_alps_finalize
};
/*
 * Local variables
 */
/* pid of the forked launcher process; 0 until pls_alps_start_proc()
   has forked one */
static pid_t alps_pid = 0;
/* job id most recently passed to pls_alps_launch_job(); read by
   alps_wait_cb() when it reports a failure */
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
/* set to true at the start of pls_alps_launch_job() and cleared once
   the launch has succeeded */
static bool failed_launch;
/*
 * Launch the given job under ALPS.
 *
 * Fetches the job map, and if new daemons are required builds an
 * "aprun ... orted ..." command line plus environment, starts it via
 * pls_alps_start_proc(), waits for the daemons to call back, and then
 * launches the applications.  If no new daemons are needed only the
 * applications are launched.
 *
 * Returns ORTE_SUCCESS on success or an ORTE error code on failure.
 *
 * When working in this function, ALWAYS jump to "cleanup" if
 * you encounter an error so that orterun will be woken up and
 * the job can cleanly terminate
 */
static int pls_alps_launch_job(orte_jobid_t jobid)
{
    orte_job_map_t *map = NULL;
    opal_list_item_t *item;
    size_t num_nodes;
    char *param;
    char **argv = NULL;
    int argc = 0;
    int rc;
    char **env = NULL;
    char *var;
    char *nodelist_flat = NULL;
    char **nodelist_argv = NULL;
    int nodelist_argc = 0;
    orte_process_name_t name;
    char *name_string;
    char **custom_strings;
    int num_args, i;
    char *cur_prefix = NULL;
    char count_str[32];
    struct timeval joblaunchstart = {0, 0};
    struct timeval launchstart = {0, 0};
    struct timeval launchstop;
    /* track which timestamps were actually taken so the timing report
       never reads a value that was not set */
    bool have_jobstart = false;
    bool have_launchstart = false;
    int proc_name_index = 0;

    if (mca_pls_alps_component.timing) {
        if (0 != gettimeofday(&joblaunchstart, NULL)) {
            opal_output(0, "pls_alps: could not obtain job start time");
        } else {
            have_jobstart = true;
        }
    }

    /* save the active jobid */
    active_job = jobid;

    /* indicate the state of the launch */
    failed_launch = true;

    /* Query the map for this job.
     * We need the entire mapping for a couple of reasons:
     *  - need the prefix to start with.
     *  - need to know if we are launching on a subset of the allocated nodes
     * All other mapping responsibilities fall to orted in the fork PLS
     */
    rc = orte_rmaps.get_job_map(&map, jobid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    num_nodes = map->num_new_daemons;
    if (num_nodes == 0) {
        /* no new daemons required - just launch apps */
        goto launch_apps;
    }

    /*
     * ALPS aprun OPTIONS
     */

    /* add the aprun command */
    opal_argv_append(&argc, &argv, "aprun");

    /* Append user defined arguments to aprun */
    if (NULL != mca_pls_alps_component.custom_args) {
        custom_strings = opal_argv_split(mca_pls_alps_component.custom_args, ' ');
        num_args = opal_argv_count(custom_strings);
        for (i = 0; i < num_args; ++i) {
            opal_argv_append(&argc, &argv, custom_strings[i]);
        }
        opal_argv_free(custom_strings);
    }

    /* number of processors needed.  Each option and its value is a
     * separate argv element: the array is handed to execve() as-is, so
     * there is no shell to split a combined "-n 4" string
     */
    snprintf(count_str, sizeof(count_str), "%lu", (unsigned long) num_nodes);
    opal_argv_append(&argc, &argv, "-n");
    opal_argv_append(&argc, &argv, count_str);
    opal_argv_append(&argc, &argv, "-N");
    opal_argv_append(&argc, &argv, "1");

    /* create nodelist */
    for (item = opal_list_get_first(&map->nodes);
         item != opal_list_get_end(&map->nodes);
         item = opal_list_get_next(item)) {
        orte_mapped_node_t *node = (orte_mapped_node_t *) item;

        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (node->daemon_preexists) {
            continue;
        }

        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append(&nodelist_argc, &nodelist_argv, node->nodename);
    }
    if (0 == opal_argv_count(nodelist_argv)) {
        opal_show_help("help-pls-alps.txt", "no-hosts-in-list", true);
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    nodelist_flat = opal_argv_join(nodelist_argv, ',');
    opal_argv_free(nodelist_argv);
    nodelist_argv = NULL;
    if (NULL == nodelist_flat) {
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    opal_argv_append(&argc, &argv, "-L");
    opal_argv_append(&argc, &argv, nodelist_flat);

    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    opal_argv_append(&argc, &argv, mca_pls_alps_component.orted);

    /* ensure we don't lose contact */
    orte_no_daemonize_flag = true;

    /* Add basic orted command line options, including debug flags */
    orte_pls_base_orted_append_basic_args(&argc, &argv,
                                          &proc_name_index,
                                          NULL);

    /* force orted to use the alps sds */
    opal_argv_append(&argc, &argv, "--ns-nds");
    opal_argv_append(&argc, &argv, "alps");

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    name.jobid = 0;
    name.vpid = map->daemon_vpid_start;
    rc = orte_ns.get_proc_name_string(&name_string, &name);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "pls_alps: unable to create process name");
        goto cleanup;
    }
    free(argv[proc_name_index]);
    argv[proc_name_index] = strdup(name_string);
    free(name_string);

    if (mca_pls_alps_component.debug) {
        param = opal_argv_join(argv, ' ');
        if (NULL != param) {
            opal_output(0, "pls:alps: final top-level argv:");
            opal_output(0, "pls:alps:     %s", param);
            free(param);
        }
    }

    /* Copy the prefix-directory specified in the
       corresponding app_context.  If there are multiple,
       different prefix's in the app context, complain (i.e., only
       allow one --prefix option for the entire alps run -- we
       don't support different --prefix'es for different nodes in
       the ALPS pls) */
    for (i = 0; i < map->num_apps; i++) {
        char *app_prefix_dir = map->apps[i]->prefix_dir;

        if (NULL == app_prefix_dir) {
            continue;
        }

        /* Check for already set cur_prefix -- if different, complain.
           Leave through "cleanup" so resources are released and the
           failed launch is reported. */
        if (NULL != cur_prefix &&
            0 != strcmp(cur_prefix, app_prefix_dir)) {
            opal_show_help("help-pls-alps.txt", "multiple-prefixes",
                           true, cur_prefix, app_prefix_dir);
            rc = ORTE_ERR_FATAL;
            goto cleanup;
        }

        /* If not yet set, copy it; iff set, then it's the
           same anyway */
        if (NULL == cur_prefix) {
            cur_prefix = strdup(app_prefix_dir);
            if (mca_pls_alps_component.debug) {
                opal_output(0, "pls:alps: Set prefix:%s",
                            cur_prefix);
            }
        }
    }

    /* setup environment */
    env = opal_argv_copy(environ);

    /* purge it of any params not for orteds */
    orte_pls_base_purge_mca_params(&env);

    /* add the nodelist (nodelist_flat itself is released in cleanup) */
    var = mca_base_param_environ_variable("orte", "alps", "nodelist");
    opal_setenv(var, nodelist_flat, true, &env);
    free(var);

    if (mca_pls_alps_component.timing) {
        if (0 != gettimeofday(&launchstart, NULL)) {
            opal_output(0, "pls_alps: could not obtain start time");
        } else {
            have_launchstart = true;
        }
    }

    /* exec the daemon(s) */
    if (ORTE_SUCCESS != (rc = pls_alps_start_proc(argc, argv, env, cur_prefix))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* do NOT wait for alps to complete. Alps only completes when the processes
     * it starts - in this case, the orteds - complete. Instead, we'll catch
     * any alps failures and deal with them elsewhere
     */

    /* wait for daemons to callback */
    if (ORTE_SUCCESS != (rc = orte_pls_base_daemon_callback(map->num_new_daemons))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

launch_apps:
    if (ORTE_SUCCESS != (rc = orte_pls_base_launch_apps(map))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* declare the launch a success */
    failed_launch = false;

    if (mca_pls_alps_component.timing) {
        if (0 != gettimeofday(&launchstop, NULL)) {
            opal_output(0, "pls_alps: could not obtain stop time");
        } else {
            /* the daemon figure only exists if daemons were launched */
            if (have_launchstart) {
                opal_output(0, "pls_alps: daemon block launch time is %ld usec",
                            (long) (launchstop.tv_sec - launchstart.tv_sec) * 1000000 +
                            (long) (launchstop.tv_usec - launchstart.tv_usec));
            }
            if (have_jobstart) {
                opal_output(0, "pls_alps: total job launch time is %ld usec",
                            (long) (launchstop.tv_sec - joblaunchstart.tv_sec) * 1000000 +
                            (long) (launchstop.tv_usec - joblaunchstart.tv_usec));
            }
        }
    }

    /* JMS: should we stash the alps pid in the gpr somewhere for cleanup? */

cleanup:
    if (NULL != map) {
        OBJ_RELEASE(map);
    }
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }
    free(nodelist_flat);
    free(cur_prefix);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        orte_pls_base_daemon_failed(jobid, false, -1, 0, ORTE_JOB_STATE_FAILED_TO_START);
    }

    return rc;
}
/*
 * Terminate a job by asking the daemons to kill their local processes
 * that belong to it.  Any failure is logged and its code is returned.
 */
static int pls_alps_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
{
    int rc = orte_pls_base_orted_kill_local_procs(jobid, timeout, attrs);

    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}
/**
 * Terminate the orteds.
 *
 * The waitpid callback registered for the launcher process is cancelled
 * first, so that the launcher exiting as a consequence of this shutdown
 * is not reported as a failure.  The result of the cancel is
 * deliberately not checked or logged: the launcher may already have
 * completed, and logging would only produce confusing, duplicate error
 * messages.
 */
static int pls_alps_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
    int rc;

    orte_wait_cb_cancel(alps_pid);

    /* now order the daemons to exit */
    rc = orte_pls_base_orted_exit(timeout, attrs);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}
/*
 * Terminating a single process is not possible with the way ALPS is
 * used here -- only the entire job can be killed.  Reports that and
 * returns ORTE_ERR_NOT_SUPPORTED; the name argument is ignored.
 */
static int pls_alps_terminate_proc(const orte_process_name_t *name)
{
    (void) name;

    opal_output(0, "pls:alps:terminate_proc: not supported");

    return ORTE_ERR_NOT_SUPPORTED;
}
/**
 * Signal all the processes in the child alps by sending the signal
 * directly to the launcher process.
 *
 * The signal is only sent when a launcher has actually been forked
 * (alps_pid > 0).  The positive check matters: kill() treats a pid of
 * -1 as "every process the caller may signal" and a pid of 0 as "the
 * caller's own process group", so neither value must ever reach it.
 *
 * The jobid and attrs arguments are not used.  Always returns
 * ORTE_SUCCESS.
 */
static int pls_alps_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs)
{
    if (alps_pid > 0) {
        kill(alps_pid, (int) signal);
    }
    return ORTE_SUCCESS;
}
/*
 * Signalling one specific process is not supported by this module.
 * Reports that and returns ORTE_ERR_NOT_SUPPORTED; both arguments are
 * ignored.
 */
static int pls_alps_signal_proc(const orte_process_name_t *name, int32_t signal)
{
    (void) name;
    (void) signal;

    opal_output(0, "pls:alps:signal_proc: not supported");

    return ORTE_ERR_NOT_SUPPORTED;
}
/*
 * Module finalize: stop the pls base communication (cleaning up any
 * pending receives).  A failure there is logged but not propagated --
 * the function always returns ORTE_SUCCESS.
 */
static int pls_alps_finalize(void)
{
    int rc = orte_pls_base_comm_stop();

    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    return ORTE_SUCCESS;
}
/*
 * waitpid callback for the launcher process started by
 * pls_alps_start_proc().
 *
 * According to the ALPS folks, alps always returns the highest exit
 * code of our remote processes.  A non-zero status therefore does not
 * necessarily mean that alps itself failed -- an orted may have
 * returned it -- but either way the job did not run properly.  The
 * status could equally stem from errno (alps failed), from an orted,
 * or from the OS (e.g. the orted binary could not be found); nobody has
 * sorted those cases out into a prettier message yet.  For now the only
 * thing that matters is that something failed: report it and make sure
 * orterun wakes up.  A zero status requires no action.
 */
static void alps_wait_cb(pid_t pid, int status, void *cbdata)
{
    if (0 == status) {
        return;
    }

    if (!failed_launch) {
        /* the launch had already succeeded, so an orted must have died
         * unexpectedly afterwards - report the failure so we exit
         */
        orte_pls_base_daemon_failed(active_job, false, pid, status, ORTE_JOB_STATE_ABORTED);
        return;
    }

    /* the problem occurred while the launch was still in progress */
    opal_output(0, "ERROR: alps failed to start the required daemons.");
    opal_output(0, "ERROR: This could be due to an inability to find the orted binary");
    opal_output(0, "ERROR: on one or more remote nodes, lack of authority to execute");
    opal_output(0, "ERROR: on one or more specified nodes, or other factors.");

    /* report the daemon failure so we break out of the daemon
     * callback receive and exit
     */
    orte_pls_base_daemon_failed(active_job, true, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
}
/*
 * Fork and exec the launcher command described by argv.
 *
 * argc    number of entries in argv (unused)
 * argv    NULL-terminated command line; argv[0] is resolved to an
 *         executable path with opal_path_findv()
 * env     environment passed to execve(); PATH and LD_LIBRARY_PATH are
 *         adjusted in the child when a prefix is given
 * prefix  optional installation prefix (may be NULL)
 *
 * On success the child's pid is stored in alps_pid and alps_wait_cb()
 * is registered for it.  Returns ORTE_SUCCESS, ORTE_ERR_NOT_FOUND if
 * argv[0] could not be resolved, or ORTE_ERR_SYS_LIMITS_CHILDREN if
 * fork() failed.  On failure alps_pid is left untouched, so it never
 * holds the -1 returned by fork().
 */
static int pls_alps_start_proc(int argc, char **argv, char **env,
                               char *prefix)
{
    int fd;
    pid_t pid;
    char *exec_argv;

    (void) argc;

    exec_argv = opal_path_findv(argv[0], 0, env, NULL);
    if (NULL == exec_argv) {
        return ORTE_ERR_NOT_FOUND;
    }

    pid = fork();
    if (-1 == pid) {
        /* nothing was started: release the resolved path before bailing */
        free(exec_argv);
        ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
        return ORTE_ERR_SYS_LIMITS_CHILDREN;
    }

    if (0 == pid) {  /* child */
        char *bin_base = NULL, *lib_base = NULL;

        /* Figure out the basenames for the libdir and bindir.  There
           is a lengthy comment about this in pls_rsh_module.c
           explaining all the rationale for how / why we're doing
           this. */
        lib_base = opal_basename(opal_install_dirs.libdir);
        bin_base = opal_basename(opal_install_dirs.bindir);

        /* If we have a prefix, then modify the PATH and
           LD_LIBRARY_PATH environment variables.  */
        if (NULL != prefix) {
            char *oldenv, *newenv;

            /* Reset PATH */
            oldenv = getenv("PATH");
            if (NULL != oldenv) {
                asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);
            } else {
                asprintf(&newenv, "%s/%s", prefix, bin_base);
            }
            opal_setenv("PATH", newenv, true, &env);
            if (mca_pls_alps_component.debug) {
                opal_output(0, "pls:alps: reset PATH: %s", newenv);
            }
            free(newenv);

            /* Reset LD_LIBRARY_PATH */
            oldenv = getenv("LD_LIBRARY_PATH");
            if (NULL != oldenv) {
                asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv);
            } else {
                asprintf(&newenv, "%s/%s", prefix, lib_base);
            }
            opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
            if (mca_pls_alps_component.debug) {
                opal_output(0, "pls:alps: reset LD_LIBRARY_PATH: %s",
                            newenv);
            }
            free(newenv);
        }

        /* point stdin at /dev/null (if fd is 0 it already is stdin) */
        fd = open("/dev/null", O_CREAT|O_WRONLY|O_TRUNC, 0666);
        if (fd > 0) {
            dup2(fd, 0);
        }

        /* When not in debug mode and --debug-daemons was not passed,
         * tie stdout/stderr to dev null so we don't see messages from orted */
        if (0 == mca_pls_alps_component.debug && !orte_debug_daemons_flag) {
            if (fd >= 0) {
                if (fd != 1) {
                    dup2(fd, 1);
                }
                if (fd != 2) {
                    dup2(fd, 2);
                }
            }
        }

        /* only close the descriptor if it is not one of stdin/out/err */
        if (fd > 2) {
            close(fd);
        }

        /* get the alps process out of orterun's process group so that
           signals sent from the shell (like those resulting from
           cntl-c) don't get sent to alps */
        setpgid(0, 0);

        execve(exec_argv, argv, env);

        /* execve() only returns on failure */
        opal_output(0, "pls:alps:start_proc: exec failed");

        /* don't return - need to exit - returning would be bad -
           we're not in the calling process anymore */
        exit(1);
    }

    /* parent */
    alps_pid = pid;

    /* just in case, make sure that the alps process is not in our
       process group any more.  Stevens says always do this on both
       sides of the fork... */
    setpgid(alps_pid, alps_pid);

    /* setup the waitpid so we can find out if alps succeeds! */
    orte_wait_cb(alps_pid, alps_wait_cb, NULL);

    free(exec_argv);

    return ORTE_SUCCESS;
}

58
orte/mca/ras/alps/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,58 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008 UT-Battelle, LLC
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Preprocessor flags for this component. ras_alps_CPPFLAGS is not set in
# this file (presumably supplied by configure -- confirm).
AM_CPPFLAGS = $(ras_alps_CPPFLAGS)
# Help-message file shipped and installed as package data.
dist_pkgdata_DATA = help-ras-alps.txt
# Files that make up the ALPS ras component.
sources = \
ras_alps.h \
ras_alps_component.c \
ras_alps_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of the two targets receives the source list; the other is
# left empty, depending on the OMPI_BUILD_ras_alps_DSO conditional.
if OMPI_BUILD_ras_alps_DSO
lib =
lib_sources =
component = mca_ras_alps.la
component_sources = $(sources)
else
lib = libmca_ras_alps.la
lib_sources = $(sources)
component =
component_sources =
endif
# DSO build: installed into $(pkglibdir) and linked against the ALPS
# libraries plus the ORTE and OPAL libraries.
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component)
mca_ras_alps_la_SOURCES = $(component_sources)
mca_ras_alps_la_LDFLAGS = -module -avoid-version $(ras_alps_LDFLAGS)
mca_ras_alps_la_LIBADD = \
$(ras_alps_LIBS) \
$(top_ompi_builddir)/orte/libopen-rte.la \
$(top_ompi_builddir)/opal/libopen-pal.la
# Static build: a non-installed convenience library.
noinst_LTLIBRARIES = $(lib)
libmca_ras_alps_la_SOURCES = $(lib_sources)
libmca_ras_alps_la_LDFLAGS = -module -avoid-version $(ras_alps_LDFLAGS)
libmca_ras_alps_la_LIBADD = $(ras_alps_LIBS)

25
orte/mca/ras/alps/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,25 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008 UT-Battelle, LLC
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ras_alps_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Configure hook for the ALPS ras component. It delegates to
# OMPI_CHECK_ALPS with the ras_alps prefix and forwards both
# caller-supplied actions unchanged.
AC_DEFUN([MCA_ras_alps_CONFIG],[
OMPI_CHECK_ALPS([ras_alps], [$1], [$2])
])dnl

23
orte/mca/ras/alps/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,23 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2008 UT-Battelle, LLC
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# NOTE(review): presumably the list of files configure generates for this
# component (only "Makefile" here) -- confirm against the MCA build framework.
PARAM_CONFIG_FILES="Makefile"

43
orte/mca/ras/alps/help-ras-alps.txt Обычный файл
Просмотреть файл

@ -0,0 +1,43 @@
# -*- text -*-
#
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for Open MPI MCA error messages.
#
[alps-env-var-not-found]
While trying to determine what resources are available, the ALPS
resource allocator expects to find the following environment variables:
BATCH_PARTITION_ID
However, it was unable to find the following environment variable:
%s
#This is a fatal error.
[alps-env-var-bad-value]
While trying to determine what resources are available, the ALPS
resource allocator uses the following environment variables:
ALPS_NODELIST value: %s
ALPS_TASKS_PER_NODE value: %s
However, an error was encountered when trying to parse the following variable:
%s
#This is a fatal error.

41
orte/mca/ras/alps/ras_alps.h Обычный файл
Просмотреть файл

@ -0,0 +1,41 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Resource Allocation (ALPS)
*/
#ifndef ORTE_RAS_ALPS_H
#define ORTE_RAS_ALPS_H
#include "orte/mca/ras/ras.h"
#include "orte/mca/ras/base/base.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* Component descriptor for the ALPS resource allocator
 * (defined in ras_alps_component.c). */
ORTE_DECLSPEC extern orte_ras_base_component_t mca_ras_alps_component;
/* Module function table returned by the component's init routine when
 * the component is available (defined in ras_alps_module.c). */
ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_alps_module;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* ORTE_RAS_ALPS_H */

109
orte/mca/ras/alps/ras_alps_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,109 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/orte_constants.h"
#include "orte/util/proc_info.h"
#include "ras_alps.h"
/*
* Local variables
*/
static int param_priority;
/*
* Local functions
*/
static int ras_alps_open(void);
static orte_ras_base_module_t *ras_alps_init(int*);
/*
 * Public descriptor of the ALPS ras component.  The initializer is
 * positional: the nested component header, the metadata block, and
 * finally the init/query function.
 */
orte_ras_base_component_t mca_ras_alps_component = {
/* First, the mca_base_component_t struct containing meta
information about the component itself */
{
/* Indicate that we are a ras v1.3.0 component (which also
implies a specific MCA version) */
ORTE_RAS_BASE_VERSION_1_3_0,
/* Component name and version */
"alps",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions; no close function is
supplied (NULL) */
ras_alps_open,
NULL
},
/* Next the MCA v1.0.0 component meta data */
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Init function: decides whether the component can be selected and
reports its priority */
ras_alps_init
};
/*
 * Component open hook.
 *
 * Registers the integer MCA parameter "priority" (default 75) for this
 * component and stores the returned handle in param_priority so that the
 * value can be looked up at selection time.  Always reports success.
 */
static int ras_alps_open(void)
{
    int handle;

    handle = mca_base_param_reg_int(&mca_ras_alps_component.ras_version,
                                    "priority",
                                    "Priority of the alps ras component",
                                    false, false, 75, NULL);
    param_priority = handle;

    return ORTE_SUCCESS;
}
/*
 * Component init/query hook.
 *
 * Returns the ALPS ras module, and writes the configured priority to
 * *priority, when this process is the seed (HNP) and the environment
 * variable BATCH_PARTITION_ID is set.  Returns NULL otherwise.
 */
static orte_ras_base_module_t *ras_alps_init(int *priority)
{
    const char *partition;

    /* only the HNP may select this component */
    if (!orte_process_info.seed) {
        return NULL;
    }

    /* Are we running under an ALPS job? */
    partition = getenv("BATCH_PARTITION_ID");
    if (NULL == partition) {
        /* Sadly, no */
        opal_output(orte_ras_base.ras_output,
                    "ras:alps: NOT available for selection");
        return NULL;
    }

    mca_base_param_lookup_int(param_priority, priority);
    opal_output(orte_ras_base.ras_output,
                "ras:alps: available for selection");
    return &orte_ras_alps_module;
}

140
orte/mca/ras/alps/ras_alps_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,140 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "orte/dss/dss.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ras/base/ras_private.h"
#include "ras_alps.h"
/*
* Local functions
*/
static int orte_ras_alps_allocate(orte_jobid_t jobid, opal_list_t *attributes);
static int orte_ras_alps_deallocate(orte_jobid_t jobid);
static int orte_ras_alps_finalize(void);
/*
* Global variable
*/
/*
 * Function table of the ALPS ras module.  The initializer is positional
 * (slot meaning is fixed by orte_ras_base_module_t, declared elsewhere).
 * Only allocate, deallocate and finalize are implemented in this file;
 * every other slot is filled with a shared orte_ras_base_* routine.
 */
orte_ras_base_module_t orte_ras_alps_module = {
orte_ras_alps_allocate,
orte_ras_base_node_insert,
orte_ras_base_node_query,
orte_ras_base_node_query_alloc,
orte_ras_base_node_lookup,
orte_ras_base_proc_query_alloc,
orte_ras_alps_deallocate,
orte_ras_alps_finalize
};
/**
 * Discover available (pre-allocated) nodes and allocate them to the job.
 *
 * The node list is obtained by running an apstat/grep/awk shell pipeline
 * through system(); the pipeline writes one node id per line to
 * ./ompi_ras_alps_node_file, which is then parsed with
 * orte_ras_base_read_nodename_file().  The resulting nodes are handed to
 * orte_ras_base_allocate_nodes() and orte_ras_base_node_insert().
 *
 * @param jobid       Job the nodes are allocated to.
 * @param attributes  Not used by this implementation.
 *
 * @return ORTE_SUCCESS on success;
 *         ORTE_ERR_NOT_FOUND if BATCH_PARTITION_ID is not set;
 *         ORTE_ERROR if the shell command reports a non-zero status;
 *         otherwise the error code of the first step that failed.
 */
static int orte_ras_alps_allocate(orte_jobid_t jobid, opal_list_t *attributes)
{
    int ret;
    char *alps_batch_id;
    opal_list_t nodes;
    opal_list_item_t *item;
    char *alps_node_cmd_str = "apstat -a `apstat -r | grep $BATCH_PARTITION_ID | awk '{print $2}'` "
        " -r -v | egrep \"(nid [0-9]+)\" -o | awk '{print $2}' > ./ompi_ras_alps_node_file";

    /* The pipeline expands $BATCH_PARTITION_ID itself, so the variable
     * only needs to be checked for presence here. */
    alps_batch_id = getenv("BATCH_PARTITION_ID");
    if (NULL == alps_batch_id) {
        opal_show_help("help-ras-alps.txt", "alps-env-var-not-found", 1,
                       "BATCH_PARTITION_ID");
        return ORTE_ERR_NOT_FOUND;
    }

    /* NOTE(review): the command is a pipeline, so the status seen here is
     * presumably that of its last stage only -- verify. */
    if (system(alps_node_cmd_str)) {
        opal_output(0, "Error in orte_ras_alps_allocate: system call returned an error, for reference I tried to run: %s",
                    alps_node_cmd_str);
        return ORTE_ERROR;
    }

    OBJ_CONSTRUCT(&nodes, opal_list_t);

    if (ORTE_SUCCESS != (ret = orte_ras_base_read_nodename_file(&nodes, "./ompi_ras_alps_node_file"))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    /* Check each step separately: previously the result of the allocation
     * was overwritten by the result of the insert, so an allocation
     * failure could be reported as success. */
    if (ORTE_SUCCESS != (ret = orte_ras_base_allocate_nodes(jobid, &nodes))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = orte_ras_base_node_insert(&nodes))) {
        ORTE_ERROR_LOG(ret);
    }

cleanup:
    /* Drop our references to the list items, then tear down the list. */
    while (NULL != (item = opal_list_remove_first(&nodes))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&nodes);

    /* All done */
    if (ORTE_SUCCESS == ret) {
        opal_output(orte_ras_base.ras_output,
                    "ras:alps:allocate: success");
    } else {
        opal_output(orte_ras_base.ras_output,
                    "ras:alps:allocate: failure (base_allocate_nodes=%d)", ret);
    }
    return ret;
}
/*
 * Deallocation hook.  No resources are released here; the function only
 * emits a diagnostic message and reports success.
 */
static int orte_ras_alps_deallocate(orte_jobid_t jobid)
{
    const int status = ORTE_SUCCESS;

    opal_output(orte_ras_base.ras_output,
                "ras:alps:deallocate: success (nothing to do)");

    return status;
}
/*
 * Finalize hook.  There is no module state to tear down; the function only
 * emits a diagnostic message and reports success.
 */
static int orte_ras_alps_finalize(void)
{
    const int status = ORTE_SUCCESS;

    opal_output(orte_ras_base.ras_output,
                "ras:alps:finalize: success (nothing to do)");

    return status;
}

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -118,3 +119,65 @@ CLEANUP:
return ORTE_SUCCESS;
}
#define RAS_BASE_FILE_MAX_LINE_LENGTH 512

/*
 * Read the next line from fp and return it as a newly allocated,
 * NUL-terminated string with the trailing newline (if any) removed.
 *
 * At most RAS_BASE_FILE_MAX_LINE_LENGTH - 1 bytes are consumed per call;
 * a longer line is returned in pieces by successive calls.
 *
 * Returns NULL at end of file, on a read error, or if memory cannot be
 * allocated.  The caller owns the returned string and must free() it.
 */
static char *ras_getline(FILE *fp)
{
    char input[RAS_BASE_FILE_MAX_LINE_LENGTH];
    char *buff;
    size_t len;

    if (NULL == fgets(input, RAS_BASE_FILE_MAX_LINE_LENGTH, fp)) {
        return NULL;
    }

    /* Strip the newline only when one is actually present.  Blindly
     * clearing the last byte would eat a real character on a final line
     * without '\n' (or on an over-long line) and would index before the
     * buffer if the data started with a NUL byte. */
    len = strcspn(input, "\n");

    /* The length is already known, so copy exactly that many bytes. */
    buff = malloc(len + 1);
    if (NULL == buff) {
        return NULL;
    }
    memcpy(buff, input, len);
    buff[len] = '\0';

    return buff;
}
/*
 * Build a node list from a file containing one node name per line.
 *
 * Consecutive identical names are folded into a single orte_ras_node_t
 * whose node_slots counts the repetitions; every new name gets the next
 * launch_id (starting at 0) and is appended to `nodes`.  The node takes
 * ownership of the name string.
 *
 * Returns ORTE_SUCCESS, or ORTE_ERR_FILE_OPEN_FAILURE if the file cannot
 * be opened for reading.
 */
int orte_ras_base_read_nodename_file(opal_list_t *nodes, char *filename)
{
    FILE *stream;
    orte_ras_node_t *current = NULL;
    int32_t next_id = 0;
    char *name;

    stream = fopen(filename, "r");
    if (NULL == stream) {
        ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
        return ORTE_ERR_FILE_OPEN_FAILURE;
    }

    for (name = ras_getline(stream); NULL != name; name = ras_getline(stream)) {
        opal_output(orte_ras_base.ras_output,
                    "ras:base:read_nodename: got hostname %s", name);

        if (NULL == current || 0 != strcmp(current->node_name, name)) {
            /* a name we have not just seen: start a new list entry */
            opal_output(orte_ras_base.ras_output,
                        "ras:base:read_nodename: not found -- added to list");
            current = OBJ_NEW(orte_ras_node_t);
            current->node_name = name;
            current->launch_id = next_id++;
            current->node_slots_inuse = 0;
            current->node_slots_max = 0;
            current->node_slots = 1;
            opal_list_append(nodes, &current->super);
        } else {
            /* same as the previous entry: one more slot on that node,
             * and the duplicate string is no longer needed */
            ++current->node_slots;
            free(name);
        }
    }

    fclose(stream);
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -86,6 +87,8 @@ ORTE_DECLSPEC int orte_ras_base_set_oversubscribe_override(orte_jobid_t job);
ORTE_DECLSPEC int orte_ras_base_get_oversubscribe_override(orte_jobid_t job, bool *flag);
/*
 * Read node names (one per line) from the named file and append an
 * orte_ras_node_t to `nodes` for each run of identical consecutive names,
 * counting the repetitions as slots.  Returns ORTE_SUCCESS, or
 * ORTE_ERR_FILE_OPEN_FAILURE if the file cannot be opened.
 */
ORTE_DECLSPEC int orte_ras_base_read_nodename_file(opal_list_t *nodes, char *filename);
/*
* Query the registry for all available nodes
*/

51
orte/mca/sds/alps/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,51 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008 UT-Battelle, LLC
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Use the top-level Makefile.options
# NOTE(review): nothing in this file references Makefile.options; the
# line above may be stale -- confirm.
# Files that make up the sds/alps component.
sources = \
sds_alps.h \
sds_alps_component.c \
sds_alps_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_sds_alps_DSO
component_noinst =
component_install = mca_sds_alps.la
else
component_noinst = libmca_sds_alps.la
component_install =
endif
# DSO flavour: placed in $(pkglibdir) and linked against the ORTE and
# OPAL libraries.
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sds_alps_la_SOURCES = $(sources)
mca_sds_alps_la_LDFLAGS = -module -avoid-version
mca_sds_alps_la_LIBADD = \
$(top_ompi_builddir)/orte/libopen-rte.la \
$(top_ompi_builddir)/opal/libopen-pal.la
# Static flavour: a noinst (not installed) libtool library built from the
# same sources.
noinst_LTLIBRARIES = $(component_noinst)
libmca_sds_alps_la_SOURCES =$(sources)
libmca_sds_alps_la_LDFLAGS = -module -avoid-version

27
orte/mca/sds/alps/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,27 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008 UT-Battelle, LLC
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_sds_alps_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Invokes OMPI_CHECK_ALPS for the sds_alps prefix and, nested inside it,
# an AC_CHECK_FUNC test for cnos_get_rank.  action-if-found is reached only
# through the cnos_get_rank check; action-if-not-found is used when either
# check fails.  (Presumably the second and third arguments of
# OMPI_CHECK_ALPS are its found / not-found actions -- verify against its
# definition.)
AC_DEFUN([MCA_sds_alps_CONFIG],[
OMPI_CHECK_ALPS([sds_alps],
[AC_CHECK_FUNC([cnos_get_rank], [$1], [$2])],
[$2])
])dnl

23
orte/mca/sds/alps/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,23 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2008 UT-Battelle, LLC
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Config-file list for this component: only its Makefile.
# (Presumably consumed by the top-level configure machinery -- verify.)
PARAM_CONFIG_FILES="Makefile"

50
orte/mca/sds/alps/sds_alps.h Обычный файл
Просмотреть файл

@ -0,0 +1,50 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
 * Declarations for the ALPS sds component and module.
 *
 * NOTE(review): this header names orte_sds_base_module_t but includes
 * nothing itself; the .c files of this component include
 * "orte/mca/sds/sds.h" before including it.
 */
#ifndef ORTE_SDS_ALPS_H
#define ORTE_SDS_ALPS_H
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
 * Component open / close and init (selection) entry points
 */
int orte_sds_alps_component_open(void);
int orte_sds_alps_component_close(void);
orte_sds_base_module_t* orte_sds_alps_component_init(int *priority);
/*
 * Startup / Shutdown
 */
int orte_sds_alps_finalize(void);
/*
 * Module functions
 */
int orte_sds_alps_set_name(void);
/* NOTE(review): the module table of this component uses
 * orte_sds_base_basic_contact_universe instead, and no definition of the
 * function below appears in the component's .c files -- confirm whether
 * this declaration is still needed. */
int orte_sds_alps_contact_universe(void);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* ORTE_SDS_ALPS_H */

106
orte/mca/sds/alps/sds_alps_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,106 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/mca/sds/sds.h"
#include "orte/mca/sds/alps/sds_alps.h"
#include "opal/mca/base/mca_base_param.h"
extern orte_sds_base_module_t orte_sds_alps_module;
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
/*
 * Public descriptor of the ALPS sds component.  The initializer is
 * positional: the nested component header, the metadata block, and
 * finally the init/query function.
 */
orte_sds_base_component_t mca_sds_alps_component = {
/* First, the mca_component_t struct containing meta information
about the component itself */
{
/* Indicate that we are a sds v1.0.0 component (which also
implies a specific MCA version) */
ORTE_SDS_BASE_VERSION_1_0_0,
/* Component name and version */
"alps",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
orte_sds_alps_component_open,
orte_sds_alps_component_close
},
/* Next the MCA v1.0.0 component meta data */
{
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
},
/* Initialization / querying functions */
orte_sds_alps_component_init
};
/*
 * Component open hook: nothing to set up, so it always reports success.
 */
int orte_sds_alps_component_open(void)
{
    const int status = ORTE_SUCCESS;

    return status;
}
/*
 * Component init/query hook.
 *
 * Looks up the string MCA parameter registered as ("ns", "nds") and
 * selects this component only when its value is exactly "alps".
 *
 * @param priority  Set to 35 when the component is selected; left
 *                  untouched otherwise.
 *
 * @return Pointer to the ALPS sds module when selected, NULL otherwise.
 */
orte_sds_base_module_t *
orte_sds_alps_component_init(int *priority)
{
    int id;
    /* Start from NULL so that a lookup which stores nothing cannot leave
     * an indeterminate pointer to be compared and freed below. */
    char *mode = NULL;

    id = mca_base_param_register_string("ns", "nds", NULL, NULL, NULL);
    mca_base_param_lookup_string(id, &mode);

    if (NULL == mode) {
        return NULL;
    }

    if (0 != strcmp("alps", mode)) {
        free(mode);
        return NULL;
    }

    /* the looked-up string is freed on every path that obtained one */
    free(mode);

    *priority = 35;
    return &orte_sds_alps_module;
}
/*
 * Component close hook: nothing to tear down, so it always reports success.
 */
int orte_sds_alps_component_close(void)
{
    const int status = ORTE_SUCCESS;

    return status;
}

84
orte/mca/sds/alps/sds_alps_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,84 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include <catamount/cnos_mpi_os.h>
#include "orte/orte_constants.h"
#include "orte/mca/sds/sds.h"
#include "orte/mca/sds/base/base.h"
#include "orte/mca/sds/alps/sds_alps.h"
#include "orte/util/proc_info.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/errmgr/base/base.h"
/*
 * Function table of the ALPS sds module.  The initializer is positional
 * (slot meaning is fixed by orte_sds_base_module_t, declared elsewhere):
 * the shared base contact-universe routine, followed by this component's
 * own set_name and finalize functions.
 */
orte_sds_base_module_t orte_sds_alps_module = {
orte_sds_base_basic_contact_universe,
orte_sds_alps_set_name,
orte_sds_alps_finalize,
};
/*
 * Establish this process's name and the process count.
 *
 * Seed process: the name comes from orte_ns.create_my_name() and
 * num_procs is set to 1.
 *
 * Any other process: the name is built from jobid 0 and a vpid equal to
 * cnos_get_rank() + 1, and num_procs is taken from cnos_get_size().
 *
 * Returns ORTE_SUCCESS, or the error code of the failing name-service
 * call (which is also logged).
 */
int
orte_sds_alps_set_name(void)
{
    int rc;
    orte_jobid_t jobid;
    orte_vpid_t vpid;

    if (!orte_process_info.seed) {
        /* The jobid is made up; the vpid is derived from the CNOS rank.
         * NOTE(review): an earlier comment on this assignment read
         * "not 0, since it has special meaning" although 0 is what is
         * assigned -- confirm which was intended. */
        jobid = 0;
        vpid = (orte_vpid_t) cnos_get_rank() + 1;

        rc = orte_ns.create_process_name(&(orte_process_info.my_name),
                                         jobid,
                                         vpid);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        orte_process_info.num_procs = (orte_std_cntr_t) cnos_get_size();
        return ORTE_SUCCESS;
    }

    /* seed: let the name service pick our name */
    rc = orte_ns.create_my_name();
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    orte_process_info.num_procs = 1;
    return rc;
}
/*
 * Module finalize hook: there is no state to release, so it always
 * reports success.
 */
int orte_sds_alps_finalize(void)
{
    const int status = ORTE_SUCCESS;

    return status;
}

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -89,7 +90,7 @@ orte_sds_cnos_component_init(int *priority)
return NULL;
}
*priority = 60;
*priority = 30;
return &orte_sds_cnos_module;
}