First cut at revamping bproc support to separate it out from LANL's configuration.
First cut at adding support for LSF. Lots of ompi_ignores so only Jeff and I will see this stuff. This commit was SVN r15321.
Этот коммит содержится в:
родитель
1d02b9e7b5
Коммит
a1bf04f39e
@ -66,6 +66,7 @@ m4_include(config/ompi_check_icc.m4)
|
||||
m4_include(config/ompi_check_gm.m4)
|
||||
m4_include(config/ompi_check_mx.m4)
|
||||
m4_include(config/ompi_check_bproc.m4)
|
||||
m4_include(config/ompi_check_lsf.m4)
|
||||
m4_include(config/ompi_check_xcpu.m4)
|
||||
m4_include(config/ompi_check_mvapi.m4)
|
||||
m4_include(config/ompi_check_openib.m4)
|
||||
|
36
config/ompi_check_lsf.m4
Обычный файл
36
config/ompi_check_lsf.m4
Обычный файл
@ -0,0 +1,36 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# check for lsf
#
# Registers the --with-lsf configure option and records the outcome in
# shell variables:
#   ompi_check_lsf_happy - "no" when --with-lsf=no was given, "yes" otherwise
#   ompi_check_lsf_dir   - the value given to --with-lsf when it is neither
#                          empty nor "yes"; empty otherwise
#
# NOTE(review): as written, the prefix and the two action arguments are
# never expanded, no header or library test is performed, and
# ompi_check_lsf_found is assigned but not read inside this macro.  This
# looks like work in progress - confirm before relying on action-if-found.
|
||||
# OMPI_CHECK_LSF(prefix, [action-if-found], [action-if-not-found])
|
||||
# --------------------------------------------------------
|
||||
AC_DEFUN([OMPI_CHECK_LSF],[
|
||||
AC_ARG_WITH([lsf],
|
||||
[AC_HELP_STRING([--with-lsf],
|
||||
[Directory where the LSF software is installed])])
|
||||
|
||||
ompi_check_lsf_found=no
|
||||
AS_IF([test "$with_lsf" = "no"],
|
||||
[ompi_check_lsf_happy="no"],
|
||||
[ompi_check_lsf_happy="yes"
|
||||
AS_IF([test ! -z "$with_lsf" -a "$with_lsf" != "yes"],
|
||||
[ompi_check_lsf_dir="$with_lsf"],
|
||||
[ompi_check_lsf_dir=""])])
|
||||
|
||||
])
|
0
orte/mca/pls/lsf/.ompi_ignore
Обычный файл
0
orte/mca/pls/lsf/.ompi_ignore
Обычный файл
2
orte/mca/pls/lsf/.ompi_unignore
Обычный файл
2
orte/mca/pls/lsf/.ompi_unignore
Обычный файл
@ -0,0 +1,2 @@
|
||||
rhc
|
||||
jsquyres
|
56
orte/mca/pls/lsf/Makefile.am
Обычный файл
56
orte/mca/pls/lsf/Makefile.am
Обычный файл
@ -0,0 +1,56 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
AM_CPPFLAGS = $(pls_lsf_CPPFLAGS)
|
||||
|
||||
dist_pkgdata_DATA = help-pls-lsf.txt
|
||||
|
||||
sources = \
|
||||
pls_lsf.h \
|
||||
pls_lsf_component.c \
|
||||
pls_lsf_module.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_pls_lsf_DSO
|
||||
lib =
|
||||
lib_sources =
|
||||
component = mca_pls_lsf.la
|
||||
component_sources = $(sources)
|
||||
else
|
||||
lib = libmca_pls_lsf.la
|
||||
lib_sources = $(sources)
|
||||
component =
|
||||
component_sources =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component)
|
||||
mca_pls_lsf_la_SOURCES = $(component_sources)
|
||||
mca_pls_lsf_la_LDFLAGS = -module -avoid-version $(pls_lsf_LDFLAGS)
|
||||
mca_pls_lsf_la_LIBADD = \
|
||||
$(pls_lsf_LIBS) \
|
||||
$(top_ompi_builddir)/orte/libopen-rte.la \
|
||||
$(top_ompi_builddir)/opal/libopen-pal.la
|
||||
|
||||
noinst_LTLIBRARIES = $(lib)
|
||||
libmca_pls_lsf_la_SOURCES = $(lib_sources)
|
||||
libmca_pls_lsf_la_LDFLAGS = -module -avoid-version $(pls_lsf_LDFLAGS)
|
||||
libmca_pls_lsf_la_LIBADD = $(pls_lsf_LIBS)
|
37
orte/mca/pls/lsf/configure.m4
Обычный файл
37
orte/mca/pls/lsf/configure.m4
Обычный файл
@ -0,0 +1,37 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Configure hook for the pls lsf component.
#
# Invokes OMPI_CHECK_LSF with the prefix pls_lsf, asking it to set
# pls_lsf_good to 1 on success and 0 on failure.  When pls_lsf_good is 1,
# pls_lsf_LDFLAGS and pls_lsf_LIBS are copied into the corresponding
# pls_lsf_WRAPPER_EXTRA_* variables and action-if-found is expanded;
# otherwise action-if-not-found is expanded.  pls_lsf_CPPFLAGS,
# pls_lsf_LDFLAGS and pls_lsf_LIBS are always AC_SUBSTed.
#
# MCA_pls_lsf_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_pls_lsf_CONFIG],[
|
||||
OMPI_CHECK_LSF([pls_lsf], [pls_lsf_good=1], [pls_lsf_good=0])
|
||||
|
||||
# If the check worked, set the wrapper flags.
|
||||
# Evaluate succeed / fail
|
||||
AS_IF([test "$pls_lsf_good" = "1"],
|
||||
[pls_lsf_WRAPPER_EXTRA_LDFLAGS="$pls_lsf_LDFLAGS"
|
||||
pls_lsf_WRAPPER_EXTRA_LIBS="$pls_lsf_LIBS"
|
||||
$1],
|
||||
[$2])
|
||||
|
||||
# set build flags to use in makefile
|
||||
AC_SUBST([pls_lsf_CPPFLAGS])
|
||||
AC_SUBST([pls_lsf_LDFLAGS])
|
||||
AC_SUBST([pls_lsf_LIBS])
|
||||
])dnl
|
22
orte/mca/pls/lsf/configure.params
Обычный файл
22
orte/mca/pls/lsf/configure.params
Обычный файл
@ -0,0 +1,22 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
PARAM_CONFIG_FILES="Makefile"
|
57
orte/mca/pls/lsf/help-pls-lsf.txt
Обычный файл
57
orte/mca/pls/lsf/help-pls-lsf.txt
Обычный файл
@ -0,0 +1,57 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
[tm-bad-launchid]
|
||||
The LSF process starter cannot spawn the specified
|
||||
application on a remote node due to an invalid launch_id.
|
||||
|
||||
Node name: %s
|
||||
Launch id: %d
|
||||
|
||||
This is most likely due to use of the "--hostfile" option to the
|
||||
command line. At this time, Open MPI/OpenRTE do not support this
|
||||
method of operation. Instead, the system expects to directly read
|
||||
information regarding the nodes to be used from the environment.
|
||||
|
||||
Removing "--hostfile" from the command line will likely allow the
|
||||
application to be launched. This will be fixed in a future release
|
||||
to support the use of "--hostfile" on the command line.
|
||||
#
|
||||
[multiple-prefixes]
|
||||
Multiple different --prefix options were specified to mpirun for the
|
||||
same node. This is a fatal error for the LSF process
|
||||
starter in Open MPI.
|
||||
|
||||
The first two prefix values supplied were:
|
||||
%s
|
||||
and %s
|
||||
#
|
||||
[tm-spawn-failed]
|
||||
The LSF process starter failed to spawn a daemon (orted)
|
||||
on a remote node.
|
||||
|
||||
Command line: %s
|
||||
Node name: %s
|
||||
Launch id: %d
|
||||
|
||||
If you do not understand this error message, please try the following:
|
||||
|
||||
1. Ensure that the executable "orted" is in your PATH
|
||||
2. Use the --prefix option to indicate where we can
|
||||
find that executable
|
||||
3. Talk to your local system administrator
|
49
orte/mca/pls/lsf/pls_lsf.h
Обычный файл
49
orte/mca/pls/lsf/pls_lsf.h
Обычный файл
@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef ORTE_PLS_LSF_H
|
||||
#define ORTE_PLS_LSF_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "orte/mca/pls/pls.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct orte_pls_lsf_component_t {
|
||||
orte_pls_base_component_t super;
|
||||
int priority;
|
||||
int debug;
|
||||
int verbose;
|
||||
bool timing;
|
||||
};
|
||||
typedef struct orte_pls_lsf_component_t orte_pls_lsf_component_t;
|
||||
|
||||
/* Globally exported variables */
|
||||
ORTE_DECLSPEC extern orte_pls_lsf_component_t mca_pls_lsf_component;
|
||||
extern orte_pls_base_module_t orte_pls_lsf_module;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* ORTE_PLS_LSF_H */
|
152
orte/mca/pls/lsf/pls_lsf_component.c
Обычный файл
152
orte/mca/pls/lsf/pls_lsf_component.c
Обычный файл
@ -0,0 +1,152 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/pls/pls.h"
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
#include "orte/mca/pls/base/pls_private.h"
|
||||
#include "pls_lsf.h"
|
||||
|
||||
|
||||
/*
|
||||
* Public string showing the pls lsf component version number
|
||||
*/
|
||||
const char *mca_pls_lsf_component_version_string =
|
||||
"Open MPI lsf pls MCA component version " ORTE_VERSION;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Local function
|
||||
*/
|
||||
static int pls_lsf_open(void);
|
||||
static int pls_lsf_close(void);
|
||||
static orte_pls_base_module_t *pls_lsf_init(int *priority);
|
||||
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointers to our public functions in it
|
||||
*/
|
||||
|
||||
orte_pls_lsf_component_t mca_pls_lsf_component = {
|
||||
{
|
||||
/* First, the mca_component_t struct containing meta information
|
||||
about the component itself */
|
||||
|
||||
{
|
||||
/* Indicate that we are a pls v1.3.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
ORTE_PLS_BASE_VERSION_1_3_0,
|
||||
|
||||
/* Component name and version */
|
||||
"lsf",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
pls_lsf_open,
|
||||
pls_lsf_close,
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
/* Initialization / querying functions */
|
||||
pls_lsf_init
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
static int pls_lsf_open(void)
|
||||
{
|
||||
int tmp, value;
|
||||
mca_base_component_t *comp = &mca_pls_lsf_component.super.pls_version;
|
||||
|
||||
mca_base_param_reg_int(comp, "debug", "Enable debugging of the LSF pls",
|
||||
false, false, 0, &mca_pls_lsf_component.debug);
|
||||
mca_base_param_reg_int(comp, "verbose", "Enable verbose output of the LSF pls",
|
||||
false, false, 0, &mca_pls_lsf_component.verbose);
|
||||
|
||||
mca_base_param_reg_int(comp, "priority", "Default selection priority",
|
||||
false, false, 75, &mca_pls_lsf_component.priority);
|
||||
|
||||
tmp = mca_base_param_reg_int_name("orte", "timing",
|
||||
"Request that critical timing loops be measured",
|
||||
false, false, 0, &value);
|
||||
if (value != 0) {
|
||||
mca_pls_lsf_component.timing = true;
|
||||
} else {
|
||||
mca_pls_lsf_component.timing = false;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int pls_lsf_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static orte_pls_base_module_t *pls_lsf_init(int *priority)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* if we are NOT an HNP, then don't select us */
|
||||
if (!orte_process_info.seed) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* check if lsf is running here */
|
||||
if (lsb_init() < 0) {
|
||||
/* nope, not here */
|
||||
opal_output_verbose(10, orte_pls_base.pls_output,
|
||||
"pls:lsf: NOT available for selection");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* ensure the receive gets posted */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_comm_start())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
*priority = mca_pls_lsf_component.priority;
|
||||
return &orte_pls_lsf_module;
|
||||
}
|
443
orte/mca/pls/lsf/pls_lsf_module.c
Обычный файл
443
orte/mca/pls/lsf/pls_lsf_module.c
Обычный файл
@ -0,0 +1,443 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <signal.h>
|
||||
#ifdef HAVE_STDLIB_H
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h>
|
||||
#endif
|
||||
#ifdef HAVE_FCNTL_H
|
||||
#include <fcntl.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/installdirs/installdirs.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_wakeup.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/ns/base/base.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/rmaps/rmaps.h"
|
||||
|
||||
#include "orte/mca/pls/pls.h"
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
#include "orte/mca/pls/base/pls_private.h"
|
||||
#include "pls_lsf.h"
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int pls_lsf_launch_job(orte_jobid_t jobid);
|
||||
static int pls_lsf_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
|
||||
static int pls_lsf_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
|
||||
static int pls_lsf_terminate_proc(const orte_process_name_t *name);
|
||||
static int pls_lsf_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
|
||||
static int pls_lsf_signal_proc(const orte_process_name_t *name, int32_t signal);
|
||||
static int pls_lsf_finalize(void);
|
||||
|
||||
static int pls_lsf_start_proc(int argc, char **argv, char **env,
|
||||
char *prefix);
|
||||
|
||||
|
||||
/*
|
||||
* Global variable
|
||||
*/
|
||||
orte_pls_base_module_1_3_0_t orte_pls_lsf_module = {
|
||||
pls_lsf_launch_job,
|
||||
pls_lsf_terminate_job,
|
||||
pls_lsf_terminate_orteds,
|
||||
pls_lsf_terminate_proc,
|
||||
pls_lsf_signal_job,
|
||||
pls_lsf_signal_proc,
|
||||
pls_lsf_finalize
|
||||
};
|
||||
|
||||
/*
|
||||
* Local variables
|
||||
*/
|
||||
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
|
||||
|
||||
|
||||
/* When working in this function, ALWAYS jump to "cleanup" if
|
||||
* you encounter an error so that orterun will be woken up and
|
||||
* the job can cleanly terminate
|
||||
*/
|
||||
static int pls_lsf_launch_job(orte_jobid_t jobid)
|
||||
{
|
||||
orte_job_map_t *map = NULL;
|
||||
opal_list_item_t *item;
|
||||
size_t num_nodes;
|
||||
char *param;
|
||||
char **argv = NULL;
|
||||
int argc;
|
||||
int rc;
|
||||
char *tmp;
|
||||
char** env = NULL;
|
||||
char* var;
|
||||
char *nodelist_flat;
|
||||
char **nodelist_argv;
|
||||
int nodelist_argc;
|
||||
orte_process_name_t name;
|
||||
char *name_string;
|
||||
char **custom_strings;
|
||||
int num_args, i;
|
||||
char *cur_prefix;
|
||||
struct timeval joblaunchstart, launchstart, launchstop;
|
||||
int proc_name_index = 0;
|
||||
bool failed_launch = true;
|
||||
|
||||
if (mca_pls_lsf_component.timing) {
|
||||
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
||||
opal_output(0, "pls_lsf: could not obtain job start time");
|
||||
}
|
||||
}
|
||||
|
||||
/* save the active jobid */
|
||||
active_job = jobid;
|
||||
|
||||
/* Query the map for this job.
|
||||
* We need the entire mapping for a couple of reasons:
|
||||
* - need the prefix to start with.
|
||||
* - need to know if we are launching on a subset of the allocated nodes
|
||||
* All other mapping responsibilities fall to orted in the fork PLS
|
||||
*/
|
||||
rc = orte_rmaps.get_job_map(&map, jobid);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* account for any reuse of daemons */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
num_nodes = map->num_new_daemons;
|
||||
if (num_nodes == 0) {
|
||||
/* nothing to do - just return */
|
||||
failed_launch = false;
|
||||
rc = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* create nodelist */
|
||||
nodelist_argv = NULL;
|
||||
nodelist_argc = 0;
|
||||
|
||||
for (item = opal_list_get_first(&map->nodes);
|
||||
item != opal_list_get_end(&map->nodes);
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_mapped_node_t* node = (orte_mapped_node_t*)item;
|
||||
|
||||
/* if the daemon already exists on this node, then
|
||||
* don't include it
|
||||
*/
|
||||
if (node->daemon_preexists) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* otherwise, add it to the list of nodes upon which
|
||||
* we need to launch a daemon
|
||||
*/
|
||||
opal_argv_append(&nodelist_argc, &nodelist_argv, node->nodename);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* start building argv array
|
||||
*/
|
||||
argv = NULL;
|
||||
argc = 0;
|
||||
|
||||
/*
|
||||
* ORTED OPTIONS
|
||||
*/
|
||||
|
||||
/* add the daemon command (as specified by user) */
|
||||
opal_argv_append(&argc, &argv, mca_pls_lsf_component.orted);
|
||||
opal_argv_append(&argc, &argv, "--no-daemonize");
|
||||
|
||||
/* Add basic orted command line options */
|
||||
orte_pls_base_orted_append_basic_args(&argc, &argv,
|
||||
&proc_name_index,
|
||||
NULL,
|
||||
num_nodes
|
||||
);
|
||||
|
||||
/* force orted to use the lsf sds */
|
||||
opal_argv_append(&argc, &argv, "--ns-nds");
|
||||
opal_argv_append(&argc, &argv, "lsf");
|
||||
|
||||
/* tell the new daemons the base of the name list so they can compute
|
||||
* their own name on the other end
|
||||
*/
|
||||
name.cellid = ORTE_PROC_MY_NAME->cellid;
|
||||
name.jobid = 0;
|
||||
name.vpid = map->daemon_vpid_start;
|
||||
rc = orte_ns.get_proc_name_string(&name_string, &name);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
opal_output(0, "pls_lsf: unable to create process name");
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
free(argv[proc_name_index]);
|
||||
argv[proc_name_index] = strdup(name_string);
|
||||
free(name_string);
|
||||
|
||||
if (mca_pls_lsf_component.debug) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
if (NULL != param) {
|
||||
opal_output(0, "pls:lsf: final top-level argv:");
|
||||
opal_output(0, "pls:lsf: %s", param);
|
||||
free(param);
|
||||
}
|
||||
}
|
||||
|
||||
/* Copy the prefix-directory specified in the
|
||||
corresponding app_context. If there are multiple,
|
||||
different prefix's in the app context, complain (i.e., only
|
||||
allow one --prefix option for the entire slurm run -- we
|
||||
don't support different --prefix'es for different nodes in
|
||||
the SLURM pls) */
|
||||
cur_prefix = NULL;
|
||||
for (i=0; i < map->num_apps; i++) {
|
||||
char * app_prefix_dir = map->apps[i]->prefix_dir;
|
||||
/* Check for already set cur_prefix_dir -- if different,
|
||||
complain */
|
||||
if (NULL != app_prefix_dir) {
|
||||
if (NULL != cur_prefix &&
|
||||
0 != strcmp (cur_prefix, app_prefix_dir)) {
|
||||
opal_show_help("help-pls-lsf.txt", "multiple-prefixes",
|
||||
true, cur_prefix, app_prefix_dir);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
|
||||
/* If not yet set, copy it; iff set, then it's the
|
||||
same anyway */
|
||||
if (NULL == cur_prefix) {
|
||||
cur_prefix = strdup(app_prefix_dir);
|
||||
if (mca_pls_lsf_component.debug) {
|
||||
opal_output (0, "pls:lsf: Set prefix:%s",
|
||||
cur_prefix);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* setup environment */
|
||||
env = opal_argv_copy(environ);
|
||||
var = mca_base_param_environ_variable("seed", NULL, NULL);
|
||||
opal_setenv(var, "0", true, &env);
|
||||
free(var);
|
||||
|
||||
if (mca_pls_lsf_component.timing) {
|
||||
if (0 != gettimeofday(&launchstart, NULL)) {
|
||||
opal_output(0, "pls_lsf: could not obtain start time");
|
||||
}
|
||||
}
|
||||
|
||||
/* exec the daemon(s). Do NOT wait for lsb_launch to complete as it only
|
||||
* completes when the processes it starts - in this case, the orteds -
|
||||
* complete. We need to go ahead and return so orterun can do the rest
|
||||
* of its stuff. Instead, we'll catch any failures and deal with them elsewhere
|
||||
*/
|
||||
if (0 > lsb_launch(nodeargv, argv, LSF_DJOB_NOWAIT, env)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
|
||||
rc = ORTE_ERR_FAILED_TO_START;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* declare the launch a success */
|
||||
failed_launch = false;
|
||||
|
||||
if (mca_pls_lsf_component.timing) {
|
||||
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||
opal_output(0, "pls_lsf: could not obtain stop time");
|
||||
} else {
|
||||
opal_output(0, "pls_lsf: daemon block launch time is %ld usec",
|
||||
(launchstop.tv_sec - launchstart.tv_sec)*1000000 +
|
||||
(launchstop.tv_usec - launchstart.tv_usec));
|
||||
opal_output(0, "pls_lsf: total job launch time is %ld usec",
|
||||
(launchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
|
||||
(launchstop.tv_usec - joblaunchstart.tv_usec));
|
||||
}
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
opal_output(0, "pls:lsf: start_procs returned error %d", rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
if (NULL != map) {
|
||||
OBJ_RELEASE(map);
|
||||
}
|
||||
if (NULL != argv) {
|
||||
opal_argv_free(argv);
|
||||
}
|
||||
if (NULL != env) {
|
||||
opal_argv_free(env);
|
||||
}
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Terminate a job by ordering the daemons to kill their local procs
 * that belong to it.
 *
 * @param jobid    job to terminate
 * @param timeout  passed through to the pls base helper
 * @param attrs    passed through to the pls base helper
 * @return the status of orte_pls_base_orted_kill_local_procs(); a
 *         failure is logged before it is returned
 */
static int pls_lsf_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
{
    int status = orte_pls_base_orted_kill_local_procs(jobid, timeout, attrs);

    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }
    return status;
}
|
||||
|
||||
|
||||
/**
|
||||
* Terminate the orteds for a given job
|
||||
*/
|
||||
static int pls_lsf_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* tell them to die! */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(timeout, attrs))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* The way we've used SLURM, we can't kill individual processes --
|
||||
* we'll kill the entire job
|
||||
*/
|
||||
static int pls_lsf_terminate_proc(const orte_process_name_t *name)
|
||||
{
|
||||
opal_output(0, "pls:lsf:terminate_proc: not supported");
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Signal all the processes in the child srun by sending the signal directly to it
|
||||
*/
|
||||
static int pls_lsf_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Signal a specific process
|
||||
*/
|
||||
static int pls_lsf_signal_proc(const orte_process_name_t *name, int32_t signal)
|
||||
{
|
||||
opal_output(0, "pls:lsf:signal_proc: not supported");
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
|
||||
static int pls_lsf_finalize(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* cleanup any pending recvs */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_comm_stop())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static void lsf_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
/* not sure yet about how this will be used */
|
||||
|
||||
int rc;
|
||||
|
||||
if (0 != status) {
|
||||
/* we have a problem */
|
||||
opal_output(0, "ERROR: lsb_launch failed to start the required daemons.");
|
||||
opal_output(0, "ERROR: This could be due to an inability to find the orted binary");
|
||||
opal_output(0, "ERROR: on one or more remote nodes, lack of authority to execute");
|
||||
opal_output(0, "ERROR: on one or more specified nodes, or other factors.");
|
||||
|
||||
/* set the job state so we know it failed to start */
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(active_job, ORTE_JOB_STATE_FAILED_TO_START))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* force termination of the job */
|
||||
if (ORTE_SUCCESS != (rc = orte_wakeup(active_job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -111,10 +111,19 @@ static orte_ras_base_module_t *orte_ras_bjs_init(int* priority)
|
||||
}
|
||||
|
||||
#if 0
|
||||
if(getenv("NODES") == NULL) {
|
||||
/* see if bjs is running */
|
||||
if (getenv("BJS_SOCKET") == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* bjs sticks our allocation into a NODES enviro variable -
|
||||
* see if it is there. If not, then nothing was allocated
|
||||
*/
|
||||
if (getenv("NODES") == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
*priority = mca_ras_bjs_component.priority;
|
||||
return &orte_ras_bjs_module;
|
||||
}
|
||||
|
0
orte/mca/ras/bproc_raw/.ompi_ignore
Обычный файл
0
orte/mca/ras/bproc_raw/.ompi_ignore
Обычный файл
2
orte/mca/ras/bproc_raw/.ompi_unignore
Обычный файл
2
orte/mca/ras/bproc_raw/.ompi_unignore
Обычный файл
@ -0,0 +1,2 @@
|
||||
rhc
|
||||
jsquyres
|
55
orte/mca/ras/bproc_raw/Makefile.am
Обычный файл
55
orte/mca/ras/bproc_raw/Makefile.am
Обычный файл
@ -0,0 +1,55 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Use the top-level Makefile.options
|
||||
|
||||
dist_pkgdata_DATA = help-ras-bproc-raw.txt
|
||||
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_ras_bproc_raw_DSO
|
||||
component_noinst =
|
||||
component_install = mca_ras_bproc_raw.la
|
||||
else
|
||||
component_noinst = libmca_ras_bproc_raw.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
AM_CPPFLAGS= $(ras_bproc_raw_CPPFLAGS)
|
||||
|
||||
proxy_SOURCES = \
|
||||
ras_bproc_raw.c \
|
||||
ras_bproc_raw.h \
|
||||
ras_bproc_raw_component.c
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_ras_bproc_raw_la_SOURCES = $(proxy_SOURCES)
|
||||
mca_ras_bproc_raw_la_LIBADD = \
|
||||
$(ras_bproc_raw_LIBS) \
|
||||
$(top_ompi_builddir)/orte/libopen-rte.la \
|
||||
$(top_ompi_builddir)/opal/libopen-pal.la
|
||||
mca_ras_bproc_raw_la_LDFLAGS = -module -avoid-version $(ras_bproc_raw_LDFLAGS)
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_ras_bproc_raw_la_SOURCES = $(proxy_SOURCES)
|
||||
libmca_ras_bproc_raw_la_LIBADD = $(ras_bproc_raw_LIBS)
|
||||
libmca_ras_bproc_raw_la_LDFLAGS = -module -avoid-version $(ras_bproc_raw_LDFLAGS)
|
@ -17,22 +17,22 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_ras_lsf_bproc_CONFIG([action-if-found], [action-if-not-found])
|
||||
# MCA_ras_bproc_raw_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_ras_lsf_bproc_CONFIG],[
|
||||
OMPI_CHECK_BPROC([ras_lsf_bproc], [ras_lsf_bproc_good=1],
|
||||
[ras_lsf_bproc_good=1], [ras_lsf_bproc_good=0])
|
||||
AC_DEFUN([MCA_ras_bproc_raw_CONFIG],[
|
||||
OMPI_CHECK_BPROC([ras_bproc_raw], [ras_bproc_raw_good=1], [ras_bproc_raw_good=1],
|
||||
[ras_bproc_raw_good=0])
|
||||
|
||||
# if check worked, set wrapper flags if so.
|
||||
# Evaluate succeed / fail
|
||||
AS_IF([test "$ras_lsf_bproc_good" = "1"],
|
||||
[ras_lsf_bproc_WRAPPER_EXTRA_LDFLAGS="$ras_lsf_bproc_LDFLAGS"
|
||||
ras_lsf_bproc_WRAPPER_EXTRA_LIBS="$ras_lsf_bproc_LIBS"
|
||||
AS_IF([test "$ras_bproc_raw_good" = "1"],
|
||||
[ras_bproc_raw_WRAPPER_EXTRA_LDFLAGS="$ras_bproc_raw_LDFLAGS"
|
||||
ras_bproc_raw_WRAPPER_EXTRA_LIBS="$ras_bproc_raw_LIBS"
|
||||
$1],
|
||||
[$2])
|
||||
|
||||
# set build flags to use in makefile
|
||||
AC_SUBST([ras_lsf_bproc_CPPFLAGS])
|
||||
AC_SUBST([ras_lsf_bproc_LDFLAGS])
|
||||
AC_SUBST([ras_lsf_bproc_LIBS])
|
||||
AC_SUBST([ras_bproc_raw_CPPFLAGS])
|
||||
AC_SUBST([ras_bproc_raw_LDFLAGS])
|
||||
AC_SUBST([ras_bproc_raw_LIBS])
|
||||
])dnl
|
36
orte/mca/ras/bproc_raw/help-ras-bproc-raw.txt
Обычный файл
36
orte/mca/ras/bproc_raw/help-ras-bproc-raw.txt
Обычный файл
@ -0,0 +1,36 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English help file for Open MPI MCA error messages.
|
||||
#
|
||||
[nodelist-failed]
|
||||
While trying to determine what resources are available, Bproc failed when
|
||||
queried for a list of available nodes. This may indicate a problem with
|
||||
Bproc or your cluster.
|
||||
|
||||
[no-nodes-found]
|
||||
While trying to determine what resources are available, Bproc returned
|
||||
a zero-length list of available nodes. This may indicate a problem
|
||||
with Bproc or your cluster.
|
||||
|
||||
[no-nodes-avail]
|
||||
While trying to determine what resources are available, Bproc returned
|
||||
a list of available nodes that didn't contain any upon which you have
|
||||
execution privileges. This may indicate a problem with Bproc, your cluster,
|
||||
or your privileges.
|
||||
|
181
orte/mca/ras/bproc_raw/ras_bproc_raw.c
Обычный файл
181
orte/mca/ras/bproc_raw/ras_bproc_raw.c
Обычный файл
@ -0,0 +1,181 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <sys/bproc.h>
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/show_help.h"
|
||||
|
||||
#include "orte/dss/dss.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
#include "ras_bproc_raw.h"
|
||||
|
||||
|
||||
/**
|
||||
* Query the bproc node status
|
||||
*/
|
||||
|
||||
static int orte_ras_bproc_raw_node_state(int node)
|
||||
{
|
||||
#if defined BPROC_API_VERSION && BPROC_API_VERSION >= 4
|
||||
char nodestatus[BPROC_STATE_LEN + 1];
|
||||
|
||||
bproc_nodestatus(node, nodestatus, sizeof(nodestatus));
|
||||
if (strcmp(nodestatus, "up") == 0)
|
||||
return ORTE_NODE_STATE_UP;
|
||||
if (strcmp(nodestatus, "down") == 0)
|
||||
return ORTE_NODE_STATE_DOWN;
|
||||
if (strcmp(nodestatus, "boot") == 0)
|
||||
return ORTE_NODE_STATE_REBOOT;
|
||||
return ORTE_NODE_STATE_UNKNOWN;
|
||||
#else
|
||||
switch(bproc_nodestatus(node)) {
|
||||
case bproc_node_up:
|
||||
return ORTE_NODE_STATE_UP;
|
||||
case bproc_node_down:
|
||||
return ORTE_NODE_STATE_DOWN;
|
||||
case bproc_node_boot:
|
||||
return ORTE_NODE_STATE_REBOOT;
|
||||
default:
|
||||
return ORTE_NODE_STATE_UNKNOWN;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Discover available nodes. Allocate anything found
|
||||
* that is accessible by this user to the job.
|
||||
*
|
||||
*/
|
||||
|
||||
static int orte_ras_bproc_raw_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
{
|
||||
struct bproc_node_set_t ns = BPROC_EMPTY_NODESET;
|
||||
struct bproc_node_info_t *ni;
|
||||
opal_list_t nodes;
|
||||
orte_ras_node_t *node;
|
||||
opal_list_item_t* item;
|
||||
int rc;
|
||||
int i;
|
||||
|
||||
/* get the list of all nodes in this cluster */
|
||||
if (bproc_nodelist(&ns) < 0) {
|
||||
opal_show_help("help-ras-broc-raw.txt", "nodelist-failed", true);
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
/* if no nodes available, let the user know and return error */
|
||||
if (0 == ns->size) {
|
||||
opal_show_help("help-ras-broc-raw.txt", "no-nodes-found", true);
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
/* setup to record the nodes */
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
|
||||
/* cycle through the list */
|
||||
for (i=0; i < ns->size; i++) {
|
||||
ni = &ns->node[i];
|
||||
|
||||
/* check that the node is alive */
|
||||
if(orte_ras_bproc_raw_node_state(ni->node) != ORTE_NODE_STATE_UP) {
|
||||
/* if not, ignore this entry */
|
||||
continue;
|
||||
}
|
||||
|
||||
/* are we allowed to execute on this node */
|
||||
if(bproc_access(ni->node, BPROC_X_OK) != 0) {
|
||||
/* if not, ignore this entry */
|
||||
continue;
|
||||
}
|
||||
|
||||
/* okay, we have access and it is alive - create a new node entry */
|
||||
node = OBJ_NEW(orte_ras_node_t);
|
||||
asprintf(&node->node_name, "%d", ni->node);
|
||||
node->node_state = ni->status;
|
||||
/* RHC - until we can find some way of querying bproc for the number of
|
||||
* available slots, just assume two
|
||||
*/
|
||||
node->node_slots = 2;
|
||||
opal_list_append(&nodes, &node->super);
|
||||
}
|
||||
|
||||
/* add any newly discovered nodes to the registry */
|
||||
if (0 < opal_list_get_size(&nodes)) {
|
||||
rc = orte_ras_base_node_insert(&nodes);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
} else {
|
||||
/* we didn't find anything - report that and return error */
|
||||
opal_show_help("help-ras-broc-raw.txt", "no-nodes-avail", true);
|
||||
rc = ORTE_ERR_FATAL;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* now allocate them to this job */
|
||||
rc = orte_ras_base_allocate_nodes(jobid, &nodes);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
while (NULL != (item = opal_list_remove_first(&nodes))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
|
||||
bproc_nodeset_free(&ns);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Deallocation hook: this component holds nothing per job, so it simply
 * reports success. */
static int orte_ras_bproc_raw_deallocate(orte_jobid_t jobid)
{
    (void)jobid;
    return ORTE_SUCCESS;
}
|
||||
|
||||
|
||||
static int orte_ras_bproc_raw_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
orte_ras_base_module_t orte_ras_bproc_raw_module = {
|
||||
orte_ras_bproc_raw_allocate,
|
||||
orte_ras_base_node_insert,
|
||||
orte_ras_base_node_query,
|
||||
orte_ras_base_node_query_alloc,
|
||||
orte_ras_base_node_lookup,
|
||||
orte_ras_bproc_raw_deallocate,
|
||||
orte_ras_bproc_raw_finalize
|
||||
};
|
||||
|
51
orte/mca/ras/bproc_raw/ras_bproc_raw.h
Обычный файл
51
orte/mca/ras/bproc_raw/ras_bproc_raw.h
Обычный файл
@ -0,0 +1,51 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Resource Allocation (BPROC - without any resource manager)
|
||||
*/
|
||||
#ifndef ORTE_RAS_BPROC_RAW_H
|
||||
#define ORTE_RAS_BPROC_RAW_H
|
||||
|
||||
#include "orte/mca/ras/ras.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* RAS Component
|
||||
*/
|
||||
struct orte_ras_bproc_raw_component_t {
|
||||
orte_ras_base_component_t super;
|
||||
int debug;
|
||||
int priority;
|
||||
char *schedule_policy;
|
||||
};
|
||||
typedef struct orte_ras_bproc_raw_component_t orte_ras_bproc_raw_component_t;
|
||||
|
||||
ORTE_DECLSPEC extern orte_ras_bproc_raw_component_t mca_ras_bproc_raw_component;
|
||||
ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_bproc_raw_module;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
119
orte/mca/ras/bproc_raw/ras_bproc_raw_component.c
Обычный файл
119
orte/mca/ras/bproc_raw/ras_bproc_raw_component.c
Обычный файл
@ -0,0 +1,119 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"

#include <sys/bproc.h>

#include "orte/orte_constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "opal/util/output.h"
#include "ras_bproc_raw.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_ras_bproc_raw_open(void);
|
||||
static int orte_ras_bproc_raw_close(void);
|
||||
static orte_ras_base_module_t* orte_ras_bproc_raw_init(int* priority);
|
||||
|
||||
|
||||
orte_ras_bproc_raw_component_t mca_ras_bproc_raw_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta
|
||||
information about the component itself */
|
||||
|
||||
{
|
||||
/* Indicate that we are a ras v1.3.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
|
||||
ORTE_RAS_BASE_VERSION_1_3_0,
|
||||
|
||||
"bproc_raw", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_ras_bproc_raw_open, /* component open */
|
||||
orte_ras_bproc_raw_close /* component close */
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
orte_ras_bproc_raw_init
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_ras_bproc_raw_open(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_ras_bproc_raw_component.super.ras_version;
|
||||
int tmp;
|
||||
|
||||
mca_base_param_reg_int(c, "debug",
|
||||
"Whether or not to enable debugging output for the BPROC-RAW component (0 or 1)",
|
||||
false, false, (int)false, &tmp);
|
||||
mca_ras_bproc_raw_component.debug = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
/* we default to a low priority so that any bproc + resource manager combination
|
||||
* will override us
|
||||
*/
|
||||
mca_base_param_reg_int(c, "priority",
|
||||
"Selection priority for BPROC_RAW component",
|
||||
false, false, 10, &mca_ras_bproc_raw_component.priority);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static orte_ras_base_module_t *orte_ras_bproc_raw_init(int* priority)
|
||||
{
|
||||
int ret;
|
||||
struct bproc_version_t version;
|
||||
|
||||
/* if we are not an HNP, then we must not be selected */
|
||||
if (!orte_process_info.seed) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* okay, we are in an HNP - now check to see if BProc is running here */
|
||||
ret = bproc_version(&version);
|
||||
if (ret != 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*priority = mca_ras_bproc_raw_component.priority;
|
||||
return &orte_ras_bproc_raw_module;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_ras_bproc_raw_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
0
orte/mca/ras/lanl_bproc/.ompi_ignore
Обычный файл
0
orte/mca/ras/lanl_bproc/.ompi_ignore
Обычный файл
2
orte/mca/ras/lanl_bproc/.ompi_unignore
Обычный файл
2
orte/mca/ras/lanl_bproc/.ompi_unignore
Обычный файл
@ -0,0 +1,2 @@
|
||||
rhc
|
||||
jsquyres
|
56
orte/mca/ras/lanl_bproc/Makefile.am
Обычный файл
56
orte/mca/ras/lanl_bproc/Makefile.am
Обычный файл
@ -0,0 +1,56 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Use the top-level Makefile.options
|
||||
|
||||
dist_pkgdata_DATA = help-ras-lanl-bproc.txt
|
||||
|
||||
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_ras_lanl_bproc_DSO
|
||||
component_noinst =
|
||||
component_install = mca_ras_lanl_bproc.la
|
||||
else
|
||||
component_noinst = libmca_ras_lanl_bproc.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
AM_CPPFLAGS= $(ras_lanl_bproc_CPPFLAGS)
|
||||
|
||||
proxy_SOURCES = \
|
||||
ras_lanl_bproc.c \
|
||||
ras_lanl_bproc.h \
|
||||
ras_lanl_bproc_component.c
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_ras_lanl_bproc_la_SOURCES = $(proxy_SOURCES)
|
||||
mca_ras_lanl_bproc_la_LIBADD = \
|
||||
$(ras_lanl_bproc_LIBS) \
|
||||
$(top_ompi_builddir)/orte/libopen-rte.la \
|
||||
$(top_ompi_builddir)/opal/libopen-pal.la
|
||||
mca_ras_lanl_bproc_la_LDFLAGS = -module -avoid-version $(ras_lanl_bproc_LDFLAGS)
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_ras_lanl_bproc_la_SOURCES = $(proxy_SOURCES)
|
||||
libmca_ras_lanl_bproc_la_LIBADD = $(ras_lanl_bproc_LIBS)
|
||||
libmca_ras_lanl_bproc_la_LDFLAGS = -module -avoid-version $(ras_lanl_bproc_LDFLAGS)
|
38
orte/mca/ras/lanl_bproc/configure.m4
Обычный файл
38
orte/mca/ras/lanl_bproc/configure.m4
Обычный файл
@ -0,0 +1,38 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_ras_lanl_bproc_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_ras_lanl_bproc_CONFIG],[
|
||||
OMPI_CHECK_BPROC([ras_lanl_bproc], [ras_lanl_bproc_good=1], [ras_lanl_bproc_good=1],
|
||||
[ras_lanl_bproc_good=0])
|
||||
|
||||
# if check worked, set wrapper flags if so.
|
||||
# Evaluate succeed / fail
|
||||
AS_IF([test "$ras_lanl_bproc_good" = "1"],
|
||||
[ras_lanl_bproc_WRAPPER_EXTRA_LDFLAGS="$ras_lanl_bproc_LDFLAGS"
|
||||
ras_lanl_bproc_WRAPPER_EXTRA_LIBS="$ras_lanl_bproc_LIBS"
|
||||
$1],
|
||||
[$2])
|
||||
|
||||
# set build flags to use in makefile
|
||||
AC_SUBST([ras_lanl_bproc_CPPFLAGS])
|
||||
AC_SUBST([ras_lanl_bproc_LDFLAGS])
|
||||
AC_SUBST([ras_lanl_bproc_LIBS])
|
||||
])dnl
|
24
orte/mca/ras/lanl_bproc/configure.params
Обычный файл
24
orte/mca/ras/lanl_bproc/configure.params
Обычный файл
@ -0,0 +1,24 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_CONFIG_FILES="Makefile"
|
34
orte/mca/ras/lanl_bproc/help-ras-lanl-bproc.txt
Обычный файл
34
orte/mca/ras/lanl_bproc/help-ras-lanl-bproc.txt
Обычный файл
@ -0,0 +1,34 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English help file for Open MPI MCA error messages.
|
||||
#
|
||||
[nodelist-failed]
|
||||
While trying to determine what resources are available, Bproc failed to
|
||||
find a list of available nodes in your NODES environment variable.
|
||||
This may indicate that you failed to request an allocation prior to
|
||||
executing your application, or a problem with Bproc or your cluster.
|
||||
|
||||
Please ensure you have requested and received an allocation and try again,
|
||||
or contact your system administrator for advice.
|
||||
#
|
||||
[no-nodes-avail]
|
||||
While trying to determine what resources are available, Bproc returned
|
||||
a list of available nodes that didn't contain any upon which you have
|
||||
execution privileges. This may indicate a problem with Bproc, your cluster,
|
||||
or your privileges.
|
298
orte/mca/ras/lanl_bproc/ras_lanl_bproc.c
Обычный файл
298
orte/mca/ras/lanl_bproc/ras_lanl_bproc.c
Обычный файл
@ -0,0 +1,298 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <sys/bproc.h>
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/show_help.h"
|
||||
|
||||
#include "orte/dss/dss.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
#include "ras_lanl_bproc.h"
|
||||
|
||||
|
||||
/**
|
||||
* Query the bproc node status
|
||||
*/
|
||||
|
||||
static int orte_ras_lanl_bproc_node_state(int node)
|
||||
{
|
||||
#if defined BPROC_API_VERSION && BPROC_API_VERSION >= 4
|
||||
char nodestatus[BPROC_STATE_LEN + 1];
|
||||
|
||||
bproc_nodestatus(node, nodestatus, sizeof(nodestatus));
|
||||
if (strcmp(nodestatus, "up") == 0)
|
||||
return ORTE_NODE_STATE_UP;
|
||||
if (strcmp(nodestatus, "down") == 0)
|
||||
return ORTE_NODE_STATE_DOWN;
|
||||
if (strcmp(nodestatus, "boot") == 0)
|
||||
return ORTE_NODE_STATE_REBOOT;
|
||||
return ORTE_NODE_STATE_UNKNOWN;
|
||||
#else
|
||||
switch(bproc_nodestatus(node)) {
|
||||
case bproc_node_up:
|
||||
return ORTE_NODE_STATE_UP;
|
||||
case bproc_node_down:
|
||||
return ORTE_NODE_STATE_DOWN;
|
||||
case bproc_node_boot:
|
||||
return ORTE_NODE_STATE_REBOOT;
|
||||
default:
|
||||
return ORTE_NODE_STATE_UNKNOWN;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/**
 * Count how many times a node appears in the comma-separated NODELIST
 * environment variable; that count is used as the node's slot count.
 *
 * The split list is kept in a function-static variable, so NODELIST is only
 * re-read while no list has been obtained yet.
 *
 * @param node_name  node name to look for
 * @return number of NODELIST entries equal to node_name (0 if there is no
 *         list)
 */
static size_t orte_ras_lanl_bproc_node_slots(char* node_name)
{
    static char **nodelist = NULL;
    size_t matches = 0;
    size_t idx;

    if (NULL == nodelist) {
        nodelist = opal_argv_split(getenv("NODELIST"), ',');
    }

    /* still nothing to scan */
    if (NULL == nodelist) {
        return 0;
    }

    for (idx = 0; NULL != nodelist[idx]; idx++) {
        if (0 == strcmp(nodelist[idx], node_name)) {
            matches++;
        }
    }
    return matches;
}
|
||||
|
||||
/**
|
||||
* Resolve the node name to node number.
|
||||
*/
|
||||
|
||||
static int orte_ras_lanl_bproc_node_resolve(char* node_name, int* node_num)
|
||||
{
|
||||
/* for now we expect this to be the node number */
|
||||
if(NULL == node_name || sscanf(node_name, "%d", node_num) != 1)
|
||||
return ORTE_ERROR;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Discover the available resources.
|
||||
* - validate any nodes specified via hostfile/commandline
|
||||
* - check for additional nodes that have already been allocated
|
||||
*/
|
||||
|
||||
static int orte_ras_lanl_bproc_discover(
|
||||
opal_list_t* nodelist,
|
||||
orte_app_context_t** context,
|
||||
size_t num_context)
|
||||
{
|
||||
char* nodes;
|
||||
char* ptr;
|
||||
opal_list_item_t* item;
|
||||
opal_list_t new_nodes;
|
||||
int rc;
|
||||
|
||||
/* query the nodelist from the registry */
|
||||
if(ORTE_SUCCESS != (rc = orte_ras_base_node_query(nodelist))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* validate that any user supplied nodes actually exist, etc. */
|
||||
item = opal_list_get_first(nodelist);
|
||||
while(item != opal_list_get_end(nodelist)) {
|
||||
opal_list_item_t* next = opal_list_get_next(item);
|
||||
int node_num;
|
||||
|
||||
orte_ras_node_t* node = (orte_ras_node_t*)item;
|
||||
if(ORTE_SUCCESS != orte_ras_lanl_bproc_node_resolve(node->node_name, &node_num)) {
|
||||
opal_list_remove_item(nodelist,item);
|
||||
OBJ_DESTRUCT(item);
|
||||
item = next;
|
||||
continue;
|
||||
}
|
||||
|
||||
if(orte_ras_lanl_bproc_node_state(node_num) != ORTE_NODE_STATE_UP) {
|
||||
opal_list_remove_item(nodelist,item);
|
||||
OBJ_DESTRUCT(item);
|
||||
item = next;
|
||||
continue;
|
||||
}
|
||||
|
||||
if(bproc_access(node_num, BPROC_X_OK) != 0) {
|
||||
opal_list_remove_item(nodelist,item);
|
||||
OBJ_DESTRUCT(item);
|
||||
item = next;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* try and determine the number of available slots */
|
||||
if(node->node_slots == 0) {
|
||||
node->node_slots = orte_ras_lanl_bproc_node_slots(node->node_name);
|
||||
}
|
||||
item = next;
|
||||
}
|
||||
|
||||
/* parse the node list and check node status/access */
|
||||
nodes = getenv("NODES");
|
||||
if (NULL == nodes) {
|
||||
opal_show_help("help-ras-lanl-bproc.txt", "nodelist-failed", true);
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&new_nodes, opal_list_t);
|
||||
while(NULL != (ptr = strsep(&nodes,","))) {
|
||||
orte_ras_node_t *node;
|
||||
orte_node_state_t node_state;
|
||||
int node_num;
|
||||
|
||||
/* is this node already in the list */
|
||||
for(item = opal_list_get_first(nodelist);
|
||||
item != opal_list_get_end(nodelist);
|
||||
item = opal_list_get_next(item)) {
|
||||
node = (orte_ras_node_t*)item;
|
||||
if(strcmp(node->node_name, ptr) == 0)
|
||||
break;
|
||||
}
|
||||
if(item != opal_list_get_end(nodelist))
|
||||
continue;
|
||||
if(sscanf(ptr, "%d", &node_num) != 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(ORTE_NODE_STATE_UP != (node_state = orte_ras_lanl_bproc_node_state(node_num))) {
|
||||
opal_output(0, "error: a specified node (%d) is not up.\n", node_num);
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
if(bproc_access(node_num, BPROC_X_OK) != 0) {
|
||||
opal_output(0, "error: a specified node (%d) is not accessible.\n", node_num);
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* create a new node entry */
|
||||
node = OBJ_NEW(orte_ras_node_t);
|
||||
node->node_name = strdup(ptr);
|
||||
node->node_state = node_state;
|
||||
node->node_slots = orte_ras_lanl_bproc_node_slots(node->node_name);
|
||||
opal_list_append(&new_nodes, &node->super);
|
||||
}
|
||||
|
||||
/* add any newly discovered nodes to the registry */
|
||||
if(opal_list_get_size(&new_nodes)) {
|
||||
rc = orte_ras_base_node_insert(&new_nodes);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
} else {
|
||||
/* we didn't find anything - report that and return error */
|
||||
opal_show_help("help-ras-lanl-broc.txt", "no-nodes-avail", true);
|
||||
rc = ORTE_ERR_NOT_AVAILABLE;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* append them to the nodelist */
|
||||
while(NULL != (item = opal_list_remove_first(&new_nodes)))
|
||||
opal_list_append(nodelist, item);
|
||||
|
||||
cleanup:
|
||||
OBJ_DESTRUCT(&new_nodes);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Discover available (pre-allocated) nodes. Allocate the
|
||||
* requested number of nodes/process slots to the job.
|
||||
*
|
||||
*/
|
||||
|
||||
static int orte_ras_lanl_bproc_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
{
|
||||
opal_list_t nodes;
|
||||
opal_list_item_t* item;
|
||||
int rc;
|
||||
orte_app_context_t **context = NULL;
|
||||
orte_std_cntr_t i, num_context = 0;
|
||||
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
|
||||
rc = orte_rmgr.get_app_context(jobid, &context, &num_context);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_ras_lanl_bproc_discover(&nodes, context, num_context))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
rc = orte_ras_base_allocate_nodes(jobid, &nodes);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
while(NULL != (item = opal_list_remove_first(&nodes))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
for(i=0; i<num_context; i++) {
|
||||
OBJ_RELEASE(context[i]);
|
||||
}
|
||||
if (NULL != context) {
|
||||
free(context);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Deallocation hook: this component holds nothing per job, so it simply
 * reports success. */
static int orte_ras_lanl_bproc_deallocate(orte_jobid_t jobid)
{
    (void)jobid;
    return ORTE_SUCCESS;
}
|
||||
|
||||
|
||||
static int orte_ras_lanl_bproc_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
orte_ras_base_module_t orte_ras_lanl_bproc_module = {
|
||||
orte_ras_lanl_bproc_allocate,
|
||||
orte_ras_base_node_insert,
|
||||
orte_ras_base_node_query,
|
||||
orte_ras_base_node_query_alloc,
|
||||
orte_ras_base_node_lookup,
|
||||
orte_ras_lanl_bproc_deallocate,
|
||||
orte_ras_lanl_bproc_finalize
|
||||
};
|
||||
|
50
orte/mca/ras/lanl_bproc/ras_lanl_bproc.h
Обычный файл
50
orte/mca/ras/lanl_bproc/ras_lanl_bproc.h
Обычный файл
@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Resource Allocation (BPROC for LANL machines)
|
||||
*/
|
||||
#ifndef ORTE_RAS_LANL_BPROC_H
|
||||
#define ORTE_RAS_LANL_BPROC_H
|
||||
|
||||
#include "orte/mca/ras/ras.h"
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* RAS Component
|
||||
*/
|
||||
struct orte_ras_lanl_bproc_component_t {
|
||||
orte_ras_base_component_t super;
|
||||
int debug;
|
||||
int priority;
|
||||
char *schedule_policy;
|
||||
};
|
||||
typedef struct orte_ras_lanl_bproc_component_t orte_ras_lanl_bproc_component_t;
|
||||
|
||||
ORTE_DECLSPEC extern orte_ras_lanl_bproc_component_t mca_ras_lanl_bproc_component;
|
||||
ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_lanl_bproc_module;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
119
orte/mca/ras/lanl_bproc/ras_lanl_bproc_component.c
Обычный файл
119
orte/mca/ras/lanl_bproc/ras_lanl_bproc_component.c
Обычный файл
@ -0,0 +1,119 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
#include "orte/orte_constants.h"

#include <sys/bproc.h>

#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "opal/util/output.h"
#include "ras_lanl_bproc.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_ras_lanl_bproc_open(void);
|
||||
static int orte_ras_lanl_bproc_close(void);
|
||||
static orte_ras_base_module_t* orte_ras_lanl_bproc_init(int* priority);
|
||||
|
||||
|
||||
orte_ras_lanl_bproc_component_t mca_ras_lanl_bproc_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta
|
||||
information about the component itself */
|
||||
|
||||
{
|
||||
/* Indicate that we are a ras v1.3.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
|
||||
ORTE_RAS_BASE_VERSION_1_3_0,
|
||||
|
||||
"lanl_bproc", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_ras_lanl_bproc_open, /* component open */
|
||||
orte_ras_lanl_bproc_close /* component close */
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
orte_ras_lanl_bproc_init
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_ras_lanl_bproc_open(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_ras_lanl_bproc_component.super.ras_version;
|
||||
int tmp;
|
||||
|
||||
mca_base_param_reg_int(c, "debug",
|
||||
"Whether or not to enable debugging output for the LANL-BPROC component (0 or 1)",
|
||||
false, false, (int)false, &tmp);
|
||||
mca_ras_lanl_bproc_component.debug = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
/* we default to a negative priority so that we will *only* be selected
|
||||
* if directed by the user via -mca ras lanl_bproc or -mca ras_lanl_bproc_priority xxx
|
||||
*/
|
||||
mca_base_param_reg_int(c, "priority",
|
||||
"Selection priority for LANL-BPROC component",
|
||||
false, false, -1, &mca_ras_lanl_bproc_component.priority);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static orte_ras_base_module_t *orte_ras_lanl_bproc_init(int* priority)
|
||||
{
|
||||
int ret;
|
||||
struct bproc_version_t version;
|
||||
|
||||
/* if we are not an HNP, then we must not be selected */
|
||||
if (!orte_process_info.seed) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* okay, we are in an HNP - now check to see if BProc is running here */
|
||||
ret = bproc_version(&version);
|
||||
if (ret != 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*priority = mca_ras_lanl_bproc_component.priority;
|
||||
return &orte_ras_lanl_bproc_module;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_ras_lanl_bproc_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
0
orte/mca/ras/lsf/.ompi_ignore
Обычный файл
0
orte/mca/ras/lsf/.ompi_ignore
Обычный файл
2
orte/mca/ras/lsf/.ompi_unignore
Обычный файл
2
orte/mca/ras/lsf/.ompi_unignore
Обычный файл
@ -0,0 +1,2 @@
|
||||
rhc
|
||||
jsquyres
|
@ -18,37 +18,38 @@
|
||||
|
||||
# Use the top-level Makefile.options
|
||||
|
||||
dist_pkgdata_DATA = help-ras-lsf.txt
|
||||
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_ras_lsf_bproc_DSO
|
||||
if OMPI_BUILD_ras_lsf_DSO
|
||||
component_noinst =
|
||||
component_install = mca_ras_lsf_bproc.la
|
||||
component_install = mca_ras_lsf.la
|
||||
else
|
||||
component_noinst = libmca_ras_lsf_bproc.la
|
||||
component_noinst = libmca_ras_lsf.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
AM_CPPFLAGS= $(ras_lsf_bproc_CPPFLAGS)
|
||||
AM_CPPFLAGS= $(ras_lsf_CPPFLAGS)
|
||||
|
||||
proxy_SOURCES = \
|
||||
ras_lsf_bproc.c \
|
||||
ras_lsf_bproc.h \
|
||||
ras_lsf_bproc_component.c
|
||||
ras_lsf.c \
|
||||
ras_lsf.h \
|
||||
ras_lsf_component.c
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_ras_lsf_bproc_la_SOURCES = $(proxy_SOURCES)
|
||||
mca_ras_lsf_bproc_la_LIBADD = \
|
||||
$(ras_lsf_bproc_LIBS) \
|
||||
mca_ras_lsf_la_SOURCES = $(proxy_SOURCES)
|
||||
mca_ras_lsf_la_LIBADD = \
|
||||
$(ras_lsf_LIBS) \
|
||||
$(top_ompi_builddir)/orte/libopen-rte.la \
|
||||
$(top_ompi_builddir)/opal/libopen-pal.la
|
||||
mca_ras_lsf_bproc_la_LDFLAGS = -module -avoid-version $(ras_lsf_bproc_LDFLAGS)
|
||||
mca_ras_lsf_la_LDFLAGS = -module -avoid-version $(ras_lsf_LDFLAGS)
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_ras_lsf_bproc_la_SOURCES = $(proxy_SOURCES)
|
||||
libmca_ras_lsf_bproc_la_LIBADD = $(ras_lsf_bproc_LIBS)
|
||||
libmca_ras_lsf_bproc_la_LDFLAGS = -module -avoid-version $(ras_lsf_bproc_LDFLAGS)
|
||||
libmca_ras_lsf_la_SOURCES = $(proxy_SOURCES)
|
||||
libmca_ras_lsf_la_LIBADD = $(ras_lsf_LIBS)
|
||||
libmca_ras_lsf_la_LDFLAGS = -module -avoid-version $(ras_lsf_LDFLAGS)
|
38
orte/mca/ras/lsf/configure.m4
Обычный файл
38
orte/mca/ras/lsf/configure.m4
Обычный файл
@ -0,0 +1,38 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_ras_lsf_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Decide whether the ras lsf component can be built.  Runs the LSF check
# with the ras_lsf prefix, exports the resulting build flags, and then
# runs action-if-found or action-if-not-found accordingly.
AC_DEFUN([MCA_ras_lsf_CONFIG],[
    # The LSF check takes a prefix plus one found and one not-found action.
    # The not-found action must clear the flag, otherwise the component is
    # reported as buildable even when LSF is missing.
    OMPI_CHECK_LSF([ras_lsf], [ras_lsf_good=1], [ras_lsf_good=0])

    # if the check worked, set the wrapper flags and
    # evaluate succeed / fail
    AS_IF([test "$ras_lsf_good" = "1"],
          [ras_lsf_WRAPPER_EXTRA_LDFLAGS="$ras_lsf_LDFLAGS"
           ras_lsf_WRAPPER_EXTRA_LIBS="$ras_lsf_LIBS"
           $1],
          [$2])

    # set build flags to use in makefile
    AC_SUBST([ras_lsf_CPPFLAGS])
    AC_SUBST([ras_lsf_LDFLAGS])
    AC_SUBST([ras_lsf_LIBS])
])dnl
|
24
orte/mca/ras/lsf/configure.params
Обычный файл
24
orte/mca/ras/lsf/configure.params
Обычный файл
@ -0,0 +1,24 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_CONFIG_FILES="Makefile"
|
30
orte/mca/ras/lsf/help-ras-lsf.txt
Обычный файл
30
orte/mca/ras/lsf/help-ras-lsf.txt
Обычный файл
@ -0,0 +1,30 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English help file for Open MPI MCA error messages.
|
||||
#
|
||||
[nodelist-failed]
|
||||
While trying to determine what resources are available, LSF failed when
|
||||
queried for a list of available nodes. This may indicate a problem with
|
||||
LSF or your cluster.
|
||||
|
||||
[no-nodes-avail]
|
||||
While trying to determine what resources are available, LSF returned
|
||||
a list of available nodes from which we were unable to extract anything
|
||||
usable. This may indicate a problem with LSF or your cluster.
|
||||
|
121
orte/mca/ras/lsf/ras_lsf.c
Обычный файл
121
orte/mca/ras/lsf/ras_lsf.c
Обычный файл
@ -0,0 +1,121 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/show_help.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
#include "ras_lsf.h"
|
||||
|
||||
|
||||
/**
 * Build the node list for a job from the current LSF allocation.
 *
 * Asks LSF for the list of allocated host names via lsb_getalloc(), turns
 * it into a list of orte_ras_node_t entries (a host name that repeats on
 * consecutive entries counts as one additional slot on the same node),
 * inserts the nodes with orte_ras_base_node_insert() and allocates them to
 * the job with orte_ras_base_allocate_nodes().
 *
 * NOTE(review): lsb_getalloc() is presumably declared in <lsf/lsbatch.h>,
 * which this file does not include - confirm and add the include.
 *
 * @param jobid       Job to which the nodes are allocated.
 * @param attributes  Not referenced by this function.
 *
 * @return ORTE_SUCCESS on success;
 *         ORTE_ERR_NOT_AVAILABLE if LSF could not be queried or returned
 *         no nodes;
 *         ORTE_ERR_OUT_OF_RESOURCE if a node entry could not be created;
 *         otherwise the error returned by the RAS base functions.
 */
static int orte_ras_lsf_allocate(orte_jobid_t jobid, opal_list_t *attributes)
{
    char **nodelist = NULL;   /* stays NULL if LSF never fills it in */
    opal_list_t nodes;
    opal_list_item_t *item;
    orte_ras_node_t *node = NULL;
    int i, num_nodes;
    int rc = ORTE_SUCCESS;

    /* get the list of allocated nodes */
    if ((num_nodes = lsb_getalloc(&nodelist)) < 0) {
        opal_show_help("help-ras-lsf.txt", "nodelist-failed", true);
        return ORTE_ERR_NOT_AVAILABLE;
    }

    OBJ_CONSTRUCT(&nodes, opal_list_t);

    /* step through the list */
    for (i = 0; i < num_nodes; i++) {
        /* is this a repeat of the current node? */
        if (NULL != node && 0 == strcmp(nodelist[i], node->node_name)) {
            /* it is a repeat - just bump the slot count */
            ++node->node_slots;
            continue;
        }

        /* not a repeat - create a node entry for it */
        node = OBJ_NEW(orte_ras_node_t);
        if (NULL == node) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        node->node_name = strdup(nodelist[i]);
        if (NULL == node->node_name) {
            /* not on the list yet, so it must be released here */
            OBJ_RELEASE(node);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        node->node_slots = 1;
        opal_list_append(&nodes, &node->super);
    }

    /* add any newly discovered nodes to the registry */
    if (0 < opal_list_get_size(&nodes)) {
        rc = orte_ras_base_node_insert(&nodes);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    } else {
        /* we didn't find anything - report that and return error */
        opal_show_help("help-ras-lsf.txt", "no-nodes-avail", true);
        rc = ORTE_ERR_NOT_AVAILABLE;
        goto cleanup;
    }

    /* now allocate them to this job */
    rc = orte_ras_base_allocate_nodes(jobid, &nodes);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

cleanup:
    while (NULL != (item = opal_list_remove_first(&nodes))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&nodes);

    /* release the nodelist from lsf */
    opal_argv_free(nodelist);

    return rc;
}
|
||||
|
||||
/**
 * Deallocate the nodes of a job.
 *
 * This component performs no work on deallocation.
 *
 * @param jobid  Not referenced.
 * @return ORTE_SUCCESS always.
 */
static int orte_ras_lsf_deallocate(orte_jobid_t jobid)
{
    (void) jobid;   /* intentionally unused */

    return ORTE_SUCCESS;
}
|
||||
|
||||
|
||||
static int orte_ras_lsf_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
orte_ras_base_module_t orte_ras_lsf_module = {
|
||||
orte_ras_lsf_allocate,
|
||||
orte_ras_base_node_insert,
|
||||
orte_ras_base_node_query,
|
||||
orte_ras_base_node_query_alloc,
|
||||
orte_ras_base_node_lookup,
|
||||
orte_ras_lsf_deallocate,
|
||||
orte_ras_lsf_finalize
|
||||
};
|
||||
|
@ -18,12 +18,13 @@
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Resource Allocation (LSF over BPROC)
|
||||
* Resource Allocation (LSF)
|
||||
*/
|
||||
#ifndef ORTE_RAS_LSF_BPROC_H
|
||||
#define ORTE_RAS_LSF_BPROC_H
|
||||
#ifndef ORTE_RAS_LSF_H
|
||||
#define ORTE_RAS_LSF_H
|
||||
|
||||
#include "orte/mca/ras/ras.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
@ -32,15 +33,15 @@ extern "C" {
|
||||
/**
|
||||
* RAS Component
|
||||
*/
|
||||
struct orte_ras_lsf_bproc_component_t {
|
||||
struct orte_ras_lsf_component_t {
|
||||
orte_ras_base_component_t super;
|
||||
int debug;
|
||||
int priority;
|
||||
};
|
||||
typedef struct orte_ras_lsf_bproc_component_t orte_ras_lsf_bproc_component_t;
|
||||
typedef struct orte_ras_lsf_component_t orte_ras_lsf_component_t;
|
||||
|
||||
ORTE_DECLSPEC extern orte_ras_lsf_bproc_component_t mca_ras_lsf_bproc_component;
|
||||
ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_lsf_bproc_module;
|
||||
ORTE_DECLSPEC extern orte_ras_lsf_component_t mca_ras_lsf_component;
|
||||
ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_lsf_module;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
@ -22,18 +22,19 @@
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "ras_lsf_bproc.h"
|
||||
|
||||
#include "ras_lsf.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_ras_lsf_bproc_open(void);
|
||||
static int orte_ras_lsf_bproc_close(void);
|
||||
static orte_ras_base_module_t* orte_ras_lsf_bproc_init(int* priority);
|
||||
static int orte_ras_lsf_open(void);
|
||||
static int orte_ras_lsf_close(void);
|
||||
static orte_ras_base_module_t* orte_ras_lsf_init(int* priority);
|
||||
|
||||
|
||||
orte_ras_lsf_bproc_component_t mca_ras_lsf_bproc_component = {
|
||||
orte_ras_lsf_component_t mca_ras_lsf_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta
|
||||
information about the component itself */
|
||||
@ -44,12 +45,12 @@ orte_ras_lsf_bproc_component_t mca_ras_lsf_bproc_component = {
|
||||
|
||||
ORTE_RAS_BASE_VERSION_1_3_0,
|
||||
|
||||
"lsf_bproc", /* MCA component name */
|
||||
"lsf", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_ras_lsf_bproc_open, /* component open */
|
||||
orte_ras_lsf_bproc_close /* component close */
|
||||
orte_ras_lsf_open, /* component open */
|
||||
orte_ras_lsf_close /* component close */
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
@ -58,52 +59,54 @@ orte_ras_lsf_bproc_component_t mca_ras_lsf_bproc_component = {
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
orte_ras_lsf_bproc_init
|
||||
orte_ras_lsf_init
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Convience functions to lookup MCA parameters
|
||||
*/
|
||||
static int orte_ras_lsf_bproc_param_register_int(
|
||||
const char* param_name,
|
||||
int default_value)
|
||||
{
|
||||
int id = mca_base_param_register_int("ras","lsf_bproc",param_name,NULL,default_value);
|
||||
int param_value = default_value;
|
||||
mca_base_param_lookup_int(id,&param_value);
|
||||
return param_value;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_ras_lsf_bproc_open(void)
|
||||
static int orte_ras_lsf_open(void)
|
||||
{
|
||||
mca_ras_lsf_bproc_component.debug = orte_ras_lsf_bproc_param_register_int("debug",1);
|
||||
mca_ras_lsf_bproc_component.priority = orte_ras_lsf_bproc_param_register_int("priority",-1);
|
||||
mca_base_component_t *c = &mca_ras_lsf_component.super.ras_version;
|
||||
int tmp;
|
||||
|
||||
mca_base_param_reg_int(c, "debug",
|
||||
"Whether or not to enable debugging output for the LSF component (0 or 1)",
|
||||
false, false, (int)false, &tmp);
|
||||
mca_ras_lsf_component.debug = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
mca_base_param_reg_int(c, "priority",
|
||||
"Selection priority for LSF component",
|
||||
false, false, 75, &mca_ras_lsf_component.priority);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static orte_ras_base_module_t *orte_ras_lsf_bproc_init(int* priority)
|
||||
static orte_ras_base_module_t *orte_ras_lsf_init(int* priority)
|
||||
{
|
||||
/* if we are not an HNP, then we must not be selected */
|
||||
if (!orte_process_info.seed) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*priority = mca_ras_lsf_bproc_component.priority;
|
||||
return NULL;
|
||||
/* check if lsf is running here */
|
||||
if (lsb_init() < 0) {
|
||||
/* nope, not here */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*priority = mca_ras_lsf_component.priority;
|
||||
return &orte_ras_lsf_module;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_ras_lsf_bproc_close(void)
|
||||
static int orte_ras_lsf_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,55 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
#include "ras_lsf_bproc.h"
|
||||
|
||||
|
||||
static int orte_ras_lsf_bproc_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int orte_ras_lsf_bproc_deallocate(orte_jobid_t jobid)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int orte_ras_lsf_bproc_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
orte_ras_base_module_t orte_ras_lsf_bproc_module = {
|
||||
orte_ras_lsf_bproc_allocate,
|
||||
orte_ras_base_node_insert,
|
||||
orte_ras_base_node_query,
|
||||
orte_ras_base_node_query_alloc,
|
||||
orte_ras_base_node_lookup,
|
||||
orte_ras_lsf_bproc_deallocate,
|
||||
orte_ras_lsf_bproc_finalize
|
||||
};
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user