Next round of LSF commits. Getting farther, but it still doesn't
fully work yet (everything is still .ompi_ignore'ed for everyone). This commit was SVN r15398.
Этот коммит содержится в:
родитель
b9db0a4c2d
Коммит
b20248709a
@ -10,6 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Cisco, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -22,15 +23,73 @@
|
||||
# --------------------------------------------------------
|
||||
AC_DEFUN([OMPI_CHECK_LSF],[
|
||||
AC_ARG_WITH([lsf],
|
||||
[AC_HELP_STRING([--with-lsf],
|
||||
[Directory where the LSF software is installed])])
|
||||
[AC_HELP_STRING([--with-lsf(=DIR)],
|
||||
[Build LSF support])])
|
||||
AC_ARG_WITH([lsf-libdir],
|
||||
[AC_HELP_STRING([--with-lsf-libdir=DIR],
|
||||
[Search for LSF libraries in DIR])])
|
||||
|
||||
# Defaults
|
||||
ompi_check_lsf_dir_msg="compiler default"
|
||||
ompi_check_lsf_libdir_msg="linker default"
|
||||
|
||||
# Save directory names if supplied
|
||||
AS_IF([test ! -z "$with_lsf" -a "$with_lsf" != "yes"],
|
||||
[ompi_check_lsf_dir="$with_lsf"
|
||||
ompi_check_lsf_dir_msg="$ompi_check_lsf_dir (from --with-lsf)"])
|
||||
AS_IF([test ! -z "$with_lsf_libdir" -a "$with_lsf_libdir" != "yes"],
|
||||
[ompi_check_lsf_libdir="$with_lsf_libdir"
|
||||
ompi_check_lsf_libdir_msg="$ompi_check_lsf_libdir (from --with-lsf-libdir)"])
|
||||
|
||||
# If no directories were specified, look for LSF_LIBDIR,
|
||||
# LSF_INCLUDEDIR, and/or LSF_ENVDIR.
|
||||
AS_IF([test -z "$ompi_check_lsf_dir" -a -z "$ompi_check_lsf_libdir"],
|
||||
[AS_IF([test ! -z "$LSF_ENVDIR" -a -z "$LSF_LIBDIR" -a -f "$LSF_ENVDIR/lsf.conf"],
|
||||
[LSF_LIBDIR=`egrep ^LSF_LIBDIR= $LSF_ENVDIR/lsf.conf | cut -d= -f2-`])
|
||||
AS_IF([test ! -z "$LSF_ENVDIR" -a -z "$LSF_INCLUDEDIR" -a -f "$LSF_ENVDIR/lsf.conf"],
|
||||
[LSF_INCLUDEDIR=`egrep ^LSF_INCLUDEDIR= $LSF_ENVDIR/lsf.conf | cut -d= -f2-`])
|
||||
AS_IF([test ! -z "$LSF_LIBDIR"],
|
||||
[ompi_check_lsf_libdir=$LSF_LIBDIR
|
||||
ompi_check_lsf_libdir_msg="$LSF_LIBDIR (from \$LSF_LIBDIR)"])
|
||||
AS_IF([test ! -z "$LSF_INCLUDEDIR"],
|
||||
[ompi_check_lsf_dir=`dirname $LSF_INCLUDEDIR`
|
||||
ompi_check_lsf_dir_msg="$ompi_check_lsf_dir (from \$LSF_INCLUDEDIR)"])])
|
||||
|
||||
ompi_check_lsf_found=no
|
||||
AS_IF([test "$with_lsf" = "no"],
|
||||
[ompi_check_lsf_happy="no"],
|
||||
[ompi_check_lsf_happy="yes"
|
||||
AS_IF([test ! -z "$with_lsf" -a "$with_lsf" != "yes"],
|
||||
[ompi_check_lsf_dir="$with_lsf"],
|
||||
[ompi_check_lsf_dir=""])])
|
||||
[ompi_check_lsf_happy="yes"])
|
||||
|
||||
ompi_check_lsf_$1_save_CPPFLAGS="$CPPFLAGS"
|
||||
ompi_check_lsf_$1_save_LDFLAGS="$LDFLAGS"
|
||||
ompi_check_lsf_$1_save_LIBS="$LIBS"
|
||||
|
||||
AS_IF([test "$ompi_check_lsf_happy" = "yes"],
|
||||
[AC_MSG_CHECKING([for LSF dir])
|
||||
AC_MSG_RESULT([$ompi_check_lsf_dir_msg])
|
||||
AC_MSG_CHECKING([for LSF library dir])
|
||||
AC_MSG_RESULT([$ompi_check_lsf_libdir_msg])
|
||||
OMPI_CHECK_PACKAGE([$1],
|
||||
[lsf/lsbatch.h],
|
||||
[bat],
|
||||
[lsb_launch],
|
||||
[-llsf],
|
||||
[$ompi_check_lsf_dir],
|
||||
[$ompi_check_lsf_libdir],
|
||||
[ompi_check_lsf_happy="yes"],
|
||||
[ompi_check_lsf_happy="no"])])
|
||||
|
||||
CPPFLAGS="$ompi_check_lsf_$1_save_CPPFLAGS"
|
||||
LDFLAGS="$ompi_check_lsf_$1_save_LDFLAGS"
|
||||
LIBS="$ompi_check_lsf_$1_save_LIBS"
|
||||
|
||||
# Reset for the next time we're called
|
||||
ompi_check_lsf_dir=
|
||||
ompi_check_lsf_libdir=
|
||||
|
||||
AS_IF([test "$ompi_check_lsf_happy" = "yes"],
|
||||
[$2],
|
||||
[AS_IF([test ! -z "$with_lsf" -a "$with_lsf" != "no"],
|
||||
[AC_MSG_WARN([LSF support requested (via --with-lsf) but not found.])
|
||||
AC_MSG_ERROR([Aborting.])])
|
||||
$3])
|
||||
])
|
||||
|
@ -9,6 +9,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Cisco, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -20,6 +21,12 @@ AM_CPPFLAGS = $(pls_lsf_CPPFLAGS)
|
||||
|
||||
dist_pkgdata_DATA = help-pls-lsf.txt
|
||||
|
||||
bin_PROGRAMS = mytest
|
||||
|
||||
mytest_SOURCES = mytest.c
|
||||
mytest_LDFLAGS = $(pls_lsf_LDFLAGS)
|
||||
mytest_LDADD = $(pls_lsf_LIBS)
|
||||
|
||||
sources = \
|
||||
pls_lsf.h \
|
||||
pls_lsf_component.c \
|
||||
|
@ -114,8 +114,8 @@ static int pls_lsf_open(void)
|
||||
&mca_pls_lsf_component.orted);
|
||||
|
||||
tmp = mca_base_param_reg_int_name("orte", "timing",
|
||||
"Request that critical timing loops be measured",
|
||||
false, false, 0, &value);
|
||||
"Request that critical timing loops be measured",
|
||||
false, false, 0, &value);
|
||||
if (value != 0) {
|
||||
mca_pls_lsf_component.timing = true;
|
||||
} else {
|
||||
|
@ -88,9 +88,6 @@ static int pls_lsf_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *a
|
||||
static int pls_lsf_signal_proc(const orte_process_name_t *name, int32_t signal);
|
||||
static int pls_lsf_finalize(void);
|
||||
|
||||
static int pls_lsf_start_proc(int argc, char **argv, char **env,
|
||||
char *prefix);
|
||||
|
||||
|
||||
/*
|
||||
* Global variable
|
||||
@ -124,21 +121,19 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
||||
char **argv = NULL;
|
||||
int argc;
|
||||
int rc;
|
||||
char *tmp;
|
||||
char** env = NULL;
|
||||
char* var;
|
||||
char *nodelist_flat;
|
||||
char **nodelist_argv;
|
||||
int nodelist_argc;
|
||||
orte_process_name_t name;
|
||||
char *name_string;
|
||||
char **custom_strings;
|
||||
int num_args, i;
|
||||
int i;
|
||||
char *cur_prefix;
|
||||
struct timeval joblaunchstart, launchstart, launchstop;
|
||||
int proc_name_index = 0;
|
||||
bool failed_launch = true;
|
||||
|
||||
printf("pls lsf being used to launch!\n");
|
||||
if (mca_pls_lsf_component.timing) {
|
||||
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
||||
opal_output(0, "pls_lsf: could not obtain job start time");
|
||||
@ -296,7 +291,22 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
||||
* orterun can do the rest of its stuff. Instead, we'll catch any
|
||||
* failures and deal with them elsewhere
|
||||
*/
|
||||
if (0 > lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env)) {
|
||||
argv = NULL;
|
||||
argc = 0;
|
||||
opal_argv_append(&argc, &argv, "env");
|
||||
opal_output(0, "launching on: %s", opal_argv_join(nodelist_argv, ' '));
|
||||
opal_output(0, "launching: %s", opal_argv_join(argv, ' '));
|
||||
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
|
||||
opal_output(0, "got nonzero: %d", rc);
|
||||
rc = ORTE_ERR_FAILED_TO_START;
|
||||
goto cleanup;
|
||||
}
|
||||
opal_output(0, "launched ok");
|
||||
sleep(5);
|
||||
exit(0);
|
||||
|
||||
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
|
||||
rc = ORTE_ERR_FAILED_TO_START;
|
||||
goto cleanup;
|
||||
@ -336,7 +346,9 @@ cleanup:
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_smr.set_job_state(jobid,
|
||||
ORTE_JOB_STATE_FAILED_TO_START))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@ -354,7 +366,8 @@ static int pls_lsf_terminate_job(orte_jobid_t jobid, struct timeval *timeout, op
|
||||
int rc;
|
||||
|
||||
/* order them to kill their local procs for this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(jobid, timeout, attrs))) {
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_pls_base_orted_kill_local_procs(jobid, timeout, attrs))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@ -419,29 +432,3 @@ static int pls_lsf_finalize(void)
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static void lsf_wait_cb(pid_t pid, int status, void* cbdata)
|
||||
{
|
||||
/* not sure yet about how this will be used */
|
||||
|
||||
int rc;
|
||||
|
||||
if (0 != status) {
|
||||
/* we have a problem */
|
||||
opal_output(0, "ERROR: lsb_launch failed to start the required daemons.");
|
||||
opal_output(0, "ERROR: This could be due to an inability to find the orted binary");
|
||||
opal_output(0, "ERROR: on one or more remote nodes, lack of authority to execute");
|
||||
opal_output(0, "ERROR: on one or more specified nodes, or other factors.");
|
||||
|
||||
/* set the job state so we know it failed to start */
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(active_job, ORTE_JOB_STATE_FAILED_TO_START))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* force termination of the job */
|
||||
if (ORTE_SUCCESS != (rc = orte_wakeup(active_job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -42,7 +42,7 @@ static int orte_ras_lsf_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
opal_list_item_t *item;
|
||||
orte_ras_node_t *node;
|
||||
int i, rc, num_nodes;
|
||||
|
||||
|
||||
/* get the list of allocated nodes */
|
||||
if ((num_nodes = lsb_getalloc(&nodelist)) < 0) {
|
||||
opal_show_help("help-ras-lsf.txt", "nodelist-failed", true);
|
||||
@ -54,6 +54,7 @@ static int orte_ras_lsf_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
|
||||
/* step through the list */
|
||||
for (i=0; i < num_nodes; i++) {
|
||||
printf("lsf got node: %s\n", nodelist[i]);
|
||||
/* is this a repeat of the current node? */
|
||||
if (NULL != node && 0 == strcmp(nodelist[i], node->node_name)) {
|
||||
/* it is a repeat - just bump the slot count */
|
||||
@ -66,7 +67,6 @@ static int orte_ras_lsf_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
node->node_name = strdup(nodelist[i]);
|
||||
node->node_slots = 1;
|
||||
opal_list_append(&nodes, &node->super);
|
||||
|
||||
}
|
||||
|
||||
/* add any newly discovered nodes to the registry */
|
||||
@ -97,7 +97,7 @@ cleanup:
|
||||
|
||||
/* release the nodelist from lsf */
|
||||
opal_argv_free(nodelist);
|
||||
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
0
orte/mca/sds/lsf/.ompi_ignore
Обычный файл
0
orte/mca/sds/lsf/.ompi_ignore
Обычный файл
2
orte/mca/sds/lsf/.ompi_unignore
Обычный файл
2
orte/mca/sds/lsf/.ompi_unignore
Обычный файл
@ -0,0 +1,2 @@
|
||||
rhc
|
||||
jsquyres
|
51
orte/mca/sds/lsf/Makefile.am
Обычный файл
51
orte/mca/sds/lsf/Makefile.am
Обычный файл
@ -0,0 +1,51 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Cisco, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
AM_CPPFLAGS = $(pls_lsf_CPPFLAGS)
|
||||
|
||||
sources = \
|
||||
sds_lsf.h \
|
||||
sds_lsf_component.c \
|
||||
sds_lsf_module.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_sds_lsf_DSO
|
||||
component_noinst =
|
||||
component_install = mca_sds_lsf.la
|
||||
else
|
||||
component_noinst = libmca_sds_lsf.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sds_lsf_la_SOURCES = $(sources)
|
||||
mca_sds_lsf_la_LDFLAGS = -module -avoid-version $(sds_lsf_LDFLAGS)
|
||||
mca_sds_lsf_la_LIBADD = \
|
||||
$(sds_lsf_LIBS) \
|
||||
$(top_ompi_builddir)/orte/libopen-rte.la \
|
||||
$(top_ompi_builddir)/opal/libopen-pal.la
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sds_lsf_la_SOURCES =$(sources)
|
||||
libmca_sds_lsf_la_LDFLAGS = -module -avoid-version $(sds_lsf_LDFLAGS)
|
||||
libmca_sds_lsf_la_LIBADD = $(sds_lsf_LIBS)
|
38
orte/mca/sds/lsf/configure.m4
Обычный файл
38
orte/mca/sds/lsf/configure.m4
Обычный файл
@ -0,0 +1,38 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Cisco, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_sds_lsf_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_sds_lsf_CONFIG],[
|
||||
OMPI_CHECK_LSF([sds_lsf], [sds_lsf_good=1], [sds_lsf_good=0])
|
||||
|
||||
# If the check worked, set the wrapper flags.
|
||||
# Evaluate succeed / fail
|
||||
AS_IF([test "$sds_lsf_good" = "1"],
|
||||
[sds_lsf_WRAPPER_EXTRA_LDFLAGS="$sds_lsf_LDFLAGS"
|
||||
sds_lsf_WRAPPER_EXTRA_LIBS="$sds_lsf_LIBS"
|
||||
$1],
|
||||
[$2])
|
||||
|
||||
# set build flags to use in makefile
|
||||
AC_SUBST([sds_lsf_CPPFLAGS])
|
||||
AC_SUBST([sds_lsf_LDFLAGS])
|
||||
AC_SUBST([sds_lsf_LIBS])
|
||||
])dnl
|
22
orte/mca/sds/lsf/configure.params
Обычный файл
22
orte/mca/sds/lsf/configure.params
Обычный файл
@ -0,0 +1,22 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
PARAM_CONFIG_FILES="Makefile"
|
46
orte/mca/sds/lsf/sds_lsf.h
Обычный файл
46
orte/mca/sds/lsf/sds_lsf.h
Обычный файл
@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef ORTE_SDS_LSF_H
|
||||
#define ORTE_SDS_LSF_H
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sds_base_component_t mca_sds_lsf_component;
|
||||
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_sds_lsf_component_open(void);
|
||||
int orte_sds_lsf_component_close(void);
|
||||
orte_sds_base_module_t* orte_sds_lsf_component_init(int *priority);
|
||||
|
||||
/*
|
||||
* Startup / Shutdown
|
||||
*/
|
||||
int orte_sds_lsf_finalize(void);
|
||||
|
||||
/*
|
||||
* Module functions
|
||||
*/
|
||||
int orte_sds_lsf_set_name(void);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* ORTE_SDS_LSF_H */
|
102
orte/mca/sds/lsf/sds_lsf_component.c
Обычный файл
102
orte/mca/sds/lsf/sds_lsf_component.c
Обычный файл
@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <lsf/lsbatch.h>
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/mca/sds/sds.h"
|
||||
#include "orte/mca/sds/lsf/sds_lsf.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
extern orte_sds_base_module_t orte_sds_lsf_module;
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointers to our public functions in it
|
||||
*/
|
||||
orte_sds_base_component_t mca_sds_lsf_component = {
|
||||
{
|
||||
/* Indicate that we are a sds v1.0.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
ORTE_SDS_BASE_VERSION_1_0_0,
|
||||
|
||||
/* Component name and version */
|
||||
"lsf",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
orte_sds_lsf_component_open,
|
||||
orte_sds_lsf_component_close
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
/* Initialization / querying functions */
|
||||
orte_sds_lsf_component_init
|
||||
};
|
||||
|
||||
|
||||
/* Component open function: does no work and always returns ORTE_SUCCESS. */
int orte_sds_lsf_component_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
orte_sds_base_module_t *orte_sds_lsf_component_init(int *priority)
|
||||
{
|
||||
int id;
|
||||
char *mode;
|
||||
|
||||
/* check if lsf is running here */
|
||||
if (lsb_init("ORTE launcher") < 0) {
|
||||
/* nope, not here */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
id = mca_base_param_register_string("ns", "nds", NULL, NULL, NULL);
|
||||
mca_base_param_lookup_string(id, &mode);
|
||||
|
||||
if (NULL == mode || 0 != strcmp("lsf", mode)) {
|
||||
if (NULL != mode) {
|
||||
free(mode);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (NULL != mode) {
|
||||
free(mode);
|
||||
}
|
||||
*priority = 20;
|
||||
return &orte_sds_lsf_module;
|
||||
}
|
||||
|
||||
|
||||
/* Component close function: does no work and always returns ORTE_SUCCESS. */
int orte_sds_lsf_component_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
183
orte/mca/sds/lsf/sds_lsf_module.c
Обычный файл
183
orte/mca/sds/lsf/sds_lsf_module.c
Обычный файл
@ -0,0 +1,183 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <ctype.h>
|
||||
|
||||
#include <lsf/lsbatch.h>
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/mca/sds/sds.h"
|
||||
#include "orte/mca/sds/base/base.h"
|
||||
#include "orte/mca/sds/lsf/sds_lsf.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ns/base/base.h"
|
||||
#include "orte/util/sys_info.h"
|
||||
#include "opal/util/argv.h"
|
||||
|
||||
orte_sds_base_module_t orte_sds_lsf_module = {
|
||||
orte_sds_base_basic_contact_universe,
|
||||
orte_sds_lsf_set_name,
|
||||
orte_sds_lsf_finalize,
|
||||
};
|
||||
|
||||
static char *get_lsf_nodename(int nodeid);
|
||||
|
||||
|
||||
int orte_sds_lsf_set_name(void)
|
||||
{
|
||||
int rc;
|
||||
int id;
|
||||
char* name_string = NULL;
|
||||
int lsf_nodeid;
|
||||
|
||||
/* start by getting our cellid, jobid, and vpid (which is the
|
||||
starting vpid for the list of daemons) */
|
||||
id = mca_base_param_register_string("ns", "nds", "name", NULL, NULL);
|
||||
mca_base_param_lookup_string(id, &name_string);
|
||||
|
||||
if (name_string != NULL) {
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_ns.convert_string_to_process_name(&(orte_process_info.my_name),
|
||||
name_string))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(name_string);
|
||||
return rc;
|
||||
}
|
||||
free(name_string);
|
||||
} else {
|
||||
orte_cellid_t cellid;
|
||||
orte_jobid_t jobid;
|
||||
orte_vpid_t vpid;
|
||||
char* cellid_string;
|
||||
char* jobid_string;
|
||||
char* vpid_string;
|
||||
|
||||
id = mca_base_param_register_string("ns", "nds", "cellid", NULL, NULL);
|
||||
mca_base_param_lookup_string(id, &cellid_string);
|
||||
if (NULL == cellid_string) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_ns.convert_string_to_cellid(&cellid, cellid_string))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return(rc);
|
||||
}
|
||||
|
||||
id = mca_base_param_register_string("ns", "nds", "jobid", NULL, NULL);
|
||||
mca_base_param_lookup_string(id, &jobid_string);
|
||||
if (NULL == jobid_string) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_ns.convert_string_to_jobid(&jobid, jobid_string))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return(rc);
|
||||
}
|
||||
|
||||
id = mca_base_param_register_string("ns", "nds", "vpid", NULL, NULL);
|
||||
mca_base_param_lookup_string(id, &vpid_string);
|
||||
if (NULL == vpid_string) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_ns.convert_string_to_vpid(&vpid, vpid_string))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return(rc);
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_ns.create_process_name(&(orte_process_info.my_name),
|
||||
cellid, jobid, vpid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
/* fix up the base name and make it the "real" name */
|
||||
lsf_nodeid = atoi(getenv("LSB_JOBINDEX"));
|
||||
orte_process_info.my_name->vpid += lsf_nodeid;
|
||||
|
||||
/* fix up the system info nodename to match exactly what lsf returned */
|
||||
if (NULL != orte_system_info.nodename) {
|
||||
free(orte_system_info.nodename);
|
||||
}
|
||||
orte_system_info.nodename = get_lsf_nodename(lsf_nodeid);
|
||||
|
||||
/* get the non-name common environmental variables */
|
||||
if (ORTE_SUCCESS != (rc = orte_sds_env_get())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* Module finalize function: does no work and always returns ORTE_SUCCESS. */
int orte_sds_lsf_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Return the name of node number nodeid (0-based) from the comma-separated
 * list held in the OMPI_MCA_orte_lsf_nodelist environment variable.
 *
 * The result is a freshly strdup()'ed string that the caller must free().
 * NULL is returned when the variable is unset, the list cannot be split,
 * nodeid is outside the list, or the copy fails.
 */
static char *get_lsf_nodename(int nodeid)
{
    char **names = NULL;
    char *lsf_nodelist;
    char *ret = NULL;

    lsf_nodelist = getenv("OMPI_MCA_orte_lsf_nodelist");
    if (NULL == lsf_nodelist) {
        return NULL;
    }

    /* split the node list into an argv array */
    names = opal_argv_split(lsf_nodelist, ',');
    if (NULL == names) {  /* got an error */
        return NULL;
    }

    /* valid indices are 0 .. count-1; names[count] is the NULL
       terminator and must never be handed to strdup() */
    if (nodeid >= 0 && nodeid < opal_argv_count(names)) {
        ret = strdup(names[nodeid]);
    }

    /* release the split list on every path, including out-of-range */
    opal_argv_free(names);

    return ret;
}
|
Загрузка…
Ссылка в новой задаче
Block a user