Add ORTE ALPS support (Cray XT CNL)
This commit was SVN r17482.
Этот коммит содержится в:
родитель
cec3d96a94
Коммит
18d1d3b408
49
orte/mca/pls/alps/Makefile.am
Обычный файл
49
orte/mca/pls/alps/Makefile.am
Обычный файл
@ -0,0 +1,49 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
# Files that make up the alps pls component.
sources = \
        pls_alps.h \
        pls_alps_component.c \
        pls_alps_module.c

# Help messages installed alongside the package data.
dist_pkgdata_DATA = help-pls-alps.txt

# The component is built in this directory either as a DSO named
# mca_<type>_<name>.la or as a static convenience library named
# libmca_<type>_<name>.la; exactly one of the two variables below is
# non-empty.

if OMPI_BUILD_pls_alps_DSO
component_noinst =
component_install = mca_pls_alps.la
else
component_noinst = libmca_pls_alps.la
component_install =
endif

# DSO build
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_pls_alps_la_SOURCES = $(sources)
mca_pls_alps_la_LDFLAGS = -module -avoid-version
mca_pls_alps_la_LIBADD = \
        $(top_ompi_builddir)/orte/libopen-rte.la \
        $(top_ompi_builddir)/opal/libopen-pal.la

# Static build
noinst_LTLIBRARIES = $(component_noinst)
libmca_pls_alps_la_SOURCES = $(sources)
libmca_pls_alps_la_LDFLAGS = -module -avoid-version
|
25
orte/mca/pls/alps/configure.m4
Обычный файл
25
orte/mca/pls/alps/configure.m4
Обычный файл
@ -0,0 +1,25 @@
|
|||||||
|
# -*- shell-script -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
# MCA_pls_alps_CONFIG([action-if-found], [action-if-not-found])
|
||||||
|
# -----------------------------------------------------------
|
||||||
|
AC_DEFUN([MCA_pls_alps_CONFIG], [
    OMPI_CHECK_ALPS([pls_alps], [$1], [$2])
])dnl
|
23
orte/mca/pls/alps/configure.params
Обычный файл
23
orte/mca/pls/alps/configure.params
Обычный файл
@ -0,0 +1,23 @@
|
|||||||
|
# -*- shell-script -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
PARAM_CONFIG_FILES="Makefile"
|
42
orte/mca/pls/alps/help-pls-alps.txt
Обычный файл
42
orte/mca/pls/alps/help-pls-alps.txt
Обычный файл
@ -0,0 +1,42 @@
|
|||||||
|
# -*- text -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
[multiple-prefixes]
|
||||||
|
The ALPS process starter for Open MPI does not support multiple
|
||||||
|
different --prefix options to mpirun. You can specify at most one
|
||||||
|
unique value for the --prefix option (in any of the application
|
||||||
|
contexts); it will be applied to all the application contexts of your
|
||||||
|
parallel job.
|
||||||
|
|
||||||
|
Put simply, you must have Open MPI installed in the same location on
|
||||||
|
all of your ALPS nodes.
|
||||||
|
|
||||||
|
Multiple different --prefix options were specified to mpirun. This is
|
||||||
|
a fatal error for the ALPS process starter in Open MPI.
|
||||||
|
|
||||||
|
The first two prefix values supplied were:
|
||||||
|
%s
|
||||||
|
and %s
|
||||||
|
#
|
||||||
|
[no-hosts-in-list]
|
||||||
|
The ALPS process starter for Open MPI didn't find any hosts in
|
||||||
|
the map for this application. This can be caused by a lack of
|
||||||
|
an allocation, or by an error in the Open MPI code. Please check
|
||||||
|
to ensure you have an ALPS allocation. If you do, then please pass
|
||||||
|
the error to the Open MPI user's mailing list for assistance.
|
54
orte/mca/pls/alps/pls_alps.h
Обычный файл
54
orte/mca/pls/alps/pls_alps.h
Обычный файл
@ -0,0 +1,54 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef ORTE_PLS_ALPS_EXPORT_H
|
||||||
|
#define ORTE_PLS_ALPS_EXPORT_H
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include "opal/mca/mca.h"
|
||||||
|
#include "orte/mca/pls/pls.h"
|
||||||
|
|
||||||
|
#if defined(c_plusplus) || defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct orte_pls_alps_component_t {
|
||||||
|
orte_pls_base_component_t super;
|
||||||
|
int priority;
|
||||||
|
int debug;
|
||||||
|
bool timing;
|
||||||
|
char *orted;
|
||||||
|
char *custom_args;
|
||||||
|
};
|
||||||
|
typedef struct orte_pls_alps_component_t orte_pls_alps_component_t;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Globally exported variable
|
||||||
|
*/
|
||||||
|
|
||||||
|
ORTE_MODULE_DECLSPEC extern orte_pls_alps_component_t
|
||||||
|
mca_pls_alps_component;
|
||||||
|
ORTE_DECLSPEC extern orte_pls_base_module_t
|
||||||
|
orte_pls_alps_module;
|
||||||
|
|
||||||
|
#if defined(c_plusplus) || defined(__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#endif /* ORTE_PLS_ALPS_EXPORT_H */
|
167
orte/mca/pls/alps/pls_alps_component.c
Обычный файл
167
orte/mca/pls/alps/pls_alps_component.c
Обычный файл
@ -0,0 +1,167 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*
|
||||||
|
* These symbols are in a file by themselves to provide nice linker
|
||||||
|
* semantics. Since linkers generally pull in symbols by object
|
||||||
|
* files, keeping these symbols as the only symbols in this file
|
||||||
|
* prevents utility programs such as "ompi_info" from having to import
|
||||||
|
* entire components just to query their version and parameters.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include "opal/util/output.h"
|
||||||
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
|
#include "orte/orte_constants.h"
|
||||||
|
|
||||||
|
#include "orte/util/proc_info.h"
|
||||||
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
|
|
||||||
|
#include "orte/mca/pls/pls.h"
|
||||||
|
#include "orte/mca/pls/base/base.h"
|
||||||
|
#include "orte/mca/pls/base/pls_private.h"
|
||||||
|
#include "pls_alps.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Public string showing the pls ompi_alps component version number
|
||||||
|
*/
|
||||||
|
const char *mca_pls_alps_component_version_string =
|
||||||
|
"Open MPI alps pls MCA component version " ORTE_VERSION;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local functions
|
||||||
|
*/
|
||||||
|
static int pls_alps_open(void);
|
||||||
|
static int pls_alps_close(void);
|
||||||
|
static orte_pls_base_module_t *pls_alps_init(int *priority);
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Instantiate the public struct with all of our public information
|
||||||
|
* and pointers to our public functions in it
|
||||||
|
*/
|
||||||
|
|
||||||
|
orte_pls_alps_component_t mca_pls_alps_component = {
|
||||||
|
|
||||||
|
{
|
||||||
|
/* First, the mca_component_t struct containing meta
|
||||||
|
information about the component itself */
|
||||||
|
|
||||||
|
{
|
||||||
|
/* Indicate that we are a pls v1.3.0 component (which also
|
||||||
|
implies a specific MCA version) */
|
||||||
|
|
||||||
|
ORTE_PLS_BASE_VERSION_1_3_0,
|
||||||
|
|
||||||
|
/* Component name and version */
|
||||||
|
|
||||||
|
"alps",
|
||||||
|
ORTE_MAJOR_VERSION,
|
||||||
|
ORTE_MINOR_VERSION,
|
||||||
|
ORTE_RELEASE_VERSION,
|
||||||
|
|
||||||
|
/* Component open and close functions */
|
||||||
|
|
||||||
|
pls_alps_open,
|
||||||
|
pls_alps_close
|
||||||
|
},
|
||||||
|
|
||||||
|
/* Next the MCA v1.0.0 component meta data */
|
||||||
|
|
||||||
|
{
|
||||||
|
/* The component is checkpoint ready */
|
||||||
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||||
|
},
|
||||||
|
|
||||||
|
/* Initialization / querying functions */
|
||||||
|
|
||||||
|
pls_alps_init
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Other orte_pls_alps_component_t items -- left uninitialized
|
||||||
|
here; will be initialized in pls_alps_open() */
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
static int pls_alps_open(void)
|
||||||
|
{
|
||||||
|
mca_base_component_t *comp = &mca_pls_alps_component.super.pls_version;
|
||||||
|
int tmp, value;
|
||||||
|
|
||||||
|
mca_base_param_reg_int(comp, "debug", "Enable debugging of alps pls",
|
||||||
|
false, false, 0,
|
||||||
|
&mca_pls_alps_component.debug);
|
||||||
|
if (mca_pls_alps_component.debug == 0) {
|
||||||
|
mca_base_param_reg_int_name("orte", "debug",
|
||||||
|
"Whether or not to enable debugging output for all ORTE components (0 or 1)",
|
||||||
|
false, false, false, &mca_pls_alps_component.debug);
|
||||||
|
}
|
||||||
|
|
||||||
|
mca_base_param_reg_int(comp, "priority", "Default selection priority",
|
||||||
|
false, false, 75,
|
||||||
|
&mca_pls_alps_component.priority);
|
||||||
|
|
||||||
|
mca_base_param_reg_string(comp, "orted",
|
||||||
|
"Command to use to start proxy orted",
|
||||||
|
false, false, "orted",
|
||||||
|
&mca_pls_alps_component.orted);
|
||||||
|
|
||||||
|
tmp = mca_base_param_reg_int_name("orte", "timing",
|
||||||
|
"Request that critical timing loops be measured",
|
||||||
|
false, false, 0, &value);
|
||||||
|
if (value != 0) {
|
||||||
|
mca_pls_alps_component.timing = true;
|
||||||
|
} else {
|
||||||
|
mca_pls_alps_component.timing = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
mca_base_param_reg_string(comp, "args",
|
||||||
|
"Custom arguments to srun",
|
||||||
|
false, false, NULL,
|
||||||
|
&mca_pls_alps_component.custom_args);
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static orte_pls_base_module_t *pls_alps_init(int *priority)
|
||||||
|
{
|
||||||
|
/* if we are NOT an HNP, then don't select us */
|
||||||
|
if (!orte_process_info.seed) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
*priority = mca_pls_alps_component.priority;
|
||||||
|
return &orte_pls_alps_module;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static int pls_alps_close(void)
|
||||||
|
{
|
||||||
|
if (NULL != mca_pls_alps_component.orted) {
|
||||||
|
free(mca_pls_alps_component.orted);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (NULL != mca_pls_alps_component.custom_args) {
|
||||||
|
free(mca_pls_alps_component.custom_args);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
630
orte/mca/pls/alps/pls_alps_module.c
Обычный файл
630
orte/mca/pls/alps/pls_alps_module.c
Обычный файл
@ -0,0 +1,630 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
|
||||||
|
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*
|
||||||
|
* These symbols are in a file by themselves to provide nice linker
|
||||||
|
* semantics. Since linkers generally pull in symbols by object
|
||||||
|
* files, keeping these symbols as the only symbols in this file
|
||||||
|
* prevents utility programs such as "ompi_info" from having to import
|
||||||
|
* entire components just to query their version and parameters.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
#include "orte/orte_constants.h"
|
||||||
|
#include "orte/orte_types.h"
|
||||||
|
|
||||||
|
#include <sys/types.h>
|
||||||
|
#ifdef HAVE_UNISTD_H
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
#include <signal.h>
|
||||||
|
#ifdef HAVE_STDLIB_H
|
||||||
|
#include <stdlib.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_SYS_TYPES_H
|
||||||
|
#include <sys/types.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_SYS_TIME_H
|
||||||
|
#include <sys/time.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_SYS_STAT_H
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_FCNTL_H
|
||||||
|
#include <fcntl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "opal/mca/installdirs/installdirs.h"
|
||||||
|
#include "opal/util/argv.h"
|
||||||
|
#include "opal/util/output.h"
|
||||||
|
#include "opal/util/opal_environ.h"
|
||||||
|
#include "opal/util/path.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
|
#include "opal/util/basename.h"
|
||||||
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
|
|
||||||
|
#include "orte/runtime/params.h"
|
||||||
|
#include "orte/runtime/runtime.h"
|
||||||
|
#include "orte/runtime/orte_wakeup.h"
|
||||||
|
#include "orte/runtime/orte_wait.h"
|
||||||
|
#include "orte/mca/ns/base/base.h"
|
||||||
|
#include "orte/mca/rml/rml.h"
|
||||||
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
|
#include "orte/mca/smr/smr.h"
|
||||||
|
#include "orte/mca/rmaps/rmaps.h"
|
||||||
|
|
||||||
|
#include "orte/mca/pls/pls.h"
|
||||||
|
#include "orte/mca/pls/base/base.h"
|
||||||
|
#include "orte/mca/pls/base/pls_private.h"
|
||||||
|
#include "pls_alps.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local functions
|
||||||
|
*/
|
||||||
|
static int pls_alps_launch_job(orte_jobid_t jobid);
|
||||||
|
static int pls_alps_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
|
||||||
|
static int pls_alps_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
|
||||||
|
static int pls_alps_terminate_proc(const orte_process_name_t *name);
|
||||||
|
static int pls_alps_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
|
||||||
|
static int pls_alps_signal_proc(const orte_process_name_t *name, int32_t signal);
|
||||||
|
static int pls_alps_finalize(void);
|
||||||
|
|
||||||
|
static int pls_alps_start_proc(int argc, char **argv, char **env,
|
||||||
|
char *prefix);
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Global variable
|
||||||
|
*/
|
||||||
|
orte_pls_base_module_1_3_0_t orte_pls_alps_module = {
|
||||||
|
pls_alps_launch_job,
|
||||||
|
pls_alps_terminate_job,
|
||||||
|
pls_alps_terminate_orteds,
|
||||||
|
pls_alps_terminate_proc,
|
||||||
|
pls_alps_signal_job,
|
||||||
|
pls_alps_signal_proc,
|
||||||
|
pls_alps_finalize
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local variables
|
||||||
|
*/
|
||||||
|
static pid_t alps_pid = 0;
|
||||||
|
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
|
||||||
|
static bool failed_launch;
|
||||||
|
|
||||||
|
|
||||||
|
/* When working in this function, ALWAYS jump to "cleanup" if
|
||||||
|
* you encounter an error so that orterun will be woken up and
|
||||||
|
* the job can cleanly terminate
|
||||||
|
*/
|
||||||
|
/*
 * Launch a job: start one orted per new node through a single aprun
 * invocation, wait for the daemons to call back, then launch the
 * applications.
 *
 * jobid  - job to launch; also recorded in active_job.
 *
 * Returns ORTE_SUCCESS on success or an ORTE error code.  Every error
 * path leaves through "cleanup", which releases the local resources and,
 * while failed_launch is still true, reports the failure via
 * orte_pls_base_daemon_failed().
 */
static int pls_alps_launch_job(orte_jobid_t jobid)
{
    orte_job_map_t *map = NULL;
    opal_list_item_t *item;
    size_t num_nodes;
    char *param;
    char **argv = NULL;
    int argc;
    int rc;
    char *tmp;
    char **env = NULL;
    char *var;
    char *nodelist_flat = NULL;
    char **nodelist_argv;
    int nodelist_argc;
    orte_process_name_t name;
    char *name_string;
    char **custom_strings;
    int num_args, i;
    char *cur_prefix;
    struct timeval joblaunchstart = {0, 0};
    struct timeval launchstart = {0, 0};
    struct timeval launchstop;
    /* set once launchstart holds a real timestamp; it stays false when
       the daemon launch is skipped entirely */
    bool have_launchstart = false;
    int proc_name_index = 0;

    if (mca_pls_alps_component.timing) {
        if (0 != gettimeofday(&joblaunchstart, NULL)) {
            opal_output(0, "pls_alps: could not obtain job start time");
        }
    }

    /* save the active jobid */
    active_job = jobid;

    /* assume failure until the whole launch has gone through */
    failed_launch = true;

    /* Query the map for this job.
     * We need the entire mapping for a couple of reasons:
     *  - need the prefix to start with.
     *  - need to know if we are launching on a subset of the allocated nodes
     */
    rc = orte_rmaps.get_job_map(&map, jobid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    num_nodes = map->num_new_daemons;
    if (num_nodes == 0) {
        /* no new daemons required - just launch apps */
        goto launch_apps;
    }

    /*
     * start building argv array
     */
    argv = NULL;
    argc = 0;

    /*
     * ALPS aprun OPTIONS
     */

    /* add the aprun command */
    opal_argv_append(&argc, &argv, "aprun");

    /* Append user defined arguments to aprun */
    if (NULL != mca_pls_alps_component.custom_args) {
        custom_strings = opal_argv_split(mca_pls_alps_component.custom_args, ' ');
        num_args = opal_argv_count(custom_strings);
        for (i = 0; i < num_args; ++i) {
            opal_argv_append(&argc, &argv, custom_strings[i]);
        }
        opal_argv_free(custom_strings);
    }

    /* number of processors needed */
    asprintf(&tmp, "-n %lu", (unsigned long) num_nodes);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);
    opal_argv_append(&argc, &argv, "-N 1");

    /* create nodelist */
    nodelist_argv = NULL;
    nodelist_argc = 0;

    for (item = opal_list_get_first(&map->nodes);
         item != opal_list_get_end(&map->nodes);
         item = opal_list_get_next(item)) {
        orte_mapped_node_t *node = (orte_mapped_node_t *) item;

        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (node->daemon_preexists) {
            continue;
        }

        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append(&nodelist_argc, &nodelist_argv, node->nodename);
    }
    if (0 == opal_argv_count(nodelist_argv)) {
        opal_show_help("help-pls-alps.txt", "no-hosts-in-list", true);
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    nodelist_flat = opal_argv_join(nodelist_argv, ',');
    opal_argv_free(nodelist_argv);
    asprintf(&tmp, "-L %s", nodelist_flat);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    opal_argv_append(&argc, &argv, mca_pls_alps_component.orted);

    /* ensure we don't lose contact */
    orte_no_daemonize_flag = true;

    /* Add basic orted command line options, including debug flags */
    orte_pls_base_orted_append_basic_args(&argc, &argv,
                                          &proc_name_index,
                                          NULL);

    /* force orted to use the alps nds */
    opal_argv_append(&argc, &argv, "--ns-nds");
    opal_argv_append(&argc, &argv, "alps");

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    name.jobid = 0;
    name.vpid = map->daemon_vpid_start;
    rc = orte_ns.get_proc_name_string(&name_string, &name);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "pls_alps: unable to create process name");
        goto cleanup;
    }

    free(argv[proc_name_index]);
    argv[proc_name_index] = strdup(name_string);
    free(name_string);

    if (mca_pls_alps_component.debug) {
        param = opal_argv_join(argv, ' ');
        if (NULL != param) {
            opal_output(0, "pls:alps: final top-level argv:");
            opal_output(0, "pls:alps:     %s", param);
            free(param);
        }
    }

    /* Copy the prefix-directory specified in the corresponding
       app_context.  If the app contexts carry multiple, different
       prefixes, complain: only one --prefix is allowed for the entire
       alps run; different prefixes for different nodes are not
       supported by the ALPS pls. */
    cur_prefix = NULL;
    for (i = 0; i < map->num_apps; i++) {
        char *app_prefix_dir = map->apps[i]->prefix_dir;

        if (NULL != app_prefix_dir) {
            /* a prefix was already chosen -- a different one is fatal */
            if (NULL != cur_prefix &&
                0 != strcmp(cur_prefix, app_prefix_dir)) {
                opal_show_help("help-pls-alps.txt", "multiple-prefixes",
                               true, cur_prefix, app_prefix_dir);
                /* Leave through cleanup (not a bare return) so that the
                   map, argv and nodelist are released and the failed
                   launch is reported. */
                free(cur_prefix);
                rc = ORTE_ERR_FATAL;
                goto cleanup;
            }

            /* If not yet set, copy it; if set, then it's the
               same anyway */
            if (NULL == cur_prefix) {
                cur_prefix = strdup(app_prefix_dir);
                if (mca_pls_alps_component.debug) {
                    opal_output(0, "pls:alps: Set prefix:%s",
                                cur_prefix);
                }
            }
        }
    }

    /* setup environment */
    env = opal_argv_copy(environ);

    /* purge it of any params not for orteds */
    orte_pls_base_purge_mca_params(&env);

    /* add the nodelist */
    var = mca_base_param_environ_variable("orte", "alps", "nodelist");
    opal_setenv(var, nodelist_flat, true, &env);
    free(nodelist_flat);
    nodelist_flat = NULL;   /* already released; keep cleanup from freeing it again */
    free(var);

    if (mca_pls_alps_component.timing) {
        if (0 != gettimeofday(&launchstart, NULL)) {
            opal_output(0, "pls_alps: could not obtain start time");
        } else {
            have_launchstart = true;
        }
    }

    /* exec the daemon(s) */
    /* NOTE(review): cur_prefix is never freed after this call because
       pls_alps_start_proc() is not visible here; free it afterwards if
       the callee does not retain the pointer. */
    if (ORTE_SUCCESS != (rc = pls_alps_start_proc(argc, argv, env, cur_prefix))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* do NOT wait for alps to complete. Alps only completes when the processes
     * it starts - in this case, the orteds - complete. Instead, we'll catch
     * any alps failures and deal with them elsewhere
     */

    /* wait for daemons to callback */
    if (ORTE_SUCCESS != (rc = orte_pls_base_daemon_callback(map->num_new_daemons))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

launch_apps:
    if (ORTE_SUCCESS != (rc = orte_pls_base_launch_apps(map))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* declare the launch a success */
    failed_launch = false;

    if (mca_pls_alps_component.timing) {
        if (0 != gettimeofday(&launchstop, NULL)) {
            opal_output(0, "pls_alps: could not obtain stop time");
        } else {
            /* the daemon figure only makes sense when daemons were
               actually launched and timed in this call */
            if (have_launchstart) {
                opal_output(0, "pls_alps: daemon block launch time is %ld usec",
                            (long) ((launchstop.tv_sec - launchstart.tv_sec)*1000000 +
                                    (launchstop.tv_usec - launchstart.tv_usec)));
            }
            opal_output(0, "pls_alps: total job launch time is %ld usec",
                        (long) ((launchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
                                (launchstop.tv_usec - joblaunchstart.tv_usec)));
        }
    }

    /* JMS: should we stash the alps pid in the gpr somewhere for cleanup? */

cleanup:
    if (NULL != map) {
        OBJ_RELEASE(map);
    }
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }
    /* only still set when we bailed out before handing it to the env */
    free(nodelist_flat);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        orte_pls_base_daemon_failed(jobid, false, -1, 0, ORTE_JOB_STATE_FAILED_TO_START);
    }

    return rc;
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Terminate a job by asking the daemons to kill their local processes
 * belonging to it.  Returns the result of that request; a failure is
 * also logged.
 */
static int pls_alps_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
{
    int rc = orte_pls_base_orted_kill_local_procs(jobid, timeout, attrs);

    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Terminate the orteds for a given job
|
||||||
|
*/
|
||||||
|
static int pls_alps_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
    int rc;

    /* Cancel the waitpid callback first so that the launcher exiting
     * because of this shutdown is not reported as a failure.  The result
     * of the cancel is deliberately not logged: the launcher may already
     * be gone, and logging would only produce confusing duplicate
     * messages.
     */
    orte_wait_cb_cancel(alps_pid);

    /* now order the daemons to exit */
    rc = orte_pls_base_orted_exit(timeout, attrs);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    return rc;
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The way we've used ALPS, we can't kill individual processes --
|
||||||
|
* we'll kill the entire job
|
||||||
|
*/
|
||||||
|
static int pls_alps_terminate_proc(const orte_process_name_t *name)
|
||||||
|
{
|
||||||
|
opal_output(0, "pls:alps:terminate_proc: not supported");
|
||||||
|
return ORTE_ERR_NOT_SUPPORTED;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Signal all the processes in the child alps by sending the signal directly to it
|
||||||
|
*/
|
||||||
|
/*
 * Deliver a signal to the launcher child process recorded in alps_pid.
 *
 * jobid and attrs are not used.  Nothing is sent when no child pid is
 * recorded.  Always returns ORTE_SUCCESS; the result of kill() is not
 * reported.
 */
static int pls_alps_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs)
{
    /* Require a strictly positive pid: 0 means no child was started, and
     * a negative value would make kill() address a process group or,
     * for -1, every process we are allowed to signal.
     */
    if (alps_pid > 0) {
        kill(alps_pid, (int) signal);
    }

    return ORTE_SUCCESS;
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Signal a specific process
|
||||||
|
*/
|
||||||
|
static int pls_alps_signal_proc(const orte_process_name_t *name, int32_t signal)
|
||||||
|
{
|
||||||
|
opal_output(0, "pls:alps:signal_proc: not supported");
|
||||||
|
return ORTE_ERR_NOT_SUPPORTED;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static int pls_alps_finalize(void)
|
||||||
|
{
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
/* cleanup any pending recvs */
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_pls_base_comm_stop())) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void alps_wait_cb(pid_t pid, int status, void* cbdata){
|
||||||
|
/* According to the ALPS folks, alps always returns the highest exit
|
||||||
|
code of our remote processes. Thus, a non-zero exit status doesn't
|
||||||
|
necessarily mean that alps failed - it could be that an orted returned
|
||||||
|
a non-zero exit status. Of course, that means the orted failed(!), so
|
||||||
|
the end result is the same - the job didn't start.
|
||||||
|
|
||||||
|
As a result, we really can't do much with the exit status itself - it
|
||||||
|
could be something in errno (if alps itself failed), or it could be
|
||||||
|
something returned by an orted, or it could be something returned by
|
||||||
|
the OS (e.g., couldn't find the orted binary). Somebody is welcome
|
||||||
|
to sort out all the options and pretty-print a better error message. For
|
||||||
|
now, though, the only thing that really matters is that
|
||||||
|
alps failed. Report the error and make sure that orterun
|
||||||
|
wakes up - otherwise, do nothing!
|
||||||
|
*/
|
||||||
|
|
||||||
|
if (0 != status) {
|
||||||
|
if (failed_launch) {
|
||||||
|
/* we have a problem during launch */
|
||||||
|
opal_output(0, "ERROR: alps failed to start the required daemons.");
|
||||||
|
opal_output(0, "ERROR: This could be due to an inability to find the orted binary");
|
||||||
|
opal_output(0, "ERROR: on one or more remote nodes, lack of authority to execute");
|
||||||
|
opal_output(0, "ERROR: on one or more specified nodes, or other factors.");
|
||||||
|
|
||||||
|
/* report that the daemon has failed so we break out of the daemon
|
||||||
|
* callback receive and exit
|
||||||
|
*/
|
||||||
|
orte_pls_base_daemon_failed(active_job, true, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
/* an orted must have died unexpectedly after launch - report
|
||||||
|
* that the daemon has failed so we exit
|
||||||
|
*/
|
||||||
|
orte_pls_base_daemon_failed(active_job, false, pid, status, ORTE_JOB_STATE_ABORTED);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static int pls_alps_start_proc(int argc, char **argv, char **env,
|
||||||
|
char *prefix)
|
||||||
|
{
|
||||||
|
int fd;
|
||||||
|
char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
|
||||||
|
|
||||||
|
if (NULL == exec_argv) {
|
||||||
|
return ORTE_ERR_NOT_FOUND;
|
||||||
|
}
|
||||||
|
|
||||||
|
alps_pid = fork();
|
||||||
|
if (-1 == alps_pid) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
|
||||||
|
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (0 == alps_pid) { /* child */
|
||||||
|
char *bin_base = NULL, *lib_base = NULL;
|
||||||
|
|
||||||
|
/* Figure out the basenames for the libdir and bindir. There
|
||||||
|
is a lengthy comment about this in pls_rsh_module.c
|
||||||
|
explaining all the rationale for how / why we're doing
|
||||||
|
this. */
|
||||||
|
|
||||||
|
lib_base = opal_basename(opal_install_dirs.libdir);
|
||||||
|
bin_base = opal_basename(opal_install_dirs.bindir);
|
||||||
|
|
||||||
|
/* If we have a prefix, then modify the PATH and
|
||||||
|
LD_LIBRARY_PATH environment variables. */
|
||||||
|
if (NULL != prefix) {
|
||||||
|
char *oldenv, *newenv;
|
||||||
|
|
||||||
|
/* Reset PATH */
|
||||||
|
oldenv = getenv("PATH");
|
||||||
|
if (NULL != oldenv) {
|
||||||
|
asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);
|
||||||
|
} else {
|
||||||
|
asprintf(&newenv, "%s/%s", prefix, bin_base);
|
||||||
|
}
|
||||||
|
opal_setenv("PATH", newenv, true, &env);
|
||||||
|
if (mca_pls_alps_component.debug) {
|
||||||
|
opal_output(0, "pls:alps: reset PATH: %s", newenv);
|
||||||
|
}
|
||||||
|
free(newenv);
|
||||||
|
|
||||||
|
/* Reset LD_LIBRARY_PATH */
|
||||||
|
oldenv = getenv("LD_LIBRARY_PATH");
|
||||||
|
if (NULL != oldenv) {
|
||||||
|
asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv);
|
||||||
|
} else {
|
||||||
|
asprintf(&newenv, "%s/%s", prefix, lib_base);
|
||||||
|
}
|
||||||
|
opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
|
||||||
|
if (mca_pls_alps_component.debug) {
|
||||||
|
opal_output(0, "pls:alps: reset LD_LIBRARY_PATH: %s",
|
||||||
|
newenv);
|
||||||
|
}
|
||||||
|
free(newenv);
|
||||||
|
}
|
||||||
|
|
||||||
|
fd = open("/dev/null", O_CREAT|O_WRONLY|O_TRUNC, 0666);
|
||||||
|
if(fd > 0) {
|
||||||
|
dup2(fd, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* When not in debug mode and --debug-daemons was not passed,
|
||||||
|
* tie stdout/stderr to dev null so we don't see messages from orted */
|
||||||
|
if (0 == mca_pls_alps_component.debug && !orte_debug_daemons_flag) {
|
||||||
|
if (fd >= 0) {
|
||||||
|
if (fd != 1) {
|
||||||
|
dup2(fd,1);
|
||||||
|
}
|
||||||
|
if (fd != 2) {
|
||||||
|
dup2(fd,2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fd > 2) {
|
||||||
|
close(fd);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* get the alps process out of orterun's process group so that
|
||||||
|
signals sent from the shell (like those resulting from
|
||||||
|
cntl-c) don't get sent to alps */
|
||||||
|
setpgid(0, 0);
|
||||||
|
|
||||||
|
|
||||||
|
char* param = opal_argv_join(argv, ';');
|
||||||
|
execve(exec_argv, argv, env);
|
||||||
|
|
||||||
|
opal_output(0, "pls:alps:start_proc: exec failed");
|
||||||
|
/* don't return - need to exit - returning would be bad -
|
||||||
|
we're not in the calling process anymore */
|
||||||
|
exit(1);
|
||||||
|
} else { /* parent */
|
||||||
|
/* just in case, make sure that the alps process is not in our
|
||||||
|
process group any more. Stevens says always do this on both
|
||||||
|
sides of the fork... */
|
||||||
|
setpgid(alps_pid, alps_pid);
|
||||||
|
|
||||||
|
/* setup the waitpid so we can find out if alps succeeds! */
|
||||||
|
orte_wait_cb(alps_pid, alps_wait_cb, NULL);
|
||||||
|
free(exec_argv);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
58
orte/mca/ras/alps/Makefile.am
Обычный файл
58
orte/mca/ras/alps/Makefile.am
Обычный файл
@ -0,0 +1,58 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
AM_CPPFLAGS = $(ras_alps_CPPFLAGS)
|
||||||
|
|
||||||
|
dist_pkgdata_DATA = help-ras-alps.txt
|
||||||
|
|
||||||
|
sources = \
|
||||||
|
ras_alps.h \
|
||||||
|
ras_alps_component.c \
|
||||||
|
ras_alps_module.c
|
||||||
|
|
||||||
|
|
||||||
|
# Make the output library in this directory, and name it either
|
||||||
|
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||||
|
# (for static builds).
|
||||||
|
|
||||||
|
if OMPI_BUILD_ras_alps_DSO
|
||||||
|
lib =
|
||||||
|
lib_sources =
|
||||||
|
component = mca_ras_alps.la
|
||||||
|
component_sources = $(sources)
|
||||||
|
else
|
||||||
|
lib = libmca_ras_alps.la
|
||||||
|
lib_sources = $(sources)
|
||||||
|
component =
|
||||||
|
component_sources =
|
||||||
|
endif
|
||||||
|
|
||||||
|
mcacomponentdir = $(pkglibdir)
|
||||||
|
mcacomponent_LTLIBRARIES = $(component)
|
||||||
|
mca_ras_alps_la_SOURCES = $(component_sources)
|
||||||
|
mca_ras_alps_la_LDFLAGS = -module -avoid-version $(ras_alps_LDFLAGS)
|
||||||
|
mca_ras_alps_la_LIBADD = \
|
||||||
|
$(ras_alps_LIBS) \
|
||||||
|
$(top_ompi_builddir)/orte/libopen-rte.la \
|
||||||
|
$(top_ompi_builddir)/opal/libopen-pal.la
|
||||||
|
|
||||||
|
noinst_LTLIBRARIES = $(lib)
|
||||||
|
libmca_ras_alps_la_SOURCES = $(lib_sources)
|
||||||
|
libmca_ras_alps_la_LDFLAGS = -module -avoid-version $(ras_alps_LDFLAGS)
|
||||||
|
libmca_ras_alps_la_LIBADD = $(ras_alps_LIBS)
|
25
orte/mca/ras/alps/configure.m4
Обычный файл
25
orte/mca/ras/alps/configure.m4
Обычный файл
@ -0,0 +1,25 @@
|
|||||||
|
# -*- shell-script -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
# MCA_ras_alps_CONFIG([action-if-found], [action-if-not-found])
|
||||||
|
# -----------------------------------------------------------
|
||||||
|
AC_DEFUN([MCA_ras_alps_CONFIG],[
|
||||||
|
OMPI_CHECK_ALPS([ras_alps], [$1], [$2])
|
||||||
|
])dnl
|
23
orte/mca/ras/alps/configure.params
Обычный файл
23
orte/mca/ras/alps/configure.params
Обычный файл
@ -0,0 +1,23 @@
|
|||||||
|
# -*- shell-script -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
PARAM_CONFIG_FILES="Makefile"
|
43
orte/mca/ras/alps/help-ras-alps.txt
Обычный файл
43
orte/mca/ras/alps/help-ras-alps.txt
Обычный файл
@ -0,0 +1,43 @@
|
|||||||
|
# -*- text -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
# This is the US/English help file for Open MPI MCA error messages.
|
||||||
|
#
|
||||||
|
[alps-env-var-not-found]
|
||||||
|
While trying to determine what resources are available, the ALPS
|
||||||
|
resource allocator expects to find the following environment variables:
|
||||||
|
|
||||||
|
BATCH_PARTITION_ID
|
||||||
|
|
||||||
|
However, it was unable to find the following environment variable:
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
#This is a fatal error.
|
||||||
|
[alps-env-var-bad-value]
|
||||||
|
While trying to determine what resources are available, the ALPS
|
||||||
|
resource allocator uses the following environment variables:
|
||||||
|
|
||||||
|
ALPS_NODELIST value: %s
|
||||||
|
ALPS_TASKS_PER_NODE value: %s
|
||||||
|
|
||||||
|
However, an error was encountered when trying to parse the following variable:
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
#This is a fatal error.
|
41
orte/mca/ras/alps/ras_alps.h
Обычный файл
41
orte/mca/ras/alps/ras_alps.h
Обычный файл
@ -0,0 +1,41 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
/**
|
||||||
|
* @file
|
||||||
|
*
|
||||||
|
* Resource Allocation (ALPS)
|
||||||
|
*/
|
||||||
|
#ifndef ORTE_RAS_ALPS_H
|
||||||
|
#define ORTE_RAS_ALPS_H
|
||||||
|
|
||||||
|
#include "orte/mca/ras/ras.h"
|
||||||
|
#include "orte/mca/ras/base/base.h"
|
||||||
|
|
||||||
|
#if defined(c_plusplus) || defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ORTE_DECLSPEC extern orte_ras_base_component_t mca_ras_alps_component;
|
||||||
|
ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_alps_module;
|
||||||
|
|
||||||
|
#if defined(c_plusplus) || defined(__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
109
orte/mca/ras/alps/ras_alps_component.c
Обычный файл
109
orte/mca/ras/alps/ras_alps_component.c
Обычный файл
@ -0,0 +1,109 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include "opal/util/output.h"
|
||||||
|
#include "opal/mca/base/base.h"
|
||||||
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
|
#include "orte/orte_constants.h"
|
||||||
|
#include "orte/util/proc_info.h"
|
||||||
|
#include "ras_alps.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local variables
|
||||||
|
*/
|
||||||
|
static int param_priority;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local functions
|
||||||
|
*/
|
||||||
|
static int ras_alps_open(void);
|
||||||
|
static orte_ras_base_module_t *ras_alps_init(int*);
|
||||||
|
|
||||||
|
|
||||||
|
orte_ras_base_component_t mca_ras_alps_component = {
|
||||||
|
/* First, the mca_base_component_t struct containing meta
|
||||||
|
information about the component itself */
|
||||||
|
|
||||||
|
{
|
||||||
|
/* Indicate that we are a ras v1.3.0 component (which also
|
||||||
|
implies a specific MCA version) */
|
||||||
|
|
||||||
|
ORTE_RAS_BASE_VERSION_1_3_0,
|
||||||
|
|
||||||
|
/* Component name and version */
|
||||||
|
|
||||||
|
"alps",
|
||||||
|
ORTE_MAJOR_VERSION,
|
||||||
|
ORTE_MINOR_VERSION,
|
||||||
|
ORTE_RELEASE_VERSION,
|
||||||
|
|
||||||
|
/* Component open and close functions */
|
||||||
|
|
||||||
|
ras_alps_open,
|
||||||
|
NULL
|
||||||
|
},
|
||||||
|
|
||||||
|
/* Next the MCA v1.0.0 component meta data */
|
||||||
|
{
|
||||||
|
/* The component is checkpoint ready */
|
||||||
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||||
|
},
|
||||||
|
|
||||||
|
ras_alps_init
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
static int ras_alps_open(void)
|
||||||
|
{
|
||||||
|
param_priority =
|
||||||
|
mca_base_param_reg_int(&mca_ras_alps_component.ras_version,
|
||||||
|
"priority",
|
||||||
|
"Priority of the alps ras component",
|
||||||
|
false, false, 75, NULL);
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static orte_ras_base_module_t *ras_alps_init(int* priority)
|
||||||
|
{
|
||||||
|
/* if we are not an HNP, then we must not be selected */
|
||||||
|
if (!orte_process_info.seed) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Are we running under a ALPS job? */
|
||||||
|
|
||||||
|
if (NULL != getenv("BATCH_PARTITION_ID")) {
|
||||||
|
mca_base_param_lookup_int(param_priority, priority);
|
||||||
|
opal_output(orte_ras_base.ras_output,
|
||||||
|
"ras:alps: available for selection");
|
||||||
|
return &orte_ras_alps_module;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Sadly, no */
|
||||||
|
|
||||||
|
opal_output(orte_ras_base.ras_output,
|
||||||
|
"ras:alps: NOT available for selection");
|
||||||
|
return NULL;
|
||||||
|
}
|
140
orte/mca/ras/alps/ras_alps_module.c
Обычный файл
140
orte/mca/ras/alps/ras_alps_module.c
Обычный файл
@ -0,0 +1,140 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
#include "orte_config.h"
|
||||||
|
#include "orte/orte_constants.h"
|
||||||
|
#include "orte/orte_types.h"
|
||||||
|
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <ctype.h>
|
||||||
|
|
||||||
|
#include "opal/util/argv.h"
|
||||||
|
#include "opal/util/output.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
|
#include "orte/dss/dss.h"
|
||||||
|
#include "orte/mca/rmgr/rmgr.h"
|
||||||
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
|
|
||||||
|
#include "orte/mca/ras/base/ras_private.h"
|
||||||
|
#include "ras_alps.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local functions
|
||||||
|
*/
|
||||||
|
static int orte_ras_alps_allocate(orte_jobid_t jobid, opal_list_t *attributes);
|
||||||
|
static int orte_ras_alps_deallocate(orte_jobid_t jobid);
|
||||||
|
static int orte_ras_alps_finalize(void);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Global variable
|
||||||
|
*/
|
||||||
|
orte_ras_base_module_t orte_ras_alps_module = {
|
||||||
|
orte_ras_alps_allocate,
|
||||||
|
orte_ras_base_node_insert,
|
||||||
|
orte_ras_base_node_query,
|
||||||
|
orte_ras_base_node_query_alloc,
|
||||||
|
orte_ras_base_node_lookup,
|
||||||
|
orte_ras_base_proc_query_alloc,
|
||||||
|
orte_ras_alps_deallocate,
|
||||||
|
orte_ras_alps_finalize
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Discover available (pre-allocated) nodes. Allocate the
|
||||||
|
* requested number of nodes/process slots to the job.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
static int orte_ras_alps_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
char *alps_batch_id;
|
||||||
|
opal_list_t nodes;
|
||||||
|
opal_list_item_t* item;
|
||||||
|
|
||||||
|
char *alps_node_cmd_str = "apstat -a `apstat -r | grep $BATCH_PARTITION_ID | awk '{print $2}'` "
|
||||||
|
" -r -v | egrep \"(nid [0-9]+)\" -o | awk '{print $2}' > ./ompi_ras_alps_node_file";
|
||||||
|
|
||||||
|
|
||||||
|
alps_batch_id = getenv("BATCH_PARTITION_ID");
|
||||||
|
if (NULL == alps_batch_id) {
|
||||||
|
opal_show_help("help-ras-alps.txt", "alps-env-var-not-found", 1,
|
||||||
|
"BATCH_PARTITION_ID");
|
||||||
|
return ORTE_ERR_NOT_FOUND;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(system(alps_node_cmd_str)) {
|
||||||
|
opal_output(0, "Error in orte_ras_alps_allocate: system call returned an error, for reference I tried to run: %s",
|
||||||
|
alps_node_cmd_str);
|
||||||
|
return ORTE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||||
|
|
||||||
|
if (ORTE_SUCCESS != (ret = orte_ras_base_read_nodename_file(&nodes, "./ompi_ras_alps_node_file"))) {
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = orte_ras_base_allocate_nodes(jobid, &nodes);
|
||||||
|
|
||||||
|
ret = orte_ras_base_node_insert(&nodes);
|
||||||
|
|
||||||
|
cleanup:
|
||||||
|
while (NULL != (item = opal_list_remove_first(&nodes))) {
|
||||||
|
OBJ_RELEASE(item);
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&nodes);
|
||||||
|
|
||||||
|
/* All done */
|
||||||
|
|
||||||
|
if (ORTE_SUCCESS == ret) {
|
||||||
|
opal_output(orte_ras_base.ras_output,
|
||||||
|
"ras:alps:allocate: success");
|
||||||
|
} else {
|
||||||
|
opal_output(orte_ras_base.ras_output,
|
||||||
|
"ras:alps:allocate: failure (base_allocate_nodes=%d)", ret);
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* There's really nothing to do here
|
||||||
|
*/
|
||||||
|
static int orte_ras_alps_deallocate(orte_jobid_t jobid)
|
||||||
|
{
|
||||||
|
opal_output(orte_ras_base.ras_output,
|
||||||
|
"ras:alps:deallocate: success (nothing to do)");
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* There's really nothing to do here
|
||||||
|
*/
|
||||||
|
static int orte_ras_alps_finalize(void)
|
||||||
|
{
|
||||||
|
opal_output(orte_ras_base.ras_output,
|
||||||
|
"ras:alps:finalize: success (nothing to do)");
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
@ -9,6 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2008 UT-Battelle, LLC
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -118,3 +119,65 @@ CLEANUP:
|
|||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define RAS_BASE_FILE_MAX_LINE_LENGTH 512

/*
 * Read the next line from fp and return it as a freshly allocated string
 * with the trailing newline (if any) removed.
 *
 * At most RAS_BASE_FILE_MAX_LINE_LENGTH-1 characters are consumed per
 * call, so a longer line is returned in several pieces.  Only an actual
 * newline is stripped: a final line without one, or a truncated piece of
 * a long line, is returned intact.
 *
 * Returns NULL at end of file, on a read error, or if memory cannot be
 * allocated.  The caller owns the returned buffer and must free() it.
 */
static char *ras_getline(FILE *fp)
{
    char input[RAS_BASE_FILE_MAX_LINE_LENGTH];
    char *line;
    size_t len;

    if (NULL == fgets(input, RAS_BASE_FILE_MAX_LINE_LENGTH, fp)) {
        return NULL;
    }

    /* length up to, but not including, the newline (or the whole string
       when there is none) - never indexes before the buffer */
    len = strcspn(input, "\n");

    line = malloc(len + 1);
    if (NULL == line) {
        return NULL;
    }
    memcpy(line, input, len);
    line[len] = '\0';

    return line;
}
|
||||||
|
|
||||||
|
/*
 * Build a node list from a text file containing one host name per line.
 *
 * Each new name appends an orte_ras_node_t to `nodes` with one slot and a
 * launch_id that counts up from 0.  A line equal to the immediately
 * preceding name does not create a node; it bumps that node's slot count
 * instead (only adjacent repeats are merged).  Blank lines are ignored.
 *
 * @param nodes     list that receives the new node objects
 * @param filename  path of the file to read
 * @return ORTE_SUCCESS, ORTE_ERR_FILE_OPEN_FAILURE if the file cannot be
 *         opened, or ORTE_ERR_OUT_OF_RESOURCE if a node object cannot be
 *         created (nodes appended so far stay on the list).
 */
int orte_ras_base_read_nodename_file(opal_list_t *nodes, char *filename)
{
    FILE *fp;
    int32_t nodeid = 0;
    int rc = ORTE_SUCCESS;
    orte_ras_node_t *node = NULL;
    char *hostname;

    fp = fopen(filename, "r");
    if (NULL == fp) {
        ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
        return ORTE_ERR_FILE_OPEN_FAILURE;
    }

    while (NULL != (hostname = ras_getline(fp))) {
        /* an empty line carries no host name - skip it */
        if ('\0' == hostname[0]) {
            free(hostname);
            continue;
        }

        opal_output(orte_ras_base.ras_output,
                    "ras:base:read_nodename: got hostname %s", hostname);

        /* if this matches the prior nodename, then just add
         * to the slot count
         */
        if (NULL != node &&
            0 == strcmp(node->node_name, hostname)) {
            ++node->node_slots;
            /* free the hostname that came back since we don't need it */
            free(hostname);
            continue;
        }

        /* must be a new name, so add a new item to the list */
        opal_output(orte_ras_base.ras_output,
                    "ras:base:read_nodename: not found -- added to list");
        node = OBJ_NEW(orte_ras_node_t);
        if (NULL == node) {
            free(hostname);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            break;
        }
        node->node_name = hostname;   /* the node now owns the string */
        node->launch_id = nodeid;
        node->node_slots_inuse = 0;
        node->node_slots_max = 0;
        node->node_slots = 1;
        opal_list_append(nodes, &node->super);
        /* up the nodeid */
        nodeid++;
    }
    fclose(fp);

    return rc;
}
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2008 UT-Battelle, LLC
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -86,6 +87,8 @@ ORTE_DECLSPEC int orte_ras_base_set_oversubscribe_override(orte_jobid_t job);
|
|||||||
|
|
||||||
ORTE_DECLSPEC int orte_ras_base_get_oversubscribe_override(orte_jobid_t job, bool *flag);
|
ORTE_DECLSPEC int orte_ras_base_get_oversubscribe_override(orte_jobid_t job, bool *flag);
|
||||||
|
|
||||||
|
ORTE_DECLSPEC int orte_ras_base_read_nodename_file(opal_list_t *nodes, char *filename);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Query the registry for all available nodes
|
* Query the registry for all available nodes
|
||||||
*/
|
*/
|
||||||
|
51
orte/mca/sds/alps/Makefile.am
Обычный файл
51
orte/mca/sds/alps/Makefile.am
Обычный файл
@ -0,0 +1,51 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
# Use the top-level Makefile.options
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
sources = \
|
||||||
|
sds_alps.h \
|
||||||
|
sds_alps_component.c \
|
||||||
|
sds_alps_module.c
|
||||||
|
|
||||||
|
# Make the output library in this directory, and name it either
|
||||||
|
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||||
|
# (for static builds).
|
||||||
|
|
||||||
|
if OMPI_BUILD_sds_alps_DSO
|
||||||
|
component_noinst =
|
||||||
|
component_install = mca_sds_alps.la
|
||||||
|
else
|
||||||
|
component_noinst = libmca_sds_alps.la
|
||||||
|
component_install =
|
||||||
|
endif
|
||||||
|
|
||||||
|
mcacomponentdir = $(pkglibdir)
|
||||||
|
mcacomponent_LTLIBRARIES = $(component_install)
|
||||||
|
mca_sds_alps_la_SOURCES = $(sources)
|
||||||
|
mca_sds_alps_la_LDFLAGS = -module -avoid-version
|
||||||
|
mca_sds_alps_la_LIBADD = \
|
||||||
|
$(top_ompi_builddir)/orte/libopen-rte.la \
|
||||||
|
$(top_ompi_builddir)/opal/libopen-pal.la
|
||||||
|
|
||||||
|
noinst_LTLIBRARIES = $(component_noinst)
|
||||||
|
libmca_sds_alps_la_SOURCES =$(sources)
|
||||||
|
libmca_sds_alps_la_LDFLAGS = -module -avoid-version
|
27
orte/mca/sds/alps/configure.m4
Обычный файл
27
orte/mca/sds/alps/configure.m4
Обычный файл
@ -0,0 +1,27 @@
|
|||||||
|
# -*- shell-script -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
# MCA_sds_alps_CONFIG([action-if-found], [action-if-not-found])
|
||||||
|
# -----------------------------------------------------------
|
||||||
|
AC_DEFUN([MCA_sds_alps_CONFIG],[
|
||||||
|
OMPI_CHECK_ALPS([sds_alps],
|
||||||
|
[AC_CHECK_FUNC([cnos_get_rank], [$1], [$2])],
|
||||||
|
[$2])
|
||||||
|
])dnl
|
23
orte/mca/sds/alps/configure.params
Обычный файл
23
orte/mca/sds/alps/configure.params
Обычный файл
@ -0,0 +1,23 @@
|
|||||||
|
# -*- shell-script -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
PARAM_CONFIG_FILES="Makefile"
|
50
orte/mca/sds/alps/sds_alps.h
Обычный файл
50
orte/mca/sds/alps/sds_alps.h
Обычный файл
@ -0,0 +1,50 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef ORTE_SDS_ALPS_H
|
||||||
|
#define ORTE_SDS_ALPS_H
|
||||||
|
|
||||||
|
#if defined(c_plusplus) || defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Module open / close
|
||||||
|
*/
|
||||||
|
int orte_sds_alps_component_open(void);
|
||||||
|
int orte_sds_alps_component_close(void);
|
||||||
|
orte_sds_base_module_t* orte_sds_alps_component_init(int *priority);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Startup / Shutdown
|
||||||
|
*/
|
||||||
|
int orte_sds_alps_finalize(void);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Module functions
|
||||||
|
*/
|
||||||
|
int orte_sds_alps_set_name(void);
|
||||||
|
int orte_sds_alps_contact_universe(void);
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(c_plusplus) || defined(__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* ORTE_SDS_ALPS_H */
|
106
orte/mca/sds/alps/sds_alps_component.c
Обычный файл
106
orte/mca/sds/alps/sds_alps_component.c
Обычный файл
@ -0,0 +1,106 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*
|
||||||
|
* These symbols are in a file by themselves to provide nice linker
|
||||||
|
* semantics. Since linkers generally pull in symbols by object
|
||||||
|
* files, keeping these symbols as the only symbols in this file
|
||||||
|
* prevents utility programs such as "ompi_info" from having to import
|
||||||
|
* entire components just to query their version and parameters.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include "orte/orte_constants.h"
|
||||||
|
#include "orte/mca/sds/sds.h"
|
||||||
|
#include "orte/mca/sds/alps/sds_alps.h"
|
||||||
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
|
|
||||||
|
extern orte_sds_base_module_t orte_sds_alps_module;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Instantiate the public struct with all of our public information
|
||||||
|
* and pointers to our public functions in it
|
||||||
|
*/
|
||||||
|
orte_sds_base_component_t mca_sds_alps_component = {
|
||||||
|
/* First, the mca_component_t struct containing meta information
|
||||||
|
about the component itself */
|
||||||
|
{
|
||||||
|
/* Indicate that we are a sds v1.0.0 component (which also
|
||||||
|
implies a specific MCA version) */
|
||||||
|
ORTE_SDS_BASE_VERSION_1_0_0,
|
||||||
|
|
||||||
|
/* Component name and version */
|
||||||
|
"alps",
|
||||||
|
ORTE_MAJOR_VERSION,
|
||||||
|
ORTE_MINOR_VERSION,
|
||||||
|
ORTE_RELEASE_VERSION,
|
||||||
|
|
||||||
|
/* Component open and close functions */
|
||||||
|
orte_sds_alps_component_open,
|
||||||
|
orte_sds_alps_component_close
|
||||||
|
},
|
||||||
|
|
||||||
|
/* Next the MCA v1.0.0 component meta data */
|
||||||
|
{
|
||||||
|
/* The component is not checkpoint ready */
|
||||||
|
MCA_BASE_METADATA_PARAM_NONE
|
||||||
|
},
|
||||||
|
|
||||||
|
/* Initialization / querying functions */
|
||||||
|
orte_sds_alps_component_init
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
orte_sds_alps_component_open(void)
|
||||||
|
{
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
orte_sds_base_module_t *
|
||||||
|
orte_sds_alps_component_init(int *priority)
|
||||||
|
{
|
||||||
|
int id;
|
||||||
|
char *mode;
|
||||||
|
|
||||||
|
/* okay, not seed/singleton attempt another approach */
|
||||||
|
id = mca_base_param_register_string("ns", "nds", NULL, NULL, NULL);
|
||||||
|
mca_base_param_lookup_string(id, &mode);
|
||||||
|
|
||||||
|
if (NULL == mode || 0 != strcmp("alps", mode)) {
|
||||||
|
if (NULL != mode) {
|
||||||
|
free(mode);
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
if (NULL != mode) {
|
||||||
|
free(mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
*priority = 35;
|
||||||
|
return &orte_sds_alps_module;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
orte_sds_alps_component_close(void)
|
||||||
|
{
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
84
orte/mca/sds/alps/sds_alps_module.c
Обычный файл
84
orte/mca/sds/alps/sds_alps_module.c
Обычный файл
@ -0,0 +1,84 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2008 UT-Battelle, LLC
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include <catamount/cnos_mpi_os.h>
|
||||||
|
|
||||||
|
#include "orte/orte_constants.h"
|
||||||
|
#include "orte/mca/sds/sds.h"
|
||||||
|
#include "orte/mca/sds/base/base.h"
|
||||||
|
#include "orte/mca/sds/alps/sds_alps.h"
|
||||||
|
#include "orte/util/proc_info.h"
|
||||||
|
#include "orte/mca/ns/base/base.h"
|
||||||
|
#include "orte/mca/errmgr/base/base.h"
|
||||||
|
|
||||||
|
orte_sds_base_module_t orte_sds_alps_module = {
|
||||||
|
orte_sds_base_basic_contact_universe,
|
||||||
|
orte_sds_alps_set_name,
|
||||||
|
orte_sds_alps_finalize,
|
||||||
|
};
|
||||||
|
|
||||||
|
int
|
||||||
|
orte_sds_alps_set_name(void)
|
||||||
|
{
|
||||||
|
int rc;
|
||||||
|
orte_jobid_t jobid;
|
||||||
|
orte_vpid_t vpid;
|
||||||
|
|
||||||
|
if(orte_process_info.seed) {
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_ns.create_my_name())) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
orte_process_info.num_procs = 1;
|
||||||
|
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get our process information
|
||||||
|
*
|
||||||
|
* we're going to make up the jobid. find our vpid,
|
||||||
|
* assuming range starts at 0
|
||||||
|
*/
|
||||||
|
jobid = 0; /* not 0, since it has special meaning */
|
||||||
|
|
||||||
|
|
||||||
|
vpid = (orte_vpid_t) cnos_get_rank() + 1;
|
||||||
|
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&(orte_process_info.my_name),
|
||||||
|
jobid,
|
||||||
|
vpid))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
orte_process_info.num_procs = (orte_std_cntr_t) cnos_get_size();
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
orte_sds_alps_finalize(void)
|
||||||
|
{
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
@ -9,6 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2008 UT-Battelle, LLC
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -89,7 +90,7 @@ orte_sds_cnos_component_init(int *priority)
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
*priority = 60;
|
*priority = 30;
|
||||||
return &orte_sds_cnos_module;
|
return &orte_sds_cnos_module;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user