1
1

Remove the fddp and sensor frameworks - relocated to new cluster mgr project

This commit was SVN r22240.
Этот коммит содержится в:
Ralph Castain 2009-11-27 22:14:47 +00:00
родитель 7cf427c39b
Коммит e38a0eab9f
44 изменённых файлов: 0 добавлений и 2484 удалений

Просмотреть файл

@ -20,7 +20,6 @@ enable_ft_thread=no
enable_per_user_config_files=no
enable_script_wrapper_compilers=yes
enable_multicast=yes
enable_monitoring=yes
enable_orterun_prefix_by_default=yes
enable_io_romio=no
#enable_mca_direct=ras-cm,rmaps-resilient,routed-cm

Просмотреть файл

@ -1,4 +1,3 @@
enable_monitoring=yes
enable_multicast=yes
with_memory_manager=no
enable_mem_debug=yes

Просмотреть файл

@ -1,4 +1,3 @@
enable_monitoring=yes
enable_multicast=yes
with_memory_manager=no
enable_mem_debug=no

Просмотреть файл

@ -1,23 +0,0 @@
enable_monitoring=yes
enable_multicast=yes
with_memory_manager=no
enable_mem_debug=yes
enable_mem_profile=no
enable_debug_symbols=yes
enable_binaries=yes
with_devel_headers=yes
enable_heterogeneous=no
enable_picky=yes
enable_debug=yes
enable_shared=yes
enable_static=no
enable_contrib_no_build=libnbc,vt
with_xgrid=no
enable_io_romio=no
enable_ipv6=no
enable_mpi_f77=no
enable_mpi_f90=no
enable_mpi_cxx=yes
enable_mpi_cxx_seek=yes
enable_memchecker=no
enable_mca_no_build=carto,crs,memchecker,rmaps-load_balance,rmaps-round_robin,rmaps-seq,rmaps-topo,rmaps-rank_file,filem,plm-slurm,plm-xgrid,snapc,grpcomm-basic,grpcomm-hier,pml-dr,pml-crcp2,pml-cm,crcp,pml-v

Просмотреть файл

@ -1,63 +0,0 @@
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the default system-wide MCA parameters defaults file.
# Specifically, the MCA parameter "mca_param_files" defaults to a
# value of
# "$HOME/.openmpi/mca-params.conf:$sysconf/openmpi-mca-params.conf"
# (this file is the latter of the two). So if the default value of
# mca_param_files is not changed, this file is used to set system-wide
# MCA parameters. This file can therefore be used to set system-wide
# default MCA parameters for all users. Of course, users can override
# these values if they want, but this file is an excellent location
# for setting system-specific MCA parameters for those users who don't
# know / care enough to investigate the proper values for them.
# Note that this file is only applicable where it is visible (in a
# filesystem sense). Specifically, MPI processes each read this file
# during their startup to determine what default values for MCA
# parameters should be used. mpirun does not bundle up the values in
# this file from the node where it was run and send them to all nodes;
# the default value decisions are effectively distributed. Hence,
# these values are only applicable on nodes that "see" this file. If
# $sysconf is a directory on a local disk, it is likely that changes
# to this file will need to be propagated to other nodes. If $sysconf
# is a directory that is shared via a networked filesystem, changes to
# this file will be visible to all nodes that share this $sysconf.
# The format is straightforward: one per line, mca_param_name =
# rvalue. Quoting is ignored (so if you use quotes or escape
# characters, they'll be included as part of the value). For example:
# Disable run-time MPI parameter checking
# mpi_param_check = 0
# Note that the value "~/" will be expanded to the current user's home
# directory. For example:
# Change component loading path
# component_path = /usr/local/lib/openmpi:~/my_openmpi_components
# See "ompi_info --param all all" for a full listing of Open MPI MCA
# parameters available and their default values.
#
# Basic behavior to smooth startup
mca_component_show_load_errors = 0
orte_abort_timeout = 10
# ORTE behavior
plm = rsh
## Add the interface for out-of-band communication
## and set it up
oob_tcp_listen_mode = listen_thread
oob_tcp_sndbuf = 32768
oob_tcp_rcvbuf = 32768

Просмотреть файл

@ -77,24 +77,6 @@ AC_DEFINE_UNQUOTED([ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT],
[$orte_want_orterun_prefix_by_default],
[Whether we want orterun to effect "--prefix $prefix" by default])
#
# Do we want monitoring enabled?
#
AC_MSG_CHECKING([if want state-of-health monitoring])
AC_ARG_ENABLE([monitoring],
[AC_HELP_STRING([--enable-monitoring],
[Enable monitoring of process and hardware state-of-health (default: disabled)])])
if test "$enable_monitoring" = "yes"; then
AC_MSG_RESULT([yes])
orte_want_monitoring=1
else
AC_MSG_RESULT([no])
orte_want_monitoring=0
fi
AC_DEFINE_UNQUOTED([ORTE_ENABLE_MONITORING], [$orte_want_monitoring],
[Enable state-of-health monitoring of processes and hardware])
#
# Do we want reliable multicast enabled?
#

Просмотреть файл

@ -57,10 +57,6 @@
#include "orte/util/regex.h"
#include "orte/util/show_help.h"
#include "orte/mca/notifier/base/base.h"
#if ORTE_ENABLE_MONITORING
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/fddp/base/base.h"
#endif
#if ORTE_ENABLE_MULTICAST
#include "orte/mca/rmcast/base/base.h"
#endif
@ -375,32 +371,6 @@ int orte_ess_base_orted_setup(char **hosts)
goto error;
}
#if ORTE_ENABLE_MONITORING
/* setup the sensors */
if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_sensor_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_sensor_select";
goto error;
}
/* setup the fddp */
if (ORTE_SUCCESS != (ret = orte_fddp_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_sensor_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_fddp_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_sensor_select";
goto error;
}
#endif
return ORTE_SUCCESS;
error:
@ -428,13 +398,6 @@ int orte_ess_base_orted_finalize(void)
orte_grpcomm.onesided_barrier();
}
#if ORTE_ENABLE_MONITORING
/* finalize the sensors */
orte_sensor_base_close();
/* finalize the fddp */
orte_fddp_base_close();
#endif
orte_notifier_base_close();
orte_cr_finalize();

Просмотреть файл

@ -52,10 +52,6 @@
#include "orte/mca/plm/base/base.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/notifier/base/base.h"
#if ORTE_ENABLE_MONITORING
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/fddp/base/base.h"
#endif
#if ORTE_ENABLE_MULTICAST
#include "orte/mca/rmcast/base/base.h"
#endif
@ -525,33 +521,7 @@ static int rte_init(void)
goto error;
}
}
#if ORTE_ENABLE_MONITORING
/* setup the sensors */
if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_sensor_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_sensor_select";
goto error;
}
/* setup the fddp */
if (ORTE_SUCCESS != (ret = orte_fddp_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_sensor_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_fddp_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_sensor_select";
goto error;
}
#endif
/* We actually do *not* want an HNP to voluntarily yield() the
processor more than necessary. Orterun already blocks when
it is doing nothing, so it doesn't use any more CPU cycles than
@ -602,13 +572,6 @@ static int rte_finalize(void)
unlink(contact_path);
free(contact_path);
#if ORTE_ENABLE_MONITORING
/* finalize the sensors */
orte_sensor_base_close();
/* finalize the fddp */
orte_fddp_base_close();
#endif
orte_notifier_base_close();
orte_cr_finalize();

Просмотреть файл

@ -1,34 +0,0 @@
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# main library setup
noinst_LTLIBRARIES = libmca_fddp.la
libmca_fddp_la_SOURCES =
# header setup
nobase_orte_HEADERS =
# local files
headers = fddp.h
libmca_fddp_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
nobase_orte_HEADERS += $(headers)
ortedir = $(includedir)/openmpi/orte/mca/fddp
else
ortedir = $(includedir)
endif
include base/Makefile.am
distclean-local:
rm -f base/static-components.h

Просмотреть файл

@ -1,17 +0,0 @@
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
headers += \
base/base.h
libmca_fddp_la_SOURCES += \
base/fddp_base_close.c \
base/fddp_base_select.c \
base/fddp_base_open.c

Просмотреть файл

@ -1,55 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_FDDP_BASE_H
#define MCA_FDDP_BASE_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "opal/mca/mca.h"
#include "orte/mca/fddp/fddp.h"
/*
* Global functions for MCA overall collective open and close
*/
BEGIN_C_DECLS
/*
* function definitions
*/
ORTE_DECLSPEC int orte_fddp_base_open(void);
ORTE_DECLSPEC int orte_fddp_base_select(void);
ORTE_DECLSPEC int orte_fddp_base_close(void);
/*
 * Framework-wide state shared between the fddp base functions.
 *
 * orte_fddp_base_output               - output stream id used for the
 *                                       framework's verbose/debug output
 * mca_fddp_base_selected              - NOTE(review): declared here, but no
 *                                       definition is visible next to the
 *                                       other globals in fddp_base_open.c;
 *                                       confirm it is defined before use
 * mca_fddp_base_components_available  - list of components opened by the
 *                                       framework
 * mca_fddp_base_selected_component    - storage for the selected component;
 *                                       NOTE(review): the visible select
 *                                       code never assigns it
 */
ORTE_DECLSPEC extern int orte_fddp_base_output;
ORTE_DECLSPEC extern bool mca_fddp_base_selected;
ORTE_DECLSPEC extern opal_list_t mca_fddp_base_components_available;
ORTE_DECLSPEC extern orte_fddp_base_component_t mca_fddp_base_selected_component;
#if !ORTE_DISABLE_FULL_SUPPORT
/* no base functions to protect at this time */
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS
#endif

Просмотреть файл

@ -1,38 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/mca/fddp/base/base.h"
/**
 * Shut down the fddp framework.
 *
 * Invokes the finalize hook of the module currently stored in
 * orte_fddp (when one is set) and then closes every component that is
 * still on the framework's "available" list.
 *
 * @return ORTE_SUCCESS unconditionally; the finalize hook's return
 *         value is not propagated.
 */
int orte_fddp_base_close(void)
{
    /* Let the active module clean up after itself, if it knows how */
    if (orte_fddp.finalize != NULL) {
        orte_fddp.finalize();
    }

    /* Release whatever components remain open (a single one for an
       OpenRTE program, possibly several when run from ompi_info) */
    mca_base_components_close(orte_fddp_base_output,
                              &mca_fddp_base_components_available,
                              NULL);

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,66 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/mca/fddp/base/base.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "orte/mca/fddp/base/static-components.h"
/*
 * Global variables
 *
 * orte_fddp_base_output              - output stream id; -1 until
 *                                      orte_fddp_base_open() opens a stream
 * orte_fddp                          - function table of the active module;
 *                                      has static storage, so every pointer
 *                                      starts out NULL
 * mca_fddp_base_components_available - filled in by orte_fddp_base_open()
 * mca_fddp_base_selected_component   - NOTE(review): never assigned by the
 *                                      code visible in this framework
 */
int orte_fddp_base_output = -1;
orte_fddp_base_module_t orte_fddp;
opal_list_t mca_fddp_base_components_available;
orte_fddp_base_component_t mca_fddp_base_selected_component;
/**
 * Open the fddp framework.
 *
 * Opens the framework's output stream and then asks the MCA base to
 * open the fddp components (static list plus whatever the MCA base
 * locates), collecting them on mca_fddp_base_components_available.
 *
 * @return ORTE_SUCCESS when the components were opened, ORTE_ERROR
 *         otherwise.
 */
int orte_fddp_base_open(void)
{
    int rc;

    /* The stream is always opened; its verbosity is controlled by the
       MCA open machinery */
    orte_fddp_base_output = opal_output_open(NULL);

    rc = mca_base_components_open("fddp", orte_fddp_base_output,
                                  mca_fddp_base_static_components,
                                  &mca_fddp_base_components_available, true);

    /* Any failure is collapsed into the generic error code */
    return (ORTE_SUCCESS == rc) ? ORTE_SUCCESS : ORTE_ERROR;
}

Просмотреть файл

@ -1,78 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/mca/fddp/base/base.h"
/**
* Function for selecting one component from all those that are
* available.
*/
int orte_fddp_base_select(void)
{
int ret, exit_status = ORTE_SUCCESS;
orte_fddp_base_component_t *best_component = NULL;
orte_fddp_base_module_t *best_module = NULL;
char *include_list = NULL;
/*
* Register the framework MCA param and look up include list
*/
mca_base_param_reg_string_name("fddp", NULL,
"Which fddp component to use (empty = none)",
false, false,
NULL, &include_list);
/* If we do not have any components to select this is ok. Just use the default
* "no-op" component and move on.
*/
if( 0 >= opal_list_get_size(&mca_fddp_base_components_available) || NULL == include_list) {
/* Close all components since none will be used */
mca_base_components_close(0, /* Pass 0 to keep this from closing the output handle */
&mca_fddp_base_components_available,
NULL);
goto cleanup;
}
/*
* Select the best component
*/
if( ORTE_SUCCESS != mca_base_select("fddp", orte_fddp_base_output,
&mca_fddp_base_components_available,
(mca_base_module_t **) &best_module,
(mca_base_component_t **) &best_component) ) {
/* It is okay if no component was selected - we just leave
* the orte_fddp module as the default
*/
exit_status = ORTE_SUCCESS;
goto cleanup;
}
if (NULL != orte_fddp.init) {
/* if an init function is provided, use it */
if (ORTE_SUCCESS != (ret = orte_fddp.init()) ) {
exit_status = ret;
goto cleanup;
}
}
/* Save the winner */
orte_fddp = *best_module;
cleanup:
return exit_status;
}

Просмотреть файл

@ -1,81 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* @file:
*
*/
#ifndef MCA_FDDP_H
#define MCA_FDDP_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/mca/mca.h"
#include "orte/mca/sensor/sensor_types.h"
BEGIN_C_DECLS
/*
 * Module function types.
 *
 * The original note states that all of these MUST be provided by a
 * module. NOTE(review): the base select/close code nevertheless
 * NULL-checks init and finalize before calling them.
 */
/* initialize the selected module; returns an ORTE status code */
typedef int (*orte_fddp_base_module_init_fn_t)(void);
/* finalize the selected module; returns an ORTE status code */
typedef int (*orte_fddp_base_module_finalize_fn_t)(void);
/* process sensor data; presumably writes a failure-likelihood estimate
 * for each of num_bins bins through failure_likelihood -- TODO confirm
 * the exact contract, as the only visible implementation is a stub */
typedef int (*orte_fddp_base_module_process_fn_t)(orte_sensor_data_t *data,
int num_bins, uint8_t *failure_likelihood);
/*
 * Ver 1.0 of the module function table: one entry per function type
 * above. The un-versioned alias orte_fddp_base_module_t is what the
 * rest of the code refers to.
 */
struct orte_fddp_base_module_1_0_0_t {
orte_fddp_base_module_init_fn_t init;
orte_fddp_base_module_finalize_fn_t finalize;
orte_fddp_base_module_process_fn_t process;
};
typedef struct orte_fddp_base_module_1_0_0_t orte_fddp_base_module_1_0_0_t;
typedef orte_fddp_base_module_1_0_0_t orte_fddp_base_module_t;
/*
 * The standard component data structure (Ver 1.0).
 *
 * Holds only the two generic MCA pieces: the component descriptor
 * (base_version) and the component metadata (base_data). Individual
 * components embed this struct as their first member and append their
 * own fields. The un-versioned alias orte_fddp_base_component_t is
 * what the rest of the code refers to.
 */
struct orte_fddp_base_component_1_0_0_t {
mca_base_component_t base_version;
mca_base_component_data_t base_data;
};
typedef struct orte_fddp_base_component_1_0_0_t orte_fddp_base_component_1_0_0_t;
typedef orte_fddp_base_component_1_0_0_t orte_fddp_base_component_t;
/*
* Macro for use in components that are of type fddp v1.0.0
*/
#define ORTE_FDDP_BASE_VERSION_1_0_0 \
/* fddp v1.0 is chained to MCA v2.0 */ \
MCA_BASE_VERSION_2_0_0, \
/* fddp v1.0 */ \
"fddp", 1, 0, 0
/* Global structure for accessing fddp functions
*/
ORTE_DECLSPEC extern orte_fddp_base_module_t orte_fddp; /* holds selected module's function pointers */
END_C_DECLS
#endif /* MCA_FDDP_H */

Просмотреть файл

@ -1,35 +0,0 @@
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
sources = \
fddp_trend.c \
fddp_trend.h \
fddp_trend_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_fddp_trend_DSO
component_noinst =
component_install = mca_fddp_trend.la
else
component_noinst = libmca_fddp_trend.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_fddp_trend_la_SOURCES = $(sources)
mca_fddp_trend_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_fddp_trend_la_SOURCES =$(sources)
libmca_fddp_trend_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,14 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,69 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/sensor/sensor_types.h"
#include "fddp_trend.h"
static int init(void);
static int finalize(void);
static int process(orte_sensor_data_t *data, int num_bins, uint8_t *failure_likelihood);
/*
 * Function table exported by the "trend" fddp module. The initializer
 * is positional, so the entries must stay in the order of the
 * orte_fddp_base_module_t fields: init, finalize, process.
 */
orte_fddp_base_module_t orte_fddp_trend_module = {
init,
finalize,
process
};
/*
 * Module init hook: the trend module has nothing to set up, so this
 * only reports success.
 */
static int init(void)
{
return ORTE_SUCCESS;
}
/*
 * Module finalize hook: nothing was allocated by init, so this only
 * reports success.
 */
static int finalize(void)
{
return ORTE_SUCCESS;
}
/*
 * Module process hook -- NOT YET IMPLEMENTED.
 *
 * The body contains only the outline of the intended algorithm; none
 * of the arguments are read and failure_likelihood is never written.
 * Callers get ORTE_SUCCESS back with their output buffer untouched.
 */
static int process(orte_sensor_data_t *data, int num_bins, uint8_t *failure_likelihood)
{
/* Intended model: the failure likelihood is just the trended value of
 * the data itself, scaled appropriately
 */
/* Step 1 (todo): using the sliding window, compute the trend of the data */
/* Step 2 (todo): for each point in future time, compute the predicted
 * value of the sensor reading
 */
/* Step 3 (todo): scale it by the provided scaling factors */
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,35 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#ifndef ORTE_FDDP_TREND_H
#define ORTE_FDDP_TREND_H
#include "orte_config.h"
#include "orte/mca/fddp/fddp.h"
BEGIN_C_DECLS
/*
 * Component structure of the "trend" fddp component: the generic fddp
 * component, extended with the component's one tunable.
 */
struct orte_fddp_trend_component_t {
/* generic fddp component; kept first so the struct can be handled
 * through a pointer to its base type */
orte_fddp_base_component_t super;
/* size of the sliding window; filled in from the "window_size" MCA
 * parameter when the component is opened */
int window_size;
};
typedef struct orte_fddp_trend_component_t orte_fddp_trend_component_t;
ORTE_MODULE_DECLSPEC extern orte_fddp_trend_component_t mca_fddp_trend_component;
extern orte_fddp_base_module_t orte_fddp_trend_module;
END_C_DECLS
#endif

Просмотреть файл

@ -1,92 +0,0 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "fddp_trend.h"
/*
* Local functions
*/
static int orte_fddp_trend_open(void);
static int orte_fddp_trend_close(void);
static int orte_fddp_trend_query(mca_base_module_t **module, int *priority);
/*
 * The one public instance of the "trend" component. The nested braces
 * mirror the struct nesting: outer = orte_fddp_trend_component_t,
 * middle = its "super" member (orte_fddp_base_component_t), inner =
 * the MCA component descriptor followed by the MCA metadata.
 *
 * window_size has no initializer here and therefore starts at zero;
 * orte_fddp_trend_open() passes its address to the parameter
 * registration call as the storage for the "window_size" value.
 */
orte_fddp_trend_component_t mca_fddp_trend_component = {
{
{
ORTE_FDDP_BASE_VERSION_1_0_0,
"trend", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_fddp_trend_open, /* component open */
orte_fddp_trend_close, /* component close */
orte_fddp_trend_query /* component query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
}
};
/**
 * Component open function.
 *
 * Registers the component's single MCA parameter, "window_size", with
 * a default of 80 and stores the resulting value in
 * mca_fddp_trend_component.window_size.
 *
 * The help text previously advertised "[default: 1]" while the value
 * actually registered is 80; the text now matches the registered
 * default. NOTE(review): if 1 was the intended default, change the
 * value instead -- the runtime default is deliberately left untouched
 * here.
 *
 * @return ORTE_SUCCESS always; the registration result is not checked.
 */
static int orte_fddp_trend_open(void)
{
    mca_base_component_t *c = &mca_fddp_trend_component.super.base_version;

    /* lookup parameters */
    mca_base_param_reg_int(c, "window_size",
                           "Size of sliding window to smooth data for trend [default: 80]",
                           false, false, 80, &mca_fddp_trend_component.window_size);

    return ORTE_SUCCESS;
}
/**
 * Component query function.
 *
 * Always offers the trend module, at priority 0, so that it is only
 * chosen when explicitly requested.
 *
 * @param module    [out] receives the address of orte_fddp_trend_module
 * @param priority  [out] receives 0
 * @return ORTE_SUCCESS always
 */
static int orte_fddp_trend_query(mca_base_module_t **module, int *priority)
{
    /* hand back our (statically allocated) function table */
    *module = (mca_base_module_t *) &orte_fddp_trend_module;

    /* lowest priority: select only if specified */
    *priority = 0;

    return ORTE_SUCCESS;
}
/**
 * Component close function: the component holds no resources of its
 * own, so there is nothing to release and success is always reported.
 */
static int orte_fddp_trend_close(void)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,35 +0,0 @@
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# main library setup
noinst_LTLIBRARIES = libmca_sensor.la
libmca_sensor_la_SOURCES =
# header setup
nobase_orte_HEADERS =
# local files
headers = sensor.h \
sensor_types.h
libmca_sensor_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
nobase_orte_HEADERS += $(headers)
ortedir = $(includedir)/openmpi/orte/mca/sensor
else
ortedir = $(includedir)
endif
include base/Makefile.am
distclean-local:
rm -f base/static-components.h

Просмотреть файл

@ -1,27 +0,0 @@
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
headers += \
base/base.h
libmca_sensor_la_SOURCES += \
base/sensor_base_open.c
if !ORTE_DISABLE_FULL_SUPPORT
headers += \
base/sensor_private.h
libmca_sensor_la_SOURCES += \
base/sensor_base_close.c \
base/sensor_base_select.c \
base/sensor_base_scale.c
endif

Просмотреть файл

@ -1,62 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SENSOR_BASE_H
#define MCA_SENSOR_BASE_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "opal/mca/mca.h"
#include "orte/mca/sensor/sensor.h"
/*
* Global functions for MCA overall collective open and close
*/
BEGIN_C_DECLS
/*
* function definitions
*/
ORTE_DECLSPEC int orte_sensor_base_open(void);
ORTE_DECLSPEC int orte_sensor_base_select(void);
ORTE_DECLSPEC int orte_sensor_base_close(void);
/*
* globals that might be needed
*/
ORTE_DECLSPEC extern int orte_sensor_base_output;
ORTE_DECLSPEC extern opal_list_t mca_sensor_base_components_available;
ORTE_DECLSPEC extern opal_list_t orte_sensor_base_selected_modules;
/*
 * List item pairing a selected sensor component with the module it
 * returned. Instances live on orte_sensor_base_selected_modules; the
 * class destructor runs the module's finalize function, so releasing
 * an item also shuts its sensor down.
 */
typedef struct {
/* list linkage; kept first so the item can be cast to/from
 * opal_list_item_t */
opal_list_item_t super;
/* component that produced the module */
orte_sensor_base_component_t *component;
/* the module's function table */
orte_sensor_base_module_t *module;
} orte_sensor_base_selected_pair_t;
OBJ_CLASS_DECLARATION(orte_sensor_base_selected_pair_t);
#if !ORTE_DISABLE_FULL_SUPPORT
/* no base functions to protect at this time */
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS
#endif

Просмотреть файл

@ -1,39 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/mca/sensor/base/base.h"
/**
 * Shut down the sensor framework.
 *
 * Empties the list of selected modules, releasing each entry (the
 * entry's destructor is what finalizes the module), destructs the list
 * itself and finally closes the components that are still open.
 *
 * @return ORTE_SUCCESS always
 */
int orte_sensor_base_close(void)
{
    opal_list_item_t *entry;

    /* pop entries one at a time so each gets its destructor run */
    for (entry = opal_list_remove_first(&orte_sensor_base_selected_modules);
         NULL != entry;
         entry = opal_list_remove_first(&orte_sensor_base_selected_modules)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&orte_sensor_base_selected_modules);

    /* whatever components remain open are closed here */
    mca_base_components_close(orte_sensor_base_output,
                              &mca_sensor_base_components_available,
                              NULL);

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,125 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/mca/sensor/base/base.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "orte/mca/sensor/base/static-components.h"
/* object definition */
/*
 * Constructor for orte_sensor_base_selected_pair_t: start with no
 * component and no module attached; the selection code fills both in
 * after creating the object.
 */
static void construct(orte_sensor_base_selected_pair_t *obj)
{
obj->component = NULL;
obj->module = NULL;
}
/*
 * Destructor for orte_sensor_base_selected_pair_t: finalize the
 * attached module, if there is one and it provides a finalize hook.
 *
 * The constructor leaves obj->module NULL, so an object released
 * before a module was attached must not be dereferenced here.
 */
static void destruct(orte_sensor_base_selected_pair_t *obj)
{
    if (NULL != obj->module && NULL != obj->module->finalize) {
        obj->module->finalize();
    }
}
OBJ_CLASS_INSTANCE(orte_sensor_base_selected_pair_t,
opal_list_item_t,
construct, destruct);
/* base functions */
static void start(void);
static void stop(void);
/*
 * Global variables
 *
 * orte_sensor_base_output              - output stream id; -1 until
 *                                        orte_sensor_base_open() opens it
 * orte_sensor                          - public API table; its two
 *                                        entries are the base start/stop
 *                                        functions in this file, which fan
 *                                        the call out to every selected
 *                                        module
 * mca_sensor_base_components_available - components opened by
 *                                        orte_sensor_base_open()
 * orte_sensor_base_selected_modules    - list of selected
 *                                        component/module pairs;
 *                                        constructed in
 *                                        orte_sensor_base_open()
 */
int orte_sensor_base_output = -1;
orte_sensor_base_API_module_t orte_sensor = {
start,
stop
};
opal_list_t mca_sensor_base_components_available;
opal_list_t orte_sensor_base_selected_modules;
/**
 * Open the sensor framework.
 *
 * Opens the framework's output stream, constructs the (initially
 * empty) list of selected modules and asks the MCA base to open the
 * sensor components, collecting them on
 * mca_sensor_base_components_available.
 *
 * @return ORTE_SUCCESS when the components were opened, ORTE_ERROR
 *         otherwise.
 */
int orte_sensor_base_open(void)
{
    int rc;

    /* The stream is always opened; its verbosity is controlled by the
       MCA open machinery */
    orte_sensor_base_output = opal_output_open(NULL);

    /* the selection step will append to this list later on */
    OBJ_CONSTRUCT(&orte_sensor_base_selected_modules, opal_list_t);

    rc = mca_base_components_open("sensor", orte_sensor_base_output,
                                  mca_sensor_base_static_components,
                                  &mca_sensor_base_components_available, true);

    /* Any failure is collapsed into the generic error code */
    return (ORTE_SUCCESS == rc) ? ORTE_SUCCESS : ORTE_ERROR;
}
static void start(void)
{
orte_sensor_base_selected_pair_t *pair;
opal_list_item_t *item;
for (item = opal_list_get_first(&orte_sensor_base_selected_modules);
opal_list_get_end(&orte_sensor_base_selected_modules) != item;
item = opal_list_get_next(item)) {
pair = (orte_sensor_base_selected_pair_t*)item;
if (NULL != pair->module->start) {
pair->module->start();
}
}
return;
}
static void stop(void)
{
orte_sensor_base_selected_pair_t *pair;
opal_list_item_t *item;
for (item = opal_list_get_first(&orte_sensor_base_selected_modules);
opal_list_get_end(&orte_sensor_base_selected_modules) != item;
item = opal_list_get_next(item)) {
pair = (orte_sensor_base_selected_pair_t*)item;
if (NULL != pair->module->stop) {
pair->module->stop();
}
}
return;
}

Просмотреть файл

@ -1,47 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/mca/sensor/base/sensor_private.h"
/**
 * Scale raw sensor readings into bytes and store them in a sensor
 * data object.
 *
 * Each reading is mapped linearly so that target->min becomes 0 and
 * target->max becomes UINT8_MAX; results that fall outside that range
 * (or are not a number) are clamped instead of being converted, since
 * converting an out-of-range float to uint8_t is undefined behavior.
 * Any byte array previously held by the target is replaced, but only
 * once the new array has been built, so the target is left untouched
 * when an error is returned.
 *
 * @param target      sensor data object to fill; its min/max fields
 *                    define the scale
 * @param num_values  number of readings in data (0 yields an empty,
 *                    NULL byte array)
 * @param data        the readings to convert
 *
 * @return ORTE_SUCCESS on success;
 *         ORTE_ERR_BAD_PARAM for a NULL target, a negative count,
 *         missing data, or a degenerate scale (max == min);
 *         ORTE_ERR_OUT_OF_RESOURCE when the byte array cannot be
 *         allocated.
 */
int orte_sensor_scale_data(orte_sensor_data_t *target, int num_values, float *data)
{
    uint8_t *scaled = NULL;
    float range, value;
    int i;

    if (NULL == target || num_values < 0 ||
        (num_values > 0 && NULL == data)) {
        return ORTE_ERR_BAD_PARAM;
    }

    if (num_values > 0) {
        /* a zero-width scale would divide by zero below */
        range = (float) (target->max - target->min);
        if (0.0f == range) {
            return ORTE_ERR_BAD_PARAM;
        }

        /* allocate what we need (zero-filled) */
        scaled = (uint8_t *) calloc((size_t) num_values, sizeof(uint8_t));
        if (NULL == scaled) {
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        /* convert the data */
        for (i = 0; i < num_values; i++) {
            value = (float) UINT8_MAX * (data[i] - target->min) / range;
            if (!(value > 0.0f)) {
                /* below the scale, or NaN */
                scaled[i] = 0;
            } else if (value >= (float) UINT8_MAX) {
                scaled[i] = UINT8_MAX;
            } else {
                scaled[i] = (uint8_t) value;
            }
        }
    }

    /* swap in the new array only now that nothing can fail any more;
       free(NULL) is a no-op, so no guard is needed */
    free(target->data.bytes);
    target->data.bytes = scaled;
    target->data.size = num_values;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,128 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "orte/mca/sensor/base/base.h"
/**
 * Function for weeding out sensor components that don't want to run.
 *
 * The "sensor" MCA parameter holds a comma-delimited list of component
 * names. Every available component named in that list is queried for a
 * module; the module's init function (if any) is called, and each
 * module that does not fail is appended, together with its component,
 * to orte_sensor_base_selected_modules.
 *
 * NOTE(review): the parameter's help text says "empty = all avail",
 * but an unset parameter selects nothing -- confirm which of the two
 * is intended. The text is left unchanged here.
 *
 * @return ORTE_SUCCESS (including when nothing was requested or
 *         selected), or ORTE_ERR_OUT_OF_RESOURCE when a list entry
 *         could not be allocated.
 */
int orte_sensor_base_select(void)
{
    mca_base_component_list_item_t *cli = NULL;
    mca_base_component_t *component = NULL;
    mca_base_module_t *module = NULL;
    orte_sensor_base_module_t *nmodule;
    opal_list_item_t *item;
    int i, priority, ret;
    char *include_list = NULL;
    char **imodules = NULL;
    orte_sensor_base_selected_pair_t *pair;

    /*
     * Register the framework MCA param and look up include list
     */
    mca_base_param_reg_string_name("sensor", NULL,
                                   "Comma-delimisted list of sensor component to use (empty = all avail)",
                                   false, false,
                                   NULL, &include_list);

    /* if the list is empty, then we have nothing to do */
    if (NULL == include_list) {
        return ORTE_SUCCESS;
    }

    /* separate the names of the sensors to be used; the lookup handed
     * us an allocated copy of the string, which is no longer needed
     * once it has been split
     */
    imodules = opal_argv_split(include_list, ',');
    free(include_list);
    include_list = NULL;

    /* a list without any names (e.g. an empty string) produces no
     * argv at all - nothing was requested
     */
    if (NULL == imodules) {
        return ORTE_SUCCESS;
    }

    /* Query all available components and ask if they have a module */
    for (item = opal_list_get_first(&mca_sensor_base_components_available);
         opal_list_get_end(&mca_sensor_base_components_available) != item;
         item = opal_list_get_next(item)) {
        cli = (mca_base_component_list_item_t *) item;
        component = (mca_base_component_t *) cli->cli_component;

        /* If this component was not specified, skip it */
        for (i = 0; NULL != imodules[i]; ++i) {
            if (0 == strcmp(imodules[i], component->mca_component_name)) {
                break;
            }
        }
        if (NULL == imodules[i]) {
            continue;
        }

        /* If there's no query function, skip it */
        if (NULL == component->mca_query_component) {
            opal_output_verbose(5, orte_sensor_base_output,
                                "mca:sensor:select: Skipping component [%s]. It does not implement a query function",
                                component->mca_component_name );
            continue;
        }

        /* Query the component. Reset the output first so a component
         * that reports success without filling it in cannot hand us
         * the previous component's module.
         */
        opal_output_verbose(5, orte_sensor_base_output,
                            "mca:sensor:select: Querying component [%s]",
                            component->mca_component_name);
        module = NULL;
        ret = component->mca_query_component(&module, &priority);

        /* If no module was returned, then skip component */
        if (ORTE_SUCCESS != ret || NULL == module) {
            opal_output_verbose(5, orte_sensor_base_output,
                                "mca:sensor:select: Skipping component [%s]. Query failed to return a module",
                                component->mca_component_name );
            continue;
        }

        /* If we got a module, initialize it */
        nmodule = (orte_sensor_base_module_t*) module;
        if (NULL != nmodule->init) {
            /* If the module doesn't want to be used, skip it */
            if (ORTE_SUCCESS != (ret = nmodule->init()) ) {
                if (NULL != nmodule->finalize) {
                    nmodule->finalize();
                }
                continue;
            }
        }

        opal_output_verbose(5, orte_sensor_base_output,
                            "mca:sensor:select: Adding component [%s] to active list",
                            component->mca_component_name );

        /* Make an item for the list */
        pair = OBJ_NEW(orte_sensor_base_selected_pair_t);
        if (NULL == pair) {
            /* undo the init we just performed before bailing out */
            if (NULL != nmodule->finalize) {
                nmodule->finalize();
            }
            opal_argv_free(imodules);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        pair->component = (orte_sensor_base_component_t*) component;
        pair->module = nmodule;

        /* Add it to the list of operational sensors */
        opal_list_append(&orte_sensor_base_selected_modules, &(pair->super));
    }

    /* the name list was only needed for matching */
    opal_argv_free(imodules);

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,40 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SENSOR_PRIVATE_H
#define MCA_SENSOR_PRIVATE_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/dss/dss_types.h"
#include "orte/mca/sensor/sensor_types.h"
/*
* Global functions for MCA overall collective open and close
*/
BEGIN_C_DECLS
#if !ORTE_DISABLE_FULL_SUPPORT
/*
* function definitions
*/
ORTE_DECLSPEC int orte_sensor_scale_data(orte_sensor_data_t *target, int num_values, float *data);
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS
#endif

Просмотреть файл

@ -1,41 +0,0 @@
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
AM_CPPFLAGS = $(sensor_crsvm_CPPFLAGS)
dist_pkgdata_DATA = help-orte-sensor-crsvm.txt
sources = \
sensor_crsvm.c \
sensor_crsvm.h \
sensor_crsvm_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_sensor_crsvm_DSO
component_noinst =
component_install = mca_sensor_crsvm.la
else
component_noinst = libmca_sensor_crsvm.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_crsvm_la_SOURCES = $(sources)
mca_sensor_crsvm_la_LDFLAGS = -module -avoid-version $(sensor_crsvm_LDFLAGS)
mca_sensor_crsvm_la_LIBADD = $(sensor_crsvm_LIBS)
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_crsvm_la_SOURCES =$(sources)
libmca_sensor_crsvm_la_LDFLAGS = -module -avoid-version $(sensor_crsvm_LDFLAGS)
libmca_sensor_crsvm_la_LIBADD = $(sensor_crsvm_LIBS)

Просмотреть файл

@ -1,144 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2009 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2007 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# OMPI_CHECK_CLIB(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Check if clib (Eliot's programming library) support can be found.
# The probe looks for the header clib/error.h and the symbol
# clib_error_free_vector in library clib, honouring --with-clib(=DIR)
# and --with-clib-libdir=DIR.  The check is skipped entirely when
# --with-clib=no (--without-clib) is given.
# Sets prefix_{CPPFLAGS, LDFLAGS, LIBS} as needed and runs action-if-found if there is
# support, otherwise executes action-if-not-found.  If clib was explicitly
# requested (--with-clib given and not "no") but is not found, configure
# aborts with an error before action-if-not-found is reached.
AC_DEFUN([OMPI_CHECK_CLIB],[
AC_ARG_WITH([clib],
[AC_HELP_STRING([--with-clib(=DIR)],
[Build CLIB (Eliots programming library) support, searching for libraries in DIR])])
AC_ARG_WITH([clib-libdir],
[AC_HELP_STRING([--with-clib-libdir=DIR],
[Search for CLIB (Eliots programming library) libraries in DIR])])
AS_IF([test "$with_clib" != "no"],
[AS_IF([test ! -z "$with_clib" -a "$with_clib" != "yes"],
[ompi_check_clib_dir="$with_clib"])
AS_IF([test ! -z "$with_clib_libdir" -a "$with_clib_libdir" != "yes"],
[ompi_check_clib_libdir="$with_clib_libdir"])
OMPI_CHECK_PACKAGE([$1],
[clib/error.h],
[clib],
[clib_error_free_vector],
,
[$ompi_check_clib_dir],
[$ompi_check_clib_libdir],
[ompi_check_clib_happy="yes"],
[ompi_check_clib_happy="no"])
],
[ompi_check_clib_happy="no"])
AS_IF([test "$ompi_check_clib_happy" = "yes"],
[$2],
[AS_IF([test ! -z "$with_clib" -a "$with_clib" != "no"],
[AC_MSG_ERROR([CLIB (Eliots programming library) support requested but not found. Aborting])])
$3])
])
# OMPI_CHECK_CRSVM(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Check if crsvm (CRS Shared Virtual Memory) support can be found.
# CLIB is probed first (clib/error.h, clib_error_free_vector); only when
# that succeeds is the SVM database library probed (svmdb.h, svmdb_map).
# Sets prefix_{CPPFLAGS, LDFLAGS, LIBS} as needed and runs action-if-found
# if there is support, otherwise executes action-if-not-found.  The global
# CPPFLAGS, LDFLAGS and LIBS are saved before the probes and restored
# afterwards.
#
# Every outcome of the nested probe records both ompi_check_crsvm_happy
# and ompi_check_clib_happy, including "CLIB found but svmdb missing", so
# the error raised for an explicit --with-crsvm request names the package
# that is really missing and no stale value of either variable is used.
AC_DEFUN([OMPI_CHECK_CRSVM],[
    AC_ARG_WITH([clib],
        [AC_HELP_STRING([--with-clib(=DIR)],
            [Build CLIB (Eliots programming library) support, searching for libraries in DIR])])
    AC_ARG_WITH([clib-libdir],
        [AC_HELP_STRING([--with-clib-libdir=DIR],
            [Search for CLIB (Eliots programming library) libraries in DIR])])
    AC_ARG_WITH([crsvm],
        [AC_HELP_STRING([--with-crsvm(=DIR)],
            [Build CRS SVM (Shared Virtual Memory) support, searching for libraries in DIR])])
    AC_ARG_WITH([crsvm-libdir],
        [AC_HELP_STRING([--with-crsvm-libdir=DIR],
            [Search for CRS SVM (Shared Virtual Memory) libraries in DIR])])

    AS_IF([test "$with_crsvm" != "no"],
          [AS_IF([test ! -z "$with_clib" -a "$with_clib" != "yes"],
                 [ompi_check_clib_dir="$with_clib"])
           AS_IF([test ! -z "$with_clib_libdir" -a "$with_clib_libdir" != "yes"],
                 [ompi_check_clib_libdir="$with_clib_libdir"])
           AS_IF([test ! -z "$with_crsvm" -a "$with_crsvm" != "yes"],
                 [ompi_check_crsvm_dir="$with_crsvm"])
           AS_IF([test ! -z "$with_crsvm_libdir" -a "$with_crsvm_libdir" != "yes"],
                 [ompi_check_crsvm_libdir="$with_crsvm_libdir"])

           ompi_check_crsvm_$1_save_CPPFLAGS="$CPPFLAGS"
           ompi_check_crsvm_$1_save_LDFLAGS="$LDFLAGS"
           ompi_check_crsvm_$1_save_LIBS="$LIBS"

           OMPI_CHECK_PACKAGE([$1],
                              [clib/error.h],
                              [clib],
                              [clib_error_free_vector],
                              [-lpthread],
                              [$ompi_check_clib_dir],
                              [$ompi_check_clib_libdir],
                              [OMPI_CHECK_PACKAGE([$1],
                                                  [svmdb.h],
                                                  [svmdb],
                                                  [svmdb_map],
                                                  [-lsvm -lclib -lpthread -lrt],
                                                  [$ompi_check_crsvm_dir],
                                                  [$ompi_check_crsvm_libdir],
                                                  [ompi_check_crsvm_happy="yes"
                                                   ompi_check_clib_happy="yes"],
                                                  [ompi_check_crsvm_happy="no"
                                                   ompi_check_clib_happy="yes"])],
                              [ompi_check_crsvm_happy="no"
                               ompi_check_clib_happy="no"])

           CPPFLAGS="$ompi_check_crsvm_$1_save_CPPFLAGS"
           LDFLAGS="$ompi_check_crsvm_$1_save_LDFLAGS"
           LIBS="$ompi_check_crsvm_$1_save_LIBS"
          ],
          [ompi_check_crsvm_happy="no"])

    AS_IF([test "$ompi_check_crsvm_happy" = "yes"],
          [$2],
          [AS_IF([test ! -z "$with_crsvm" -a "$with_crsvm" != "no"],
                 [AS_IF([test "$ompi_check_clib_happy" = "yes"],
                        [AC_MSG_ERROR([CRS SVM (Shared Virtual Memory) support requested but not found. Aborting])],
                        [AC_MSG_ERROR([CLIB (Eliots programming library) support required but not found. Aborting])])])
           $3])
])
# MCA_sensor_crsvm_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Configure hook for the crsvm sensor component.  Runs OMPI_CHECK_CRSVM
# with the prefix sensor_crsvm; on success the resulting LDFLAGS and LIBS
# are also copied into the sensor_crsvm_WRAPPER_EXTRA_* variables before
# action-if-found runs, otherwise action-if-not-found runs.  The
# sensor_crsvm_{CFLAGS, CPPFLAGS, LDFLAGS, LIBS} variables are substituted
# in either case.
AC_DEFUN([MCA_sensor_crsvm_CONFIG], [
OMPI_CHECK_CRSVM([sensor_crsvm],
[sensor_crsvm_happy="yes"],
[sensor_crsvm_happy="no"])
AS_IF([test "$sensor_crsvm_happy" = "yes"],
[sensor_crsvm_WRAPPER_EXTRA_LDFLAGS="$sensor_crsvm_LDFLAGS"
sensor_crsvm_WRAPPER_EXTRA_LIBS="$sensor_crsvm_LIBS"
$1],
[$2])
# substitute in the things needed to build crsvm
AC_SUBST([sensor_crsvm_CFLAGS])
AC_SUBST([sensor_crsvm_CPPFLAGS])
AC_SUBST([sensor_crsvm_LDFLAGS])
AC_SUBST([sensor_crsvm_LIBS])
])dnl

Просмотреть файл

@ -1,14 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
# Files to be generated for this component: only its Makefile.
# NOTE(review): presumably read by the project's autogen/configure
# machinery - confirm where PARAM_CONFIG_FILES is consumed.
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,52 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the resilient mapper.
#
[orte-rmaps-resilient:alloc-error]
There are not enough slots available in the system to satisfy the %d slots
that were requested by the application:
%s
Either request fewer slots for your application, or make more slots available
for use.
[orte-rmaps-resilient:multi-apps-and-zero-np]
RMAPS found multiple applications to be launched, with
at least one that failed to specify the number of processes to execute.
When specifying multiple applications, you must specify how many processes
of each to launch via the -np argument.
[orte-rmaps-resilient:per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a per-node basis - only %d nodes were available.
Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-resilient:n-per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a %d per-node basis - only %d nodes with a total of %d slots were available.
Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-resilient:n-per-node-and-not-enough-slots]
There are not enough slots on the nodes in your allocation to satisfy your request to launch on a %d process-per-node basis - only %d slots/node were available.
Either request fewer processes/node, or obtain a larger allocation.
[orte-rmaps-resilient:no-np-and-user-map]
You have specified a rank-to-node/slot mapping, but failed to provide
the number of processes to be executed. For some reason, this information
could not be obtained from the mapping you provided, so we cannot continue
with executing the specified application.
#
[orte-rmaps-resilient:file-not-found]
The specified file that describes the fault groups for this system:
FILE: %s
was not found. Please verify the file name and location.

Просмотреть файл

@ -1,211 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include "opal_stdint.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/pstat/pstat.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/fddp/fddp.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "sensor_crsvm.h"
#include <clib/clib.h>
#include <clib/vec.h>
#include <clib/hash.h>
#include <clib/bitmap.h>
#include <clib/fifo.h>
#include <clib/time.h>
#include <clib/mheap.h>
#include <clib/heap.h>
#include <clib/pool.h>
#include <clib/format.h>
#include <svmdb.h>
/* declare the API functions (file-local implementations of the sensor module interface) */
static int init(void);
static void finalize(void);
static void start(void);
static void stop(void);
/* instantiate the module
 * entries are positional: init, finalize, start, stop
 */
orte_sensor_base_module_t orte_sensor_crsvm_module = {
init,
finalize,
start,
stop
};
/* declare the local functions */
/* timer callback that takes one sample and re-arms the timer */
static void sample(int fd, short event, void *arg);
/* local globals */
/* orte_proc_t objects queued by sample() for local termination */
static opal_pointer_array_t killarray;
/* true while periodic sampling is active; cleared by stop() and on sampling errors */
static bool sampling = false;
/* handle returned by svmdb_map() in init(); NULL when not mapped */
static svmdb_client_t *svmm = NULL;
/*
 * Module init: prepare the kill list, attach to the SVM database and
 * kick off sampling. Always reports ORTE_SUCCESS.
 */
static int init(void)
{
    /* the kill list must exist before the first sample can queue procs */
    OBJ_CONSTRUCT(&killarray, opal_pointer_array_t);
    opal_pointer_array_init(&killarray, 16, INT_MAX, 16);

    /* attach to the shared memory segment */
    svmm = svmdb_map(0);

    /* TIM: temporary hack to self-start the sensor module. */
    start();

    return ORTE_SUCCESS;
}
/*
 * Module finalize: halt sampling, detach from the SVM database and
 * tear down the kill list.
 */
static void finalize(void)
{
    /* TIM: temporary hack to self-stop the sensor module. */
    stop();

    /* detach from the shared memory segment if we are attached */
    if (svmm != NULL) {
        svmdb_unmap(svmm);
        svmm = NULL;
    }

    OBJ_DESTRUCT(&killarray);
}
/*
 * Begin periodic monitoring of local processes.
 *
 * Does nothing when sampling is already active or when the configured
 * sample rate is not positive.
 */
static void start(void)
{
    if (sampling || mca_sensor_crsvm_component.sample_rate <= 0) {
        return;
    }

    /* arm a timer that wakes us up for the next data sample */
    sampling = true;
    ORTE_TIMER_EVENT(mca_sensor_crsvm_component.sample_rate, 0, sample);
}
/*
 * Stop monitoring: clear the flag so the next timer callback returns
 * without sampling or re-arming itself.
 */
static void stop(void)
{
    sampling = false;
}
/*
 * Release every orte_proc_t still queued in killarray and clear its slot
 * so the array can be reused by the next sample.
 */
static void crsvm_flush_killarray(void)
{
    orte_proc_t *proc;
    int i;

    for (i = 0; i < killarray.size; i++) {
        if (NULL != (proc = opal_pointer_array_get_item(&killarray, i))) {
            OBJ_RELEASE(proc);
            opal_pointer_array_set_item(&killarray, i, NULL);
        }
    }
}

/*
 * Timer callback: take one sample and re-arm the timer.
 *
 * Reads the "tempInlet0" variable from the SVM database, logs it, queries
 * the resource usage of every local child and, when the temperature is
 * above celsius_limit, orders the local termination of all children.
 *
 * fd, event, arg - timer-event callback arguments; not used here.
 *
 * If a process-stat query fails, sampling is disabled and the timer is not
 * re-armed; any procs already queued during this pass are released so no
 * stale entries remain in killarray.
 */
static void sample(int fd, short event, void *arg)
{
    opal_list_item_t *item;
    orte_odls_child_t *child;
    opal_pstats_t stats;
    orte_proc_t *proc;
    bool killreqd = false;
    bool over_limit;
    int rc;
    double celsius = -100.0; /* clearly a bogus value */

    /* if we are not sampling any more, then just return */
    if (!sampling) {
        return;
    }

    OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                         "sample:crsvm sampling sensors"));

    /* for each sensor - skipped when the database was never mapped, in
     * which case the bogus default temperature is kept
     */
    if (NULL != svmm) {
        char *ascii = svmdb_local_get_string_variable(svmm, "tempInlet0");
        if (NULL != ascii) {
            celsius = atof(ascii);
            vec_free(ascii);
        }
    }

    /* the reading cannot change while we walk the children */
    over_limit = (celsius > mca_sensor_crsvm_component.celsius_limit);

    if (over_limit) {
        OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                             "sample:crsvm got temperature of %3.2f celsius, over-limit",
                             celsius));
        /* we should notify the CM to not schedule new jobs to this node */
    } else {
        OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                             "sample:crsvm got temperature of %3.2f celsius",
                             celsius));
        /* we might want to notify the CM that this node is healthy? */
    }

    /* loop through our local children (because we can...) */
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;

        /* get the process resource utilization stats (because we can...) */
        if (ORTE_SUCCESS != (rc = opal_pstat.query(child->pid, &stats))) {
            ORTE_ERROR_LOG(rc);
            /* drop whatever was queued so far - nothing will consume it */
            crsvm_flush_killarray();
            /* no point in continuing sampling */
            sampling = false;
            return;
        }
        OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                             "sample:crsvm got cpu time of %lu seconds for proc %s",
                             (unsigned long)stats.time, ORTE_NAME_PRINT(child->name)));

        /* temperature limit exceeded - schedule proc to be killed */
        if (over_limit) {
            proc = OBJ_NEW(orte_proc_t);
            proc->name.jobid = child->name->jobid;
            proc->name.vpid = child->name->vpid;
            opal_pointer_array_add(&killarray, proc);
            killreqd = true;
        }
    }

    if (killreqd) {
        /* order the local termination of the specified procs,
         * and have the HNP alerted to their death
         */
        OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                             "sample:crsvm killing procs"));
        orte_odls.kill_local_procs(&killarray, true);
        /* clean out the array for re-use */
        crsvm_flush_killarray();
    }

    /* restart the timer */
    ORTE_TIMER_EVENT(mca_sensor_crsvm_component.sample_rate, 0, sample);
}

Просмотреть файл

@ -1,37 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
 * @file
 *
 * CRS SVM (Shared Virtual Memory) sensor component
 */
#ifndef ORTE_SENSOR_CRS_H
#define ORTE_SENSOR_CRS_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/* Component structure: the base sensor component plus this component's MCA parameters */
struct orte_sensor_crsvm_component_t {
orte_sensor_base_component_t super;
/* sample rate in seconds (MCA parameter "sample_rate") */
int sample_rate;
/* maximum temperature in celsius (MCA parameter "celsius_limit") */
int celsius_limit;
};
typedef struct orte_sensor_crsvm_component_t orte_sensor_crsvm_component_t;
/* the single component instance */
ORTE_MODULE_DECLSPEC extern orte_sensor_crsvm_component_t mca_sensor_crsvm_component;
/* the module (init/finalize/start/stop) handed back by the component query */
extern orte_sensor_base_module_t orte_sensor_crsvm_module;
END_C_DECLS
#endif

Просмотреть файл

@ -1,95 +0,0 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_crsvm.h"
/*
 * Local functions (component open/close/query callbacks)
 */
static int orte_sensor_crsvm_open(void);
static int orte_sensor_crsvm_close(void);
static int orte_sensor_crsvm_query(mca_base_module_t **module, int *priority);
/*
 * Component instance. Only the base component (super) is initialized
 * here; sample_rate and celsius_limit are filled in by the open function.
 */
orte_sensor_crsvm_component_t mca_sensor_crsvm_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"crsvm", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_crsvm_open, /* component open */
orte_sensor_crsvm_close, /* component close */
orte_sensor_crsvm_query /* component query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
}
};
/**
 * Component open: register this component's MCA parameters and store
 * their values in the component structure. Always returns ORTE_SUCCESS.
 */
static int orte_sensor_crsvm_open(void)
{
    mca_base_component_t *base = &mca_sensor_crsvm_component.super.base_version;

    /* how often to sample */
    mca_base_param_reg_int(base, "sample_rate",
                           "Sample rate in seconds (default=10)",
                           false, false, 10,
                           &mca_sensor_crsvm_component.sample_rate);

    /* temperature above which local procs are terminated */
    mca_base_param_reg_int(base, "celsius_limit",
                           "Max temperature in celsius (default=50)",
                           false, false, 50,
                           &mca_sensor_crsvm_component.celsius_limit);

    return ORTE_SUCCESS;
}
/**
 * Component query: hand back the crsvm module with priority 0.
 *
 * @param module   set to the crsvm sensor module
 * @param priority set to 0
 * @return ORTE_SUCCESS
 */
static int orte_sensor_crsvm_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_crsvm_module;
    /* select only if specified */
    *priority = 0;

    return ORTE_SUCCESS;
}
/**
 * Component close: there is nothing to release here, so this simply
 * returns ORTE_SUCCESS.
 */
static int orte_sensor_crsvm_close(void)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,37 +0,0 @@
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help-message file shipped with the package data.
dist_pkgdata_DATA = help-orte-sensor-pru.txt
# Source files that make up the pru sensor component.
sources = \
sensor_pru.c \
sensor_pru.h \
sensor_pru_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on the OMPI_BUILD_sensor_pru_DSO conditional.
if OMPI_BUILD_sensor_pru_DSO
component_noinst =
component_install = mca_sensor_pru.la
else
component_noinst = libmca_sensor_pru.la
component_install =
endif
# DSO flavour: installed into $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_pru_la_SOURCES = $(sources)
mca_sensor_pru_la_LDFLAGS = -module -avoid-version
# Static flavour: built from the same sources but not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_pru_la_SOURCES =$(sources)
libmca_sensor_pru_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,14 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
# Files to be generated for this component: only its Makefile.
# NOTE(review): presumably read by the project's autogen/configure
# machinery - confirm where PARAM_CONFIG_FILES is consumed.
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,52 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the resilient mapper.
#
[orte-rmaps-resilient:alloc-error]
There are not enough slots available in the system to satisfy the %d slots
that were requested by the application:
%s
Either request fewer slots for your application, or make more slots available
for use.
[orte-rmaps-resilient:multi-apps-and-zero-np]
RMAPS found multiple applications to be launched, with
at least one that failed to specify the number of processes to execute.
When specifying multiple applications, you must specify how many processes
of each to launch via the -np argument.
[orte-rmaps-resilient:per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a per-node basis - only %d nodes were available.
Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-resilient:n-per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a %d per-node basis - only %d nodes with a total of %d slots were available.
Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-resilient:n-per-node-and-not-enough-slots]
There are not enough slots on the nodes in your allocation to satisfy your request to launch on a %d process-per-node basis - only %d slots/node were available.
Either request fewer processes/node, or obtain a larger allocation.
[orte-rmaps-resilient:no-np-and-user-map]
You have specified a rank-to-node/slot mapping, but failed to provide
the number of processes to be executed. For some reason, this information
could not be obtained from the mapping you provided, so we cannot continue
with executing the specified application.
#
[orte-rmaps-resilient:file-not-found]
The specified file that describes the fault groups for this system:
FILE: %s
was not found. Please verify the file name and location.

Просмотреть файл

@ -1,176 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include "opal_stdint.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/pstat/pstat.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/fddp/fddp.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "sensor_pru.h"
/* declare the API functions (file-local implementations of the sensor module interface) */
static int init(void);
static void finalize(void);
static void start(void);
static void stop(void);
/* instantiate the module
 * entries are positional: init, finalize, start, stop
 */
orte_sensor_base_module_t orte_sensor_pru_module = {
init,
finalize,
start,
stop
};
/* declare the local functions */
/* timer callback that takes one sample and re-arms the timer */
static void sample(int fd, short event, void *arg);
/* local globals */
/* orte_proc_t objects queued by sample() for local termination */
static opal_pointer_array_t killarray;
/* true while periodic sampling is active; cleared by stop() and on sampling errors */
static bool sampling = false;
/*
 * Module init: create the list used to collect procs that must be
 * terminated. Always reports ORTE_SUCCESS.
 */
static int init(void)
{
    /* the kill list must exist before the first sample can queue procs */
    OBJ_CONSTRUCT(&killarray, opal_pointer_array_t);
    opal_pointer_array_init(&killarray, 16, INT_MAX, 16);

    return ORTE_SUCCESS;
}
/*
 * Module finalize: tear down the kill list created by init().
 */
static void finalize(void)
{
    OBJ_DESTRUCT(&killarray);
}
/*
 * Begin periodic monitoring of local processes.
 *
 * Does nothing when sampling is already active or when the configured
 * sample rate is not positive.
 */
static void start(void)
{
    if (sampling || mca_sensor_pru_component.sample_rate <= 0) {
        return;
    }

    /* arm a timer that wakes us up for the next data sample */
    sampling = true;
    ORTE_TIMER_EVENT(mca_sensor_pru_component.sample_rate, 0, sample);
}
/*
 * Stop monitoring: clear the flag so the next timer callback returns
 * without sampling or re-arming itself.
 */
static void stop(void)
{
    sampling = false;
}
/*
 * Release every orte_proc_t still queued in killarray and clear its slot
 * so the array can be reused by the next sample.
 */
static void pru_flush_killarray(void)
{
    orte_proc_t *proc;
    int i;

    for (i = 0; i < killarray.size; i++) {
        if (NULL != (proc = opal_pointer_array_get_item(&killarray, i))) {
            OBJ_RELEASE(proc);
            opal_pointer_array_set_item(&killarray, i, NULL);
        }
    }
}

/*
 * Timer callback: take one sample and re-arm the timer.
 *
 * Queries the resource usage of every local child, queues each child
 * whose virtual memory size exceeds memory_limit, and orders the local
 * termination of the queued procs.
 *
 * fd, event, arg - timer-event callback arguments; not used here.
 *
 * If a process-stat query fails, sampling is disabled and the timer is not
 * re-armed; any procs already queued during this pass are released so no
 * stale entries remain in killarray.
 *
 * NOTE(review): sizes are computed as vsize/1000000 while the messages and
 * the parameter help say Gbytes - confirm the unit of stats.vsize.
 */
static void sample(int fd, short event, void *arg)
{
    opal_list_item_t *item;
    orte_odls_child_t *child;
    opal_pstats_t stats;
    orte_proc_t *proc;
    bool killreqd = false;
    int rc;

    /* if we are not sampling any more, then just return */
    if (!sampling) {
        return;
    }

    OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                         "sample:pru sampling resource usage"));

    /* loop through our local children */
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;

        /* get the process resource utilization stats */
        if (ORTE_SUCCESS != (rc = opal_pstat.query(child->pid, &stats))) {
            ORTE_ERROR_LOG(rc);
            /* drop whatever was queued so far - nothing will consume it */
            pru_flush_killarray();
            /* no point in continuing sampling */
            sampling = false;
            return;
        }

        /* divide before narrowing so the value is not truncated first */
        OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                             "sample:pru got memory size of %lu Gbytes for proc %s",
                             (unsigned long)(stats.vsize/1000000), ORTE_NAME_PRINT(child->name)));

        /* check the memory size for limit */
        if ((stats.vsize/1000000) > mca_sensor_pru_component.memory_limit) {
            /* memory limit exceeded - schedule proc to be killed */
            OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                                 "sample:pru proc %s has exceeded memory limit of %lu Gbytes",
                                 ORTE_NAME_PRINT(child->name),
                                 (unsigned long)mca_sensor_pru_component.memory_limit));
            proc = OBJ_NEW(orte_proc_t);
            proc->name.jobid = child->name->jobid;
            proc->name.vpid = child->name->vpid;
            opal_pointer_array_add(&killarray, proc);
            killreqd = true;
            continue;
        }
        /* check memory size trends */
        /* does trend cross limits in time window */
    }

    if (killreqd) {
        /* order the local termination of the specified procs,
         * and have the HNP alerted to their death
         */
        OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                             "sample:pru killing procs"));
        orte_odls.kill_local_procs(&killarray, true);
        /* clean out the array for re-use */
        pru_flush_killarray();
    }

    /* restart the timer */
    ORTE_TIMER_EVENT(mca_sensor_pru_component.sample_rate, 0, sample);
}

Просмотреть файл

@ -1,37 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
 * @file
 *
 * Process Resource Utilization sensor
 */
#ifndef ORTE_SENSOR_PRU_H
#define ORTE_SENSOR_PRU_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/* Component structure: the base sensor component plus this component's MCA parameters */
struct orte_sensor_pru_component_t {
orte_sensor_base_component_t super;
/* sample rate in seconds (MCA parameter "sample_rate") */
int sample_rate;
/* maximum virtual memory size; the MCA parameter "memory_limit" documents it as GBytes */
uint64_t memory_limit;
};
typedef struct orte_sensor_pru_component_t orte_sensor_pru_component_t;
/* the single component instance */
ORTE_MODULE_DECLSPEC extern orte_sensor_pru_component_t mca_sensor_pru_component;
/* the module (init/finalize/start/stop) handed back by the component query */
extern orte_sensor_base_module_t orte_sensor_pru_module;
END_C_DECLS
#endif

Просмотреть файл

@ -1,102 +0,0 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_pru.h"
/*
 * Local functions (component open/close/query callbacks)
 */
static int orte_sensor_pru_open(void);
static int orte_sensor_pru_close(void);
static int orte_sensor_pru_query(mca_base_module_t **module, int *priority);
/*
 * Component instance. Only the base component (super) is initialized
 * here; sample_rate and memory_limit are filled in by the open function.
 */
orte_sensor_pru_component_t mca_sensor_pru_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"pru", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_pru_open, /* component open */
orte_sensor_pru_close, /* component close */
orte_sensor_pru_query /* component query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
}
};
/**
 * Component open: register this component's MCA parameters and store
 * their values in the component structure.
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_FATAL when memory_limit is not a
 *         positive value (the limit must be > 0, as the error message
 *         states; a zero limit is rejected as well as a negative one).
 */
static int orte_sensor_pru_open(void)
{
    mca_base_component_t *c = &mca_sensor_pru_component.super.base_version;
    int tmp;

    /* lookup parameters */
    mca_base_param_reg_int(c, "sample_rate",
                           "Sample rate in seconds (default=10)",
                           false, false, 10, &mca_sensor_pru_component.sample_rate);

    /* registered through an int because memory_limit itself is uint64_t */
    mca_base_param_reg_int(c, "memory_limit",
                           "Max virtual memory size in GBytes (default=10)",
                           false, false, 10, &tmp);
    if (tmp <= 0) {
        opal_output(0, "Illegal value %d - must be > 0", tmp);
        return ORTE_ERR_FATAL;
    }
    mca_sensor_pru_component.memory_limit = tmp;

    return ORTE_SUCCESS;
}
/**
 * Component query: hand back the pru module with priority 0.
 *
 * @param module   set to the pru sensor module
 * @param priority set to 0
 * @return ORTE_SUCCESS
 */
static int orte_sensor_pru_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_pru_module;
    /* select only if specified */
    *priority = 0;

    return ORTE_SUCCESS;
}
/**
 * Component close: there is nothing to release here, so this simply
 * returns ORTE_SUCCESS.
 */
static int orte_sensor_pru_close(void)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,96 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* @file:
*
*/
#ifndef MCA_SENSOR_H
#define MCA_SENSOR_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/mca/mca.h"
BEGIN_C_DECLS
/*
 * Component functions - all MUST be provided!
 *
 * Function-pointer types for the four module entry points. None of them
 * takes arguments; only init reports a status.
 */
/* initialize the selected module; returns an int status */
typedef int (*orte_sensor_base_module_init_fn_t)(void);
/* finalize the selected module */
typedef void (*orte_sensor_base_module_finalize_fn_t)(void);
/* start collecting data */
typedef void (*orte_sensor_base_module_start_fn_t)(void);
/* stop collecting data */
typedef void (*orte_sensor_base_module_stop_fn_t)(void);
/* API module */
/*
 * Ver 1.0
 *
 * Public-facing function table: exposes only start and stop.
 */
struct orte_sensor_base_API_module_1_0_0_t {
orte_sensor_base_module_start_fn_t start;
orte_sensor_base_module_stop_fn_t stop;
};
typedef struct orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_1_0_0_t;
/* unversioned alias for the current API module type */
typedef orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_t;
/*
 * Component modules Ver 1.0
 *
 * Function table each sensor component provides. Components initialize it
 * positionally, so the field order (init, finalize, start, stop) matters.
 */
struct orte_sensor_base_module_1_0_0_t {
orte_sensor_base_module_init_fn_t init;
orte_sensor_base_module_finalize_fn_t finalize;
orte_sensor_base_module_start_fn_t start;
orte_sensor_base_module_stop_fn_t stop;
};
typedef struct orte_sensor_base_module_1_0_0_t orte_sensor_base_module_1_0_0_t;
/* unversioned alias for the current module type */
typedef orte_sensor_base_module_1_0_0_t orte_sensor_base_module_t;
/*
 * the standard component data structure
 *
 * Holds the MCA base component descriptor followed by the MCA component
 * metadata; sensor components embed it as their first member.
 */
struct orte_sensor_base_component_1_0_0_t {
mca_base_component_t base_version;
mca_base_component_data_t base_data;
};
typedef struct orte_sensor_base_component_1_0_0_t orte_sensor_base_component_1_0_0_t;
/* unversioned alias for the current component type */
typedef orte_sensor_base_component_1_0_0_t orte_sensor_base_component_t;
/*
 * Macro for use in components that are of type sensor v1.0.0
 *
 * Expands to the leading initializer fields of a component descriptor:
 * the MCA v2.0.0 version fields followed by the framework name "sensor"
 * and the framework version 1, 0, 0.
 */
#define ORTE_SENSOR_BASE_VERSION_1_0_0 \
/* sensor v1.0 is chained to MCA v2.0 */ \
MCA_BASE_VERSION_2_0_0, \
/* sensor v1.0 */ \
"sensor", 1, 0, 0
/* Global structure for accessing sensor functions
 * (start/stop only - see orte_sensor_base_API_module_t); defined elsewhere.
 */
ORTE_DECLSPEC extern orte_sensor_base_API_module_t orte_sensor; /* holds API function pointers */
END_C_DECLS
#endif /* MCA_SENSOR_H */

Просмотреть файл

@ -1,54 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef ORTE_MCA_SENSOR_TYPES_H
#define ORTE_MCA_SENSOR_TYPES_H
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/dss/dss_types.h"
/*
 * General SENSOR types - instanced in runtime/orte_globals.c
 */
BEGIN_C_DECLS
/* Scaling laws selectable through orte_sensor_data_t.scaling_law.
 * NOTE(review): how each law is applied is not visible in this header -
 * see the scaling implementation.
 */
enum {
ORTE_SENSOR_SCALE_LINEAR,
ORTE_SENSOR_SCALE_LOG,
ORTE_SENSOR_SCALE_SIGMOID
};
/*
 * Structure for passing data from sensors
 *
 * The object owns the sensor string and the data.bytes buffer: both are
 * released with free() by the class destructor.
 */
typedef struct {
/* OPAL object base */
opal_object_t super;
/* sensor name string; NULL until set */
char *sensor;
/* time of the reading */
struct timeval timestamp;
/* one of the ORTE_SENSOR_SCALE_* values; the constructor selects LINEAR */
int scaling_law;
/* scaling parameters; the constructor sets min=0.0, max=100.0, gain=1.0 */
float min;
float max;
float gain;
/* raw payload; empty (size 0, bytes NULL) after construction */
opal_byte_object_t data;
} orte_sensor_data_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_sensor_data_t);
END_C_DECLS
#endif

Просмотреть файл

@ -35,7 +35,6 @@
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/sensor/sensor_types.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
@ -984,27 +983,4 @@ OBJ_CLASS_INSTANCE(orte_regex_node_t,
orte_regex_node_construct,
orte_regex_node_destruct);
/*
 * Constructor for orte_sensor_data_t: no sensor name, zeroed timestamp,
 * linear scaling over the range 0..100 with unit gain, and an empty
 * data payload.
 */
static void orte_sensor_data_construct(orte_sensor_data_t *ptr)
{
    ptr->sensor = NULL;
    /* give the timestamp a defined value instead of leaving it uninitialized */
    ptr->timestamp.tv_sec = 0;
    ptr->timestamp.tv_usec = 0;
    ptr->scaling_law = ORTE_SENSOR_SCALE_LINEAR;
    ptr->min = 0.0;
    ptr->max = 100.0;
    ptr->gain = 1.0;
    ptr->data.size = 0;
    ptr->data.bytes = NULL;
}
/*
 * Destructor for orte_sensor_data_t: release the sensor name and the
 * data payload owned by the object.
 */
static void orte_sensor_data_destruct(orte_sensor_data_t *ptr)
{
    /* free() accepts NULL, so unset members need no special casing */
    free(ptr->sensor);
    free(ptr->data.bytes);
}
/* Class instance for orte_sensor_data_t: derives from opal_object_t and
 * uses the constructor/destructor defined above.
 */
OBJ_CLASS_INSTANCE(orte_sensor_data_t,
opal_object_t,
orte_sensor_data_construct,
orte_sensor_data_destruct);
#endif