1
1

Add a new sensor component that pulls data via an external shared memory interface

Only builds when the appropriate library is present

This commit was SVN r22114.
Этот коммит содержится в:
Ralph Castain 2009-10-20 23:45:35 +00:00
родитель 214e26b539
Коммит ee82d42a1c
8 изменённых файлов: 603 добавлений и 3 удалений

41
orte/mca/sensor/crsvm/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,41 @@
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Preprocessor flags for this component; sensor_crsvm_CPPFLAGS is
# substituted by the component's configure.m4.
AM_CPPFLAGS = $(sensor_crsvm_CPPFLAGS)
# Help-message text file shipped with the component.
dist_pkgdata_DATA = help-orte-sensor-crsvm.txt
# Source files shared by the DSO and the static flavor of the component.
sources = \
sensor_crsvm.c \
sensor_crsvm.h \
sensor_crsvm_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on whether this component is built as a DSO.
if OMPI_BUILD_sensor_crsvm_DSO
component_noinst =
component_install = mca_sensor_crsvm.la
else
component_noinst = libmca_sensor_crsvm.la
component_install =
endif
# DSO flavor: installed into the package library directory.
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_crsvm_la_SOURCES = $(sources)
mca_sensor_crsvm_la_LDFLAGS = -module -avoid-version $(sensor_crsvm_LDFLAGS)
mca_sensor_crsvm_la_LIBADD = $(sensor_crsvm_LIBS)
# Static flavor: built here but not installed on its own.
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_crsvm_la_SOURCES =$(sources)
libmca_sensor_crsvm_la_LDFLAGS = -module -avoid-version $(sensor_crsvm_LDFLAGS)
libmca_sensor_crsvm_la_LIBADD = $(sensor_crsvm_LIBS)

144
orte/mca/sensor/crsvm/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,144 @@
# -*- shell-script -*-
#
# Copyright (c) 2009 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2007 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# OMPI_CHECK_CLIB(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Check if clib (Eliots programming library) support can be found.
# Sets prefix_{CPPFLAGS, LDFLAGS, LIBS} as needed and runs action-if-found
# if there is support, otherwise executes action-if-not-found.
#
# The search locations come from --with-clib(=DIR) and
# --with-clib-libdir=DIR; --with-clib=no skips the probe entirely.
# The probe itself is delegated to OMPI_CHECK_PACKAGE, which is asked
# for the clib/error.h header and the clib_error_free_vector symbol
# in the clib library.
#
# If --with-clib was given with any value other than "no" and the
# library is not found, configure aborts with an error instead of
# running action-if-not-found.
AC_DEFUN([OMPI_CHECK_CLIB],[
AC_ARG_WITH([clib],
[AC_HELP_STRING([--with-clib(=DIR)],
[Build CLIB (Eliots programming library) support, searching for libraries in DIR])])
AC_ARG_WITH([clib-libdir],
[AC_HELP_STRING([--with-clib-libdir=DIR],
[Search for CLIB (Eliots programming library) libraries in DIR])])
AS_IF([test "$with_clib" != "no"],
[AS_IF([test ! -z "$with_clib" -a "$with_clib" != "yes"],
[ompi_check_clib_dir="$with_clib"])
AS_IF([test ! -z "$with_clib_libdir" -a "$with_clib_libdir" != "yes"],
[ompi_check_clib_libdir="$with_clib_libdir"])
OMPI_CHECK_PACKAGE([$1],
[clib/error.h],
[clib],
[clib_error_free_vector],
,
[$ompi_check_clib_dir],
[$ompi_check_clib_libdir],
[ompi_check_clib_happy="yes"],
[ompi_check_clib_happy="no"])
],
[ompi_check_clib_happy="no"])
AS_IF([test "$ompi_check_clib_happy" = "yes"],
[$2],
[AS_IF([test ! -z "$with_clib" -a "$with_clib" != "no"],
[AC_MSG_ERROR([CLIB (Eliots programming library) support requested but not found. Aborting])])
$3])
])
# OMPI_CHECK_CRSVM(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Check if crsvm (CRS Shared Virtual Memory) support can be found.
# Sets prefix_{CPPFLAGS, LDFLAGS, LIBS} as needed and runs action-if-found
# if there is support, otherwise executes action-if-not-found.
#
# CRS SVM depends on clib, so the probe is two-staged: clib is checked
# first (clib/error.h, clib_error_free_vector) and, only when that
# succeeds, svmdb is checked (svmdb.h, svmdb_map).  Both stages go
# through OMPI_CHECK_PACKAGE with the same prefix.  The global
# CPPFLAGS, LDFLAGS and LIBS are saved before the probes and restored
# afterwards.
#
# Result variables:
#   ompi_check_clib_happy   yes/no - clib stage outcome
#   ompi_check_crsvm_happy  yes/no - overall outcome
# Both are assigned on every path through the probes, so the error
# report below can tell a missing clib apart from a missing svmdb.
#
# If --with-crsvm was given with any value other than "no" and support
# is not found, configure aborts, naming the stage that failed.
# --with-crsvm=no skips the probes entirely.
AC_DEFUN([OMPI_CHECK_CRSVM],[
    AC_ARG_WITH([clib],
                [AC_HELP_STRING([--with-clib(=DIR)],
                                [Build CLIB (Eliots programming library) support, searching for libraries in DIR])])
    AC_ARG_WITH([clib-libdir],
                [AC_HELP_STRING([--with-clib-libdir=DIR],
                                [Search for CLIB (Eliots programming library) libraries in DIR])])
    AC_ARG_WITH([crsvm],
                [AC_HELP_STRING([--with-crsvm(=DIR)],
                                [Build CRS SVM (Shared Virtual Memory) support, searching for libraries in DIR])])
    AC_ARG_WITH([crsvm-libdir],
                [AC_HELP_STRING([--with-crsvm-libdir=DIR],
                                [Search for CRS SVM (Shared Virtual Memory) libraries in DIR])])

    AS_IF([test "$with_crsvm" != "no"],
          [AS_IF([test ! -z "$with_clib" -a "$with_clib" != "yes"],
                 [ompi_check_clib_dir="$with_clib"])
           AS_IF([test ! -z "$with_clib_libdir" -a "$with_clib_libdir" != "yes"],
                 [ompi_check_clib_libdir="$with_clib_libdir"])
           AS_IF([test ! -z "$with_crsvm" -a "$with_crsvm" != "yes"],
                 [ompi_check_crsvm_dir="$with_crsvm"])
           AS_IF([test ! -z "$with_crsvm_libdir" -a "$with_crsvm_libdir" != "yes"],
                 [ompi_check_crsvm_libdir="$with_crsvm_libdir"])

           ompi_check_crsvm_$1_save_CPPFLAGS="$CPPFLAGS"
           ompi_check_crsvm_$1_save_LDFLAGS="$LDFLAGS"
           ompi_check_crsvm_$1_save_LIBS="$LIBS"

           OMPI_CHECK_PACKAGE([$1],
                              [clib/error.h],
                              [clib],
                              [clib_error_free_vector],
                              [-lpthread],
                              [$ompi_check_clib_dir],
                              [$ompi_check_clib_libdir],
                              [OMPI_CHECK_PACKAGE([$1],
                                                  [svmdb.h],
                                                  [svmdb],
                                                  [svmdb_map],
                                                  [-lsvm -lclib -lpthread -lrt],
                                                  [$ompi_check_crsvm_dir],
                                                  [$ompi_check_crsvm_libdir],
                                                  [ompi_check_crsvm_happy="yes"
                                                   ompi_check_clib_happy="yes"],
                                                  [ompi_check_crsvm_happy="no"
                                                   ompi_check_clib_happy="yes"])],
                              [ompi_check_crsvm_happy="no"
                               ompi_check_clib_happy="no"])

           CPPFLAGS="$ompi_check_crsvm_$1_save_CPPFLAGS"
           LDFLAGS="$ompi_check_crsvm_$1_save_LDFLAGS"
           LIBS="$ompi_check_crsvm_$1_save_LIBS"],
          [ompi_check_crsvm_happy="no"])

    AS_IF([test "$ompi_check_crsvm_happy" = "yes"],
          [$2],
          [AS_IF([test ! -z "$with_crsvm" -a "$with_crsvm" != "no"],
                 [AS_IF([test "$ompi_check_clib_happy" = "yes"],
                        [AC_MSG_ERROR([CRS SVM (Shared Virtual Memory) support requested but not found. Aborting])],
                        [AC_MSG_ERROR([CLIB (Eliots programming library) support required but not found. Aborting])])])
           $3])
])
# MCA_sensor_crsvm_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Configure entry point for the crsvm sensor component.
# Runs OMPI_CHECK_CRSVM with the prefix sensor_crsvm.  On success the
# discovered sensor_crsvm_LDFLAGS and sensor_crsvm_LIBS are copied into
# the component's WRAPPER_EXTRA variables and action-if-found is run;
# otherwise action-if-not-found is run.  The sensor_crsvm_* build
# variables are AC_SUBSTed in either case.
AC_DEFUN([MCA_sensor_crsvm_CONFIG], [
OMPI_CHECK_CRSVM([sensor_crsvm],
[sensor_crsvm_happy="yes"],
[sensor_crsvm_happy="no"])
AS_IF([test "$sensor_crsvm_happy" = "yes"],
[sensor_crsvm_WRAPPER_EXTRA_LDFLAGS="$sensor_crsvm_LDFLAGS"
sensor_crsvm_WRAPPER_EXTRA_LIBS="$sensor_crsvm_LIBS"
$1],
[$2])
# substitute in the things needed to build crsvm
AC_SUBST([sensor_crsvm_CFLAGS])
AC_SUBST([sensor_crsvm_CPPFLAGS])
AC_SUBST([sensor_crsvm_LDFLAGS])
AC_SUBST([sensor_crsvm_LIBS])
])dnl

14
orte/mca/sensor/crsvm/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,14 @@
# -*- shell-script -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module:
# files that configure generates for this component directory
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -0,0 +1,52 @@
# -*- text -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
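# This is the US/English help file for the crsvm sensor component.
# NOTE: the topics below still carry the orte-rmaps-resilient names and
# text of the resilient mapper's help file.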
#
[orte-rmaps-resilient:alloc-error]
There are not enough slots available in the system to satisfy the %d slots
that were requested by the application:
%s
Either request fewer slots for your application, or make more slots available
for use.
[orte-rmaps-resilient:multi-apps-and-zero-np]
RMAPS found multiple applications to be launched, with
at least one that failed to specify the number of processes to execute.
When specifying multiple applications, you must specify how many processes
of each to launch via the -np argument.
[orte-rmaps-resilient:per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a per-node basis - only %d nodes were available.
Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-resilient:n-per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a %d per-node basis - only %d nodes with a total of %d slots were available.
Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-resilient:n-per-node-and-not-enough-slots]
There are not enough slots on the nodes in your allocation to satisfy your request to launch on a %d process-per-node basis - only %d slots/node were available.
Either request fewer processes/node, or obtain a larger allocation.
[orte-rmaps-resilient:no-np-and-user-map]
You have specified a rank-to-node/slot mapping, but failed to provide
the number of processes to be executed. For some reason, this information
could not be obtained from the mapping you provided, so we cannot continue
with executing the specified application.
#
[orte-rmaps-resilient:file-not-found]
The specified file that describes the fault groups for this system:
FILE: %s
was not found. Please verify the file name and location.

211
orte/mca/sensor/crsvm/sensor_crsvm.c Обычный файл
Просмотреть файл

@ -0,0 +1,211 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include "opal_stdint.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/pstat/pstat.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/fddp/fddp.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "sensor_crsvm.h"
#include <clib/clib.h>
#include <clib/vec.h>
#include <clib/hash.h>
#include <clib/bitmap.h>
#include <clib/fifo.h>
#include <clib/time.h>
#include <clib/mheap.h>
#include <clib/heap.h>
#include <clib/pool.h>
#include <clib/format.h>
#include <svmdb.h>
/* declare the API functions */
static int init(void);
static void finalize(void);
static void start(void);
static void stop(void);
/* instantiate the module: the four entry points, in the order the
 * sensor base module structure lists them
 */
orte_sensor_base_module_t orte_sensor_crsvm_module = {
init,
finalize,
start,
stop
};
/* declare the local functions */
/* timer callback that takes one sample and re-arms the timer */
static void sample(int fd, short event, void *arg);
/* local globals */
/* procs queued for termination during a sampling pass */
static opal_pointer_array_t killarray;
/* true while periodic sampling is active; cleared by stop() and on
 * a sampling error
 */
static bool sampling = false;
/* handle returned by svmdb_map(); NULL when nothing is mapped */
static svmdb_client_t *svmm = NULL;
static int init(void)
{
/* setup in case we have to kill someone */
OBJ_CONSTRUCT(&killarray, opal_pointer_array_t);
opal_pointer_array_init(&killarray, 16, INT_MAX, 16);
/* Map the shared Memory */
svmm = svmdb_map(0);
start(); /* TIM: temporary hack to self-start the sensor module. */
return ORTE_SUCCESS;
}
/*
 * Shut the module down: stop sampling, drop the shared memory
 * mapping if one exists, and destruct the kill array.
 */
static void finalize(void)
{
    /* TIM: temporary hack to self-stop the sensor module. */
    stop();

    /* release the shared memory mapping, if we hold one */
    if (svmm != NULL) {
        svmdb_unmap(svmm);
        svmm = NULL;
    }

    OBJ_DESTRUCT(&killarray);
}
/*
 * Start monitoring of local processes.
 *
 * Does nothing when sampling is already active or when the configured
 * sample rate is not positive.  Otherwise marks sampling as active and
 * arms a timer that will invoke sample().
 */
static void start(void)
{
    if (sampling) {
        /* already running */
        return;
    }
    if (mca_sensor_crsvm_component.sample_rate <= 0) {
        /* sampling is disabled */
        return;
    }

    /* wake us up periodically for a data sample */
    sampling = true;
    ORTE_TIMER_EVENT(mca_sensor_crsvm_component.sample_rate, 0, sample);
}
/*
 * Stop monitoring.  Only the flag is cleared here; sample() checks it
 * on entry and returns without re-arming its timer.
 */
static void stop(void)
{
    sampling = false;
}
/*
 * Release every proc object currently queued in killarray and clear
 * its slot, leaving the array empty for the next sampling pass.
 */
static void release_kill_list(void)
{
    orte_proc_t *victim;
    int idx;

    for (idx = 0; idx < killarray.size; idx++) {
        victim = opal_pointer_array_get_item(&killarray, idx);
        if (NULL != victim) {
            OBJ_RELEASE(victim);
            opal_pointer_array_set_item(&killarray, idx, NULL);
        }
    }
}

/*
 * Timer callback: take one sample and re-arm the timer.
 *
 * Reads the "tempInlet0" string variable from the shared memory
 * database, converts it to a temperature and compares it against the
 * component's celsius_limit.  It then walks the local children,
 * querying process stats for each one; when the temperature is over
 * the limit every child is queued in killarray and the queued procs
 * are terminated through the odls.
 *
 * If a process-stat query fails, the error is logged, sampling is
 * switched off, anything already queued in killarray is released, and
 * the timer is not re-armed.
 *
 * The fd, event and arg parameters are not used.
 */
static void sample(int fd, short event, void *arg)
{
    opal_list_item_t *item;
    orte_odls_child_t *child;
    opal_pstats_t stats;
    orte_proc_t *proc;
    bool over_limit;
    bool killreqd = false;
    int rc;
    double celsius = -100.0; /* clearly a bogus value */

    /* if we are not sampling any more, then just return */
    if (!sampling) {
        return;
    }

    OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                         "sample:crsvm sampling sensors"));

    /* for each sensor - only query the database when it is mapped;
     * otherwise the bogus default above is kept
     */
    if (NULL != svmm) {
        char *ascii = svmdb_local_get_string_variable(svmm, "tempInlet0");
        if (NULL != ascii) {
            celsius = atof(ascii);
            vec_free(ascii);
        }
    }

    /* the reading does not change while we walk the children, so
     * decide once whether we are over the limit
     */
    over_limit = (celsius > mca_sensor_crsvm_component.celsius_limit);

    if (over_limit) {
        OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                             "sample:crsvm got temperature of %3.2f celsius, over-limit",
                             celsius));
        /* we should notify the CM to not schedule new jobs to this node */
    } else {
        OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                             "sample:crsvm got temperature of %3.2f celsius",
                             celsius));
        /* we might want to notify the CM that this node is healthy? */
    }

    /* loop through our local children (because we can...) */
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;

        /* get the process resource utilization stats (because we can...) */
        if (ORTE_SUCCESS != (rc = opal_pstat.query(child->pid, &stats))) {
            ORTE_ERROR_LOG(rc);
            /* no point in continuing sampling - drop whatever was
             * queued so far so the proc objects are not leaked
             */
            sampling = false;
            release_kill_list();
            return;
        }

        OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                             "sample:crsvm got cpu time of %lu seconds for proc %s",
                             (unsigned long)stats.time, ORTE_NAME_PRINT(child->name)));

        /* check the temperature limit */
        if (over_limit) {
            /* temperature limit exceeded - schedule proc to be killed */
            proc = OBJ_NEW(orte_proc_t);
            proc->name.jobid = child->name->jobid;
            proc->name.vpid = child->name->vpid;
            opal_pointer_array_add(&killarray, proc);
            killreqd = true;
        }
    }

    if (killreqd) {
        /* order the local termination of the specified procs,
         * and have the HNP alerted to their death
         */
        OPAL_OUTPUT_VERBOSE((0, orte_sensor_base_output,
                             "sample:crsvm killing procs"));
        orte_odls.kill_local_procs(&killarray, true);

        /* clean out the array for re-use */
        release_kill_list();
    }

    /* restart the timer */
    ORTE_TIMER_EVENT(mca_sensor_crsvm_component.sample_rate, 0, sample);
}

37
orte/mca/sensor/crsvm/sensor_crsvm.h Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
 * @file
 *
 * CRSVM sensor component
 *
 * Declarations for the sensor component that pulls data through the
 * external CRS shared virtual memory (svmdb) interface.
 */
#ifndef ORTE_SENSOR_CRS_H
#define ORTE_SENSOR_CRS_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/**
 * Component structure: the base sensor component plus the values of
 * this component's MCA parameters.
 */
struct orte_sensor_crsvm_component_t {
/** base sensor component data */
orte_sensor_base_component_t super;
/** "sample_rate" MCA parameter: sample rate in seconds (default 10) */
int sample_rate;
/** "celsius_limit" MCA parameter: max temperature in celsius (default 50) */
int celsius_limit;
};
typedef struct orte_sensor_crsvm_component_t orte_sensor_crsvm_component_t;
/** the single component instance */
ORTE_MODULE_DECLSPEC extern orte_sensor_crsvm_component_t mca_sensor_crsvm_component;
/** the module returned by the component's query function */
extern orte_sensor_base_module_t orte_sensor_crsvm_module;
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,95 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_crsvm.h"
/*
 * Local functions
 */
static int orte_sensor_crsvm_open(void);
static int orte_sensor_crsvm_close(void);
static int orte_sensor_crsvm_query(mca_base_module_t **module, int *priority);
/*
 * The component instance.  Only the base component and its metadata
 * are initialized here; sample_rate and celsius_limit are filled in
 * by orte_sensor_crsvm_open() when the MCA parameters are registered.
 */
orte_sensor_crsvm_component_t mca_sensor_crsvm_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"crsvm", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_crsvm_open, /* component open */
orte_sensor_crsvm_close, /* component close */
orte_sensor_crsvm_query /* component query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
}
};
/**
* component open/close/init function
*/
static int orte_sensor_crsvm_open(void)
{
mca_base_component_t *c = &mca_sensor_crsvm_component.super.base_version;
/* lookup parameters */
mca_base_param_reg_int(c, "sample_rate",
"Sample rate in seconds (default=10)",
false, false, 10, &mca_sensor_crsvm_component.sample_rate);
mca_base_param_reg_int(c, "celsius_limit",
"Max temperature in celsius (default=50)",
false, false, 50, &mca_sensor_crsvm_component.celsius_limit);
return ORTE_SUCCESS;
}
/**
 * Component query: hand back the crsvm module with priority 0.
 *
 * @param module   [out] set to the address of orte_sensor_crsvm_module
 * @param priority [out] set to 0
 * @return ORTE_SUCCESS always
 */
static int orte_sensor_crsvm_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *) &orte_sensor_crsvm_module;
    /* select only if specified */
    *priority = 0;

    return ORTE_SUCCESS;
}
/**
 * Component close: performs no work and always returns ORTE_SUCCESS.
 */
static int orte_sensor_crsvm_close(void)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -63,15 +63,21 @@ orte_sensor_pru_component_t mca_sensor_pru_component = {
static int orte_sensor_pru_open(void)
{
mca_base_component_t *c = &mca_sensor_pru_component.super.base_version;
int tmp;
/* lookup parameters */
mca_base_param_reg_int(c, "sample_rate",
"Sample rate in seconds (default=10)",
false, false, 10, &mca_sensor_pru_component.sample_rate);
"Sample rate in seconds (default=10)",
false, false, 10, &mca_sensor_pru_component.sample_rate);
mca_base_param_reg_int(c, "memory_limit",
"Max virtual memory size in GBytes (default=10)",
false, false, 10, &mca_sensor_pru_component.sample_rate);
false, false, 10, &tmp);
if (tmp < 0) {
opal_output(0, "Illegal value %d - must be > 0", tmp);
return ORTE_ERR_FATAL;
}
mca_sensor_pru_component.memory_limit = tmp;
return ORTE_SUCCESS;
}