1
1

Create a new state-of-health (SOH) monitoring framework and move the bproc monitoring code into it. Add an .ompi_ignore file to prevent it from being compiled for now. Much more definition work is needed here - a design document will follow when I can get to it.

This commit was SVN r3821.
Этот коммит содержится в:
Ralph Castain 2004-12-15 18:36:21 +00:00
родитель 43b565f110
Коммит 8d94778f1e
13 изменённых файлов: 673 добавлений и 94 удалений

31
src/mca/soh/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,31 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Build rules for the soh (state-of-health) framework directory.
include $(top_srcdir)/config/Makefile.options
# Always descend into base; in addition build the directories listed in
# MCA_soh_STATIC_SUBDIRS and distribute those listed in MCA_soh_ALL_SUBDIRS.
SUBDIRS = base $(MCA_soh_STATIC_SUBDIRS)
DIST_SUBDIRS = base $(MCA_soh_ALL_SUBDIRS)
# Source code files
headers = soh.h
# Conditionally install the header files: ompi_HEADERS is only defined when
# WANT_INSTALL_HEADERS is set, so nothing is installed otherwise.
if WANT_INSTALL_HEADERS
ompidir = $(includedir)/openmpi/mca/soh
ompi_HEADERS = $(headers)
else
ompidir = $(includedir)
endif

45
src/mca/soh/base/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,45 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
include $(top_srcdir)/config/Makefile.options

# Convenience library holding the soh framework's base (open/select/close)
# functions.  The library, source and install-directory names below use the
# "soh" prefix of the files that live in this directory.
noinst_LTLIBRARIES = libmca_soh_base.la

# For VPATH builds, have to specify where static-components.h will be found
AM_CPPFLAGS = -I$(top_builddir)/src

# Source code files
headers = \
	base.h

# Library
# NOTE(review): the ns framework's local-functions file was listed here by
# mistake and has been dropped; add a soh equivalent to this list once it
# exists (soh_base_open.c references
# mca_soh_base_update_cell_soh_not_available, which needs a home).
libmca_soh_base_la_SOURCES = \
	$(headers) \
	soh_base_close.c \
	soh_base_select.c \
	soh_base_open.c

# Conditionally install the header files
if WANT_INSTALL_HEADERS
ompidir = $(includedir)/openmpi/mca/soh/base
ompi_HEADERS = $(headers)
else
ompidir = $(includedir)
endif

61
src/mca/soh/base/base.h Обычный файл
Просмотреть файл

@ -0,0 +1,61 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SOH_BASE_H
#define MCA_SOH_BASE_H
/*
* includes
*/
#include "ompi_config.h"
#include "class/ompi_list.h"
#include "mca/mca.h"
#include "mca/soh/soh.h"
/*
* Global functions for MCA overall collective open and close
*/
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* Open the available soh components and set up the framework's output
* stream. Returns OMPI_SUCCESS or OMPI_ERROR. */
OMPI_DECLSPEC int mca_soh_base_open(void);
/* Initialize every available component and keep the one reporting the
* highest priority. The two flags are written with the values reported by
* the selected component. Returns OMPI_ERROR when no component volunteers. */
OMPI_DECLSPEC int mca_soh_base_select(bool *allow_multi_user_threads,
bool *have_hidden_threads);
/* Finalize the selected component (if any) and close all components that
* are still open. */
OMPI_DECLSPEC int mca_soh_base_close(void);
/*
* globals that might be needed
*/
/* output stream id used for the framework's debug messages */
OMPI_DECLSPEC extern int mca_soh_base_output;
OMPI_DECLSPEC extern mca_soh_base_module_t ompi_soh_monitor; /* holds selected module's function pointers */
/* true once mca_soh_base_select() has chosen a component */
OMPI_DECLSPEC extern bool mca_soh_base_selected;
/* list of components opened by mca_soh_base_open() */
OMPI_DECLSPEC extern ompi_list_t mca_soh_base_components_available;
/* copy of the component chosen by mca_soh_base_select() */
OMPI_DECLSPEC extern mca_soh_base_component_t mca_soh_base_selected_component;
/*
* external API functions will be documented in the mca/soh/soh.h file
*/
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

40
src/mca/soh/base/soh_base_close.c Обычный файл
Просмотреть файл

@ -0,0 +1,40 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "include/constants.h"
#include "mca/mca.h"
#include "mca/base/base.h"
#include "mca/soh/base/base.h"
/**
 * Shut down the soh framework.
 *
 * Finalizes the selected component, if there is one, and then closes every
 * component that is still on the available list.
 *
 * @return OMPI_SUCCESS always.
 */
int mca_soh_base_close(void)
{
    /* If we have a selected component and module, then finalize it.  The
       flag is cleared afterwards so that a repeated call to this function
       does not finalize the same component a second time. */
    if (mca_soh_base_selected) {
        if (NULL != mca_soh_base_selected_component.soh_finalize) {
            mca_soh_base_selected_component.soh_finalize();
        }
        mca_soh_base_selected = false;
    }

    /* Close all remaining available components (may be one if this is a
       OMPI RTE program, or [possibly] multiple if this is ompi_info) */
    mca_base_components_close(mca_soh_base_output,
                              &mca_soh_base_components_available, NULL);

    /* All done */
    return OMPI_SUCCESS;
}

78
src/mca/soh/base/soh_base_open.c Обычный файл
Просмотреть файл

@ -0,0 +1,78 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "include/constants.h"
#include "mca/mca.h"
#include "mca/base/base.h"
#include "mca/base/mca_base_param.h"
#include "util/output.h"
#include "util/proc_info.h"
#include "mca/oob/base/base.h"
#include "mca/soh/base/base.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "mca/soh/base/static-components.h"
/*
* globals
*/
/*
* Global variables
*/
/* Output stream id for debug messages; -1 until mca_soh_base_open() opens
   a stream. */
int mca_soh_base_output = -1;

/* Function table of the active soh module.  Until a component has been
   selected it points at the "not available" fallback.
   NOTE(review): mca_soh_base_update_cell_soh_not_available is not declared
   in any header visible from this file - confirm where it is defined. */
mca_soh_base_module_t ompi_soh_monitor = {
    mca_soh_base_update_cell_soh_not_available
};

/* Set to true by mca_soh_base_select() once a component has been chosen. */
bool mca_soh_base_selected = false;

/* Components opened by mca_soh_base_open(). */
ompi_list_t mca_soh_base_components_available;

/* Copy of the component chosen by mca_soh_base_select(). */
mca_soh_base_component_t mca_soh_base_selected_component;
/**
 * Function for finding and opening either all MCA components, or the one
 * that was specifically requested via a MCA parameter.
 *
 * After the components are open, the output subsystem is initialized and a
 * stream is opened for the framework's debug messages; its id is stored in
 * mca_soh_base_output.
 *
 * @return OMPI_SUCCESS on success; OMPI_ERROR if the components cannot be
 *         opened or the output subsystem cannot be initialized.
 */
int mca_soh_base_open(void)
{
    /* Open up all available components */
    if (OMPI_SUCCESS !=
        mca_base_components_open("soh", 0, mca_soh_base_static_components,
                                 &mca_soh_base_components_available)) {
        return OMPI_ERROR;
    }

    /* setup output for debug messages.  ompi_output_init must actually be
       called here: testing the bare function name only checks that its
       address is non-NULL, which is always true, so a failure could never
       be detected. */
    if (!ompi_output_init()) {  /* can't open output */
        return OMPI_ERROR;
    }
    mca_soh_base_output = ompi_output_open(NULL);

    /* All done */
    return OMPI_SUCCESS;
}

101
src/mca/soh/base/soh_base_select.c Обычный файл
Просмотреть файл

@ -0,0 +1,101 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mca/mca.h"
#include "mca/base/base.h"
#include "mca/soh/base/base.h"
/**
 * Choose one soh component out of all those that were opened.
 *
 * Every available component has its init function called.  A component
 * that returns a module is a candidate; the candidate reporting the
 * highest priority is kept and every other candidate is finalized.
 *
 * @param allow_multi_user_threads  written with the value reported by the
 *                                  best candidate seen so far
 * @param have_hidden_threads       written with the value reported by the
 *                                  best candidate seen so far
 * @return OMPI_SUCCESS when a component was selected, OMPI_ERROR when no
 *         component offered a module.
 */
int mca_soh_base_select(bool *allow_multi_user_threads,
                        bool *have_hidden_threads)
{
    ompi_list_item_t *cursor;
    mca_soh_base_component_t *winner = NULL;
    mca_soh_base_module_t *winner_module = NULL;
    int winner_priority = -1;

    /* Walk the list of available components */
    cursor = ompi_list_get_first(&mca_soh_base_components_available);
    while (cursor != ompi_list_get_end(&mca_soh_base_components_available)) {
        mca_base_component_list_item_t *entry =
            (mca_base_component_list_item_t *) cursor;
        mca_soh_base_component_t *candidate =
            (mca_soh_base_component_t *) entry->cli_component;
        mca_soh_base_module_t *offered;
        bool multi, hidden;
        int priority;

        /* Ask the component whether it wants to run */
        offered = candidate->soh_init(&multi, &hidden, &priority);

        if (NULL == offered) {
            /* The component declined - nothing to record or finalize */
        } else if (priority > winner_priority) {
            /* New leader: retire the previous one, if there was one */
            if (NULL != winner) {
                winner->soh_finalize();
            }
            winner = candidate;
            winner_module = offered;
            winner_priority = priority;
            *allow_multi_user_threads = multi;
            *have_hidden_threads = hidden;
        } else {
            /* Outranked by the current leader */
            candidate->soh_finalize();
        }

        cursor = ompi_list_get_next(cursor);
    }

    /* Nobody volunteered */
    if (NULL == winner) {
        return OMPI_ERROR;
    }

    /* Remember the selection for later use */
    ompi_soh_monitor = *winner_module;
    mca_soh_base_selected_component = *winner;
    mca_soh_base_selected = true;

    return OMPI_SUCCESS;
}

0
src/mca/soh/bproc/.ompi_ignore Обычный файл
Просмотреть файл

Просмотреть файл

230
src/mca/soh/bproc/svc_bproc_soh.c Обычный файл
Просмотреть файл

@ -0,0 +1,230 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <sys/poll.h>
#include <sys/bproc.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "include/constants.h"
#include "mca/oob/oob.h"
#include "mca/oob/base/base.h"
#include "mca/ns/base/base.h"
#include "runtime/runtime.h"
#include "svc_bproc_soh.h"
/* svc module function table for the BProc state-of-health service: the
* init and fini entry points implemented further down in this file. */
mca_svc_base_module_t mca_svc_bproc_soh_module = {
mca_svc_bproc_soh_module_init,
mca_svc_bproc_soh_module_fini
};
/*
 * Add a BProc node to the virtual machine SOH segment
 *
 * Builds a status record for the node: the cell id, the node number
 * rendered as a decimal string, and the node's cpu count as reported by the
 * BProc "cpus" attribute (1 when the attribute cannot be read).
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR when memory cannot be allocated.
 *
 * TODO: nothing in this file stores the record in the status segment yet,
 * so it is released again before returning instead of being leaked.
 * Replace the two free() calls with the real "publish" step once the
 * segment API exists.
 */
int
mca_svc_bproc_soh_add_node(mca_ns_base_cellid_t cellid, int node)
{
    ompi_rte_vm_status_t *vmdata;
    int cpus = 1;
    int err;

    vmdata = (ompi_rte_vm_status_t*)malloc(sizeof(ompi_rte_vm_status_t));
    if (NULL == vmdata) {
        return OMPI_ERROR;
    }

    vmdata->cell = cellid;
    if (asprintf(&(vmdata->nodename), "%d", node) < 0) {
        /* the contents of nodename are undefined after a failed asprintf */
        free(vmdata);
        return OMPI_ERROR;
    }

    /* Query the node that was passed in; fall back to a single cpu when
       the attribute is unavailable. */
    err = bproc_getnodeattr(node, "cpus", &cpus, sizeof(cpus));
    if (err != 0) {
        cpus = 1;
    }
    vmdata->cpus = (uint16_t)cpus;

    /* TODO: publish vmdata to the SOH segment here (see header comment) */
    free(vmdata->nodename);
    free(vmdata);

    return OMPI_SUCCESS;
}
/**
 * Report whether two snapshots of a BProc node differ.
 *
 * @param old  earlier snapshot
 * @param new  later snapshot
 * @return 0 when the snapshots carry different node numbers, or when
 *         status, user, group and mode all match; 1 when any of those
 *         four fields differs.
 */
int
mca_svc_bproc_soh_status_changed(struct bproc_node_info_t *old, struct bproc_node_info_t *new)
{
    int differs;

    /* Snapshots of different nodes are never reported as a change */
    if (old->node != new->node) {
        return 0;
    }

    differs = (0 != strcmp(old->status, new->status))
           || (old->user != new->user)
           || (old->group != new->group)
           || (old->mode != new->mode);

    return differs;
}
/**
 * Copy the user, group, mode and status of a BProc node into its virtual
 * machine status record.
 *
 * The record is looked up by cell id and by the node number rendered as a
 * decimal string.  If no record exists yet, the node is added through
 * mca_svc_bproc_soh_add_node() and nothing else is done.
 *
 * @param cellid  cell the node belongs to
 * @param ni      current BProc information for the node
 */
void
mca_svc_bproc_soh_update_node_info(mca_ns_base_cellid_t cellid, struct bproc_node_info_t *ni)
{
    char *node = NULL;
    ompi_rte_vm_status_t *vmdata;

    if (asprintf(&node, "%d", ni->node) < 0) {
        /* could not build the lookup key - nothing can be updated */
        return;
    }

    vmdata = ompi_rte_get_vm_status(cellid, node);
    if (vmdata == NULL) { /* this node isn't present yet - add it */
        mca_svc_bproc_soh_add_node(cellid, ni->node);
        free(node);
        return;
    }

    /* in long-term, we will store the soh data in key-value pairs
     * ("user", "group", "mode", "status", and possibly "#cpus"). for now,
     * we store it simply as values so we can get it working - I will update
     * this later to the final form.
     */
    vmdata->user = ni->user;
    vmdata->group = ni->group;
    vmdata->mode = ni->mode;
    free(vmdata->status);   /* free(NULL) is a no-op */
    vmdata->status = strdup(ni->status);

    free(node);

    /* NOTE(review): "ompit_" looks like a typo for the "ompi_" prefix used
       by the other vm_status_data helpers - confirm the intended name. */
    ompit_vm_status_data_finish(vmdata);
}
/*
* Compare a freshly read node set against the cached one, update the stored
* information of every node whose status changed (or of every node when the
* cache is empty), and then replace the cache with a copy of the new set.
*
* NOTE(review): this function does not compile as written:
*  - i and ni are used without being declared;
*  - old is a pointer to a pointer, so old->size presumably means
*    (*old)->size;
*  - status_changed() and update_node_info() are called without the
*    mca_svc_bproc_soh_ prefix used by the definitions in this file;
*  - mca_svc_bproc_soh_status_changed() takes pointers, but
*    (*old)->node[i] is passed by value;
*  - mca_svc_bproc_soh_update_node_info() takes (cellid, ni), not
*    (segment, cell, ni).
*/
void
mca_svc_bproc_soh_check_node_info(char *segment, char *cell,
struct bproc_node_set_t **old,
struct bproc_node_set_t *new)
{
/* we assume the number of nodes does not change */
for (i = 0; i < new->size; i++) {
ni = &new->node[i];
if (!old->size || status_changed((*old)->node[i], ni))
update_node_info(segment, cell, ni);
}
/* Release the previous snapshot, if any, then store a copy of the new one */
if ((*old)->size)
bproc_nodeset_free(*old);
bproc_nodeset_init(*old, new->size);
memcpy((*old)->node, new->node, sizeof(*new->node) * new->size);
}
#if OMPI_HAVE_POSIX_THREADS
/**
 * Body of the status thread.
 *
 * Blocks in poll() on the module's notification descriptor.  Each time the
 * descriptor becomes readable, the current node set is read and handed to
 * mca_svc_bproc_soh_check_node_info() together with the module's cached
 * node set.  The loop ends on a poll() or bproc_nodelist_() error.
 *
 * @param thread  thread object whose t_arg field holds the
 *                mca_svc_bproc_soh_module_t to work on
 * @return PTHREAD_CANCELED
 */
static void *
mca_svc_bproc_soh_status_thread(ompi_thread_t *thread)
{
    struct pollfd pfd;
    struct bproc_node_set_t ns = BPROC_EMPTY_NODESET;
    mca_svc_bproc_soh_module_t *module = (mca_svc_bproc_soh_module_t *)thread->t_arg;
    int res;

    /* This thread enters in a cancel enabled state.
       NOTE(review): asynchronous cancellation can interrupt the thread at
       any point, including inside the helpers called below - confirm this
       is intended. */
    pthread_setcancelstate( PTHREAD_CANCEL_ENABLE, NULL );
    pthread_setcanceltype( PTHREAD_CANCEL_ASYNCHRONOUS, NULL );

    for (;;) {
        pfd.fd = module->notify_fd;
        pfd.events = POLLIN;
        res = poll(&pfd, 1, -1);
        if (res < 0) {
            /* poll error */
            break;
        }
        if (bproc_nodelist_(&ns, module->notify_fd) < 0) {
            /* bproc_nodelist_ error */
            break;
        }
        /* the new node set is passed by address, matching the pointer
           parameter of mca_svc_bproc_soh_check_node_info() */
        mca_svc_bproc_soh_check_node_info(module->segment, module->cell, &module->node_info, &ns);
        bproc_nodeset_free(&ns);
    }
    return PTHREAD_CANCELED;
}
#endif /* OMPI_HAVE_POSIX_THREADS */
/**
* Register a callback to receive BProc update notifications
*
* Reads the current BProc node list, records the information of every
* node, opens a BProc notifier descriptor and starts the status thread.
*
* NOTE(review): this function is unfinished and does not compile as written:
*  - module is never assigned (placeholder comment), so every module->
*    access below uses an uninitialized pointer;
*  - the assignment to module->cell has no right-hand side;
*  - update_node_info() is called without the mca_svc_bproc_soh_ prefix
*    and without a cell id;
*  - the thread function defined in this file is named
*    mca_svc_bproc_soh_status_thread, not mca_bproc_status_thread;
*  - ompi_thread_start() is called even when ompi_using_thread() is false
*    and the thread fields were not filled in;
*  - node_list, node_num, segment and jobid_string are never used after
*    being declared or assigned.
*/
int mca_svc_bproc_soh_module_init(mca_svc_base_module_t* base)
{
int i;
int num_nodes;
bproc_node_set_t node_list;
int node_num;
char *segment, *jobid_string;
mca_svc_bproc_soh_module_t *module /* = something */;
jobid_string = ompi_name_server.get_jobid_string(ompi_rte_get_self());
asprintf(&module->segment, "%s-bproc", OMPI_RTE_VM_STATUS_SEGMENT);
module->cell = /* get cell somehow */;
/* take the initial snapshot of all nodes */
num_nodes = bproc_nodelist(&module->node_info);
if (num_nodes < 0)
return OMPI_ERROR;
for (i = 0; i < module->node_info->size; i++) {
update_node_info(&module->node_info[i]);
}
/* descriptor polled by the status thread */
module->notify_fd = bproc_notifier();
if (module->notify_fd < 0)
return OMPI_ERROR;
if (ompi_using_thread()) {
#if OMPI_HAVE_POSIX_THREADS
module->thread.t_handle = 0;
module->thread.t_run = (ompi_thread_fn_t)mca_bproc_status_thread;
module->thread.t_arg = (void *)module;
#endif /* OMPI_HAVE_POSIX_THREADS */
}
return ompi_thread_start(&module->thread);
}
/**
 * Cleanup
 *
 * Cancels the status thread, if one was started, and waits for it to exit.
 *
 * NOTE(review): module is never assigned (placeholder below), so the
 * thread handle is read through an uninitialized pointer until the lookup
 * of the module instance is filled in.
 *
 * @return OMPI_SUCCESS always.
 */
int mca_svc_bproc_soh_module_fini(mca_svc_base_module_t* base)
{
    mca_svc_bproc_soh_module_t *module /* = something */;

#if OMPI_HAVE_POSIX_THREADS
    if (module->thread.t_handle != 0) {
        void *thread_return;

        /* the handle lives in this module; no "ptl" object exists here */
        pthread_cancel(module->thread.t_handle);
        ompi_thread_join(&(module->thread), &thread_return);
    }
#endif  /* OMPI_HAVE_POSIX_THREADS */

    return OMPI_SUCCESS;
}

87
src/mca/soh/soh.h Обычный файл
Просмотреть файл

@ -0,0 +1,87 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* The Open MPI State-of-Health Monitoring Subsystem
*
*/
#ifndef MCA_SOH_H
#define MCA_SOH_H
/*
* includes
*/
#include "ompi_config.h"
#include "mca/mca.h"
#include "mca/ns/base/base.h"
/*
* Module functions - all MUST be provided!
*/
/* Update the state-of-health of a cell. Takes the id of the cell and
* returns an int status code.
*/
typedef int (*mca_soh_base_module_update_cell_soh_fn_t)(mca_ns_base_cellid_t cellid);
/*
* Ver 1.0.0
*
* Function table returned by a component's init function.
*/
struct mca_soh_base_module_1_0_0_t {
mca_soh_base_module_update_cell_soh_fn_t update_cell_soh;
};
typedef struct mca_soh_base_module_1_0_0_t mca_soh_base_module_1_0_0_t;
typedef mca_soh_base_module_1_0_0_t mca_soh_base_module_t;
/*
* SOH Component
*/
/* Component init function. Returns the component's module, or NULL when
* the component does not want to be selected. On a non-NULL return the
* three output parameters carry the component's threading flags and its
* selection priority.
*/
typedef mca_soh_base_module_t* (*mca_soh_base_component_init_fn_t)(
bool *allow_multi_user_threads,
bool *have_hidden_threads,
int *priority);
/* Component finalize function. */
typedef int (*mca_soh_base_component_finalize_fn_t)(void);
/*
* the standard component data structure
*/
struct mca_soh_base_component_1_0_0_t {
mca_base_component_t soh_version;
mca_base_component_data_1_0_0_t soh_data;
mca_soh_base_component_init_fn_t soh_init;
mca_soh_base_component_finalize_fn_t soh_finalize;
};
typedef struct mca_soh_base_component_1_0_0_t mca_soh_base_component_1_0_0_t;
typedef mca_soh_base_component_1_0_0_t mca_soh_base_component_t;
/*
* Macro for use in components that are of type soh v1.0.0
*/
#define MCA_SOH_BASE_VERSION_1_0_0 \
/* soh v1.0 is chained to MCA v1.0 */ \
MCA_BASE_VERSION_1_0_0, \
/* soh v1.0 */ \
"soh", 1, 0, 0
#endif

Просмотреть файл

@ -1,94 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <sys/bproc.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "include/constants.h"
#include "mca/oob/oob.h"
#include "mca/oob/base/base.h"
#include "svc_bproc_soh.h"
/* svc module function table: the init and fini entry points implemented
* further down in this file. */
mca_svc_base_module_t mca_svc_bproc_soh_module = {
mca_svc_bproc_soh_module_init,
mca_svc_bproc_soh_module_fini
};
/**
* Process a BProc update notice
*
* Placeholder: the body is empty, so a notice is currently ignored.
*/
static void mca_svc_bproc_soh_cbfunc()
{
}
/**
 * Register a callback to receive BProc update notifications
 *
 * Skeleton only: the job id string, the "<segment>-bproc" segment name and
 * the current BProc node number are obtained but not used yet, and no
 * callback is registered.
 *
 * @return OMPI_SUCCESS (the registration flag is hard-wired to true).
 */
int mca_svc_bproc_soh_module_init(mca_svc_base_module_t* module)
{
    char *segment, *jobid_string;
    bproc_node_info_t node_info;
    int node_num;
    bool registration_successful;

    jobid_string = ompi_name_server.get_jobid_string(ompi_rte_get_self());
    asprintf(&segment, "%s-bproc", OMPI_RTE_VM_STATUS_SEGMENT);

    /* Greg/Nathan - a registry segment holding info from each node of the
     * BProc cluster has to be initialized. According to the BProc
     * documentation each process should call the function below and add
     * that info to our segment. Please correct this if it is wrong...
     */
    node_num = bproc_currnode();

    /* Greg/Nathan - add the code here that makes BProc call back whenever
     * there is a change or info of interest. The callback function is
     * named "mca_svc_bproc_soh_cbfunc".
     */

    /* set strictly to allow compilation - should be removed by
       Greg/Nathan once a real registration result exists */
    registration_successful = true;

    return registration_successful ? OMPI_SUCCESS : OMPI_ERROR;
}
/**
* Cleanup
*
* Placeholder: nothing is released yet; always returns OMPI_SUCCESS.
*/
int mca_svc_bproc_soh_module_fini(mca_svc_base_module_t* module)
{
/* Greg/Nathan - all you need to do here is de-register the
* callback from BProc.
*/
return OMPI_SUCCESS;
}