1
1

Add a new framework to ORTE for saving and recovering state information. Two components are included that use the db or dbm library for storing the data, with a distributed hash table component coming later.

Note that each of these components will only be selected if specifically requested; otherwise, a "NULL" component will be used. The framework is only opened by the HNP and orteds, though neither is currently coded to save or restore state.

This commit was SVN r22839.
Этот коммит содержится в:
Ralph Castain 2010-03-16 20:59:48 +00:00
родитель 7b3ac4fb73
Коммит ffd5be6aa1
22 изменённых файлов: 1287 добавлений и 2 удалений

Просмотреть файл

@ -525,7 +525,7 @@ AC_CHECK_HEADERS([alloca.h aio.h arpa/inet.h dirent.h \
sys/types.h sys/uio.h net/uio.h sys/utsname.h sys/vfs.h sys/wait.h syslog.h \ sys/types.h sys/uio.h net/uio.h sys/utsname.h sys/vfs.h sys/wait.h syslog.h \
time.h termios.h ulimit.h unistd.h util.h utmp.h malloc.h \ time.h termios.h ulimit.h unistd.h util.h utmp.h malloc.h \
ifaddrs.h sys/sysctl.h crt_externs.h regex.h \ ifaddrs.h sys/sysctl.h crt_externs.h regex.h \
ioLib.h sockLib.h hostLib.h shlwapi.h sys/synch.h]) ioLib.h sockLib.h hostLib.h shlwapi.h sys/synch.h limits.h db.h ndbm.h])
# Needed to work around Darwin requiring sys/socket.h for # Needed to work around Darwin requiring sys/socket.h for
# net/if.h # net/if.h
@ -656,7 +656,7 @@ OMPI_CHECK_FUNC_LIB([dirname], [gen])
# Darwin doesn't need -lm, as it's a symlink to libSystem.dylib # Darwin doesn't need -lm, as it's a symlink to libSystem.dylib
OMPI_CHECK_FUNC_LIB([ceil], [m]) OMPI_CHECK_FUNC_LIB([ceil], [m])
AC_CHECK_FUNCS([asprintf snprintf vasprintf vsnprintf openpty isatty getpwuid fork waitpid execve pipe ptsname setsid mmap tcgetpgrp posix_memalign strsignal sysconf syslog regcmp regexec regfree _NSGetEnviron socketpair strncpy_s _strdup usleep mkfifo]) AC_CHECK_FUNCS([asprintf snprintf vasprintf vsnprintf openpty isatty getpwuid fork waitpid execve pipe ptsname setsid mmap tcgetpgrp posix_memalign strsignal sysconf syslog regcmp regexec regfree _NSGetEnviron socketpair strncpy_s _strdup usleep mkfifo dbopen dbm_open])
# On some hosts, htonl is a define, so the AC_CHECK_FUNC will get # On some hosts, htonl is a define, so the AC_CHECK_FUNC will get
# confused. On others, it's in the standard library, but stubbed with # confused. On others, it's in the standard library, but stubbed with

Просмотреть файл

@ -59,6 +59,7 @@
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/mca/notifier/base/base.h" #include "orte/mca/notifier/base/base.h"
#include "orte/mca/rmcast/base/base.h" #include "orte/mca/rmcast/base/base.h"
#include "orte/mca/state/base/base.h"
#include "orte/runtime/orte_cr.h" #include "orte/runtime/orte_cr.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
@ -388,6 +389,18 @@ int orte_ess_base_orted_setup(char **hosts)
goto error; goto error;
} }
/* setup the state framework */
if (ORTE_SUCCESS != (ret = orte_state_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_state_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_state_select";
goto error;
}
return ORTE_SUCCESS; return ORTE_SUCCESS;
error: error:
@ -415,6 +428,7 @@ int orte_ess_base_orted_finalize(void)
orte_grpcomm.onesided_barrier(); orte_grpcomm.onesided_barrier();
} }
orte_state_base_close();
orte_notifier_base_close(); orte_notifier_base_close();
orte_cr_finalize(); orte_cr_finalize();

Просмотреть файл

@ -54,6 +54,7 @@
#include "orte/mca/odls/base/base.h" #include "orte/mca/odls/base/base.h"
#include "orte/mca/notifier/base/base.h" #include "orte/mca/notifier/base/base.h"
#include "orte/mca/rmcast/base/base.h" #include "orte/mca/rmcast/base/base.h"
#include "orte/mca/state/base/base.h"
#include "orte/mca/rmaps/base/base.h" #include "orte/mca/rmaps/base/base.h"
#if OPAL_ENABLE_FT_CR == 1 #if OPAL_ENABLE_FT_CR == 1
@ -547,6 +548,18 @@ static int rte_init(void)
goto error; goto error;
} }
/* setup the state framework */
if (ORTE_SUCCESS != (ret = orte_state_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_state_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_state_select";
goto error;
}
/* if a tool has launched us and is requesting event reports, /* if a tool has launched us and is requesting event reports,
* then set its contact info into the comm system * then set its contact info into the comm system
*/ */
@ -607,6 +620,7 @@ static int rte_finalize(void)
unlink(contact_path); unlink(contact_path);
free(contact_path); free(contact_path);
orte_state_base_close();
orte_notifier_base_close(); orte_notifier_base_close();
orte_cr_finalize(); orte_cr_finalize();

35
orte/mca/state/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Automake fragment for the "state" framework itself (not its components).
AM_CPPFLAGS = $(LTDLINCL)
# main library setup: a convenience (non-installed) libtool library whose
# source list starts empty and is appended to below and by base/Makefile.am
noinst_LTLIBRARIES = libmca_state.la
libmca_state_la_SOURCES =
# header setup
nobase_orte_HEADERS =
dist_pkgdata_DATA =
# local files
headers = state.h
libmca_state_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
nobase_orte_HEADERS += $(headers)
ortedir = $(includedir)/openmpi/orte/mca/state
else
ortedir = $(includedir)
endif
# pull in the base sources and headers (appends to the variables above)
include base/Makefile.am
# static-components.h is a generated file (see the comment in
# base/state_base_open.c), so remove it on distclean
distclean-local:
rm -f base/static-components.h

16
orte/mca/state/base/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,16 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This fragment is included by ../Makefile.am, so paths are relative to the
# framework directory and the variables appended to are defined there.
headers += \
base/base.h
# framework-level open/close/select implementation
libmca_state_la_SOURCES += \
base/state_base_open.c \
base/state_base_close.c \
base/state_base_select.c

54
orte/mca/state/base/base.h Обычный файл
Просмотреть файл

@ -0,0 +1,54 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_STATE_BASE_H
#define MCA_STATE_BASE_H
#include "orte_config.h"
#include "orte/types.h"
#include "opal/mca/mca.h"
#include "orte/mca/state/state.h"
BEGIN_C_DECLS
/*
 * Framework-level entry points for opening, selecting and closing the
 * state framework.
 */
/**
 * Open the state framework.
 *
 * Opens the framework's output stream, constructs the list of available
 * components and opens every available "state" component.
 *
 * @return ORTE_SUCCESS on success, ORTE_ERROR if the components could
 *         not be opened.
 */
ORTE_DECLSPEC int orte_state_base_open(void);
/**
 * Select a state module.
 *
 * Asks the MCA base to pick the best available component. If one is
 * picked, its module is initialized and installed in the global
 * orte_state structure. If none is picked, the default "NULL" module
 * is left in place and ORTE_SUCCESS is returned.
 */
ORTE_DECLSPEC int orte_state_base_select(void);
/**
 * Close the state framework.
 *
 * Finalizes the active module, releases all components remaining on the
 * available list, destructs that list and closes the output stream.
 *
 * @return ORTE_SUCCESS
 */
ORTE_DECLSPEC int orte_state_base_close(void);
/*
 * The verbose channel for debug output
 */
ORTE_DECLSPEC extern int orte_state_base_output;
/* List of state components opened by orte_state_base_open() */
ORTE_DECLSPEC extern opal_list_t orte_state_base_components_available;
END_C_DECLS
#endif

50
orte/mca/state/base/state_base_close.c Обычный файл
Просмотреть файл

@ -0,0 +1,50 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_component_repository.h"
#include "orte/mca/state/base/base.h"
extern opal_list_t orte_state_base_components_available;
/*
 * Shut down the state framework.
 *
 * The active module is finalized first, then every component still on
 * the "available" list is released back to the component repository.
 * Finally the list itself is destructed and the framework's output
 * stream is closed. Always returns ORTE_SUCCESS.
 */
int
orte_state_base_close(void)
{
    opal_list_item_t *entry;
    mca_base_component_list_item_t *wrapper;
    orte_state_base_component_t *comp;

    /* let the active module clean up after itself */
    if (NULL != orte_state.finalize) {
        orte_state.finalize();
    }

    /* drain the list, unloading each component as it comes off */
    for (;;) {
        entry = opal_list_remove_first(&orte_state_base_components_available);
        if (NULL == entry) {
            break;
        }
        wrapper = (mca_base_component_list_item_t *) entry;
        comp = (orte_state_base_component_t *) wrapper->cli_component;
        opal_output_verbose(10, 0,
                            "orte_state_base_close: module %s unloaded",
                            comp->base_version.mca_component_name);
        mca_base_component_repository_release((mca_base_component_t *) comp);
        OBJ_RELEASE(entry);
    }

    OBJ_DESTRUCT(&orte_state_base_components_available);
    opal_output_close(orte_state_base_output);

    return ORTE_SUCCESS;
}

85
orte/mca/state/base/state_base_open.c Обычный файл
Просмотреть файл

@ -0,0 +1,85 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/dss/dss_types.h"
#include "orte/mca/state/base/base.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* module's public mca_base_module_t struct.
*/
#include "orte/mca/state/base/static-components.h"
/* List of components opened by orte_state_base_open() */
opal_list_t orte_state_base_components_available;
/* provide "NULL" functions: these do nothing and report success, and are
 * what callers get through orte_state unless a component is selected
 */
static int init(void);
static int finalize(void);
static int save(void *object, opal_data_type_t type);
static int set_recover_source(orte_process_name_t *name);
static int recover(void *object, opal_data_type_t type);
/* Global module table; starts out pointing at the "NULL" functions and is
 * overwritten by orte_state_base_select() when a component wins selection.
 * Entry order must match struct orte_state_base_module_1_0_0_t.
 */
orte_state_base_module_t orte_state = {
init,
finalize,
save,
set_recover_source,
recover
};
/* Output stream id for the framework, set in orte_state_base_open() */
int orte_state_base_output;
/*
 * Open the state framework.
 *
 * Creates the framework's output stream, constructs the list that will
 * hold the available components, and asks the MCA base to open every
 * "state" component (static ones come from static-components.h).
 *
 * Returns ORTE_SUCCESS if the components were opened, ORTE_ERROR otherwise.
 */
int
orte_state_base_open(void)
{
    int rc;

    orte_state_base_output = opal_output_open(NULL);
    OBJ_CONSTRUCT(&orte_state_base_components_available, opal_list_t);

    /* Open up all available components */
    rc = mca_base_components_open("state", orte_state_base_output,
                                  mca_state_base_static_components,
                                  &orte_state_base_components_available,
                                  true);

    return (ORTE_SUCCESS == rc) ? ORTE_SUCCESS : ORTE_ERROR;
}
/*
 * "NULL" module: every entry point is a no-op that reports success, so
 * callers can invoke orte_state.* unconditionally even when no real
 * component has been selected.
 */

/* nothing to set up */
static int init(void)
{
    return ORTE_SUCCESS;
}

/* nothing to tear down */
static int finalize(void)
{
    return ORTE_SUCCESS;
}

/* state is silently discarded */
static int save(void *object, opal_data_type_t type)
{
    (void)object;
    (void)type;
    return ORTE_SUCCESS;
}

/* there is no store to point at */
static int set_recover_source(orte_process_name_t *name)
{
    (void)name;
    return ORTE_SUCCESS;
}

/* the object is left untouched */
static int recover(void *object, opal_data_type_t type)
{
    (void)object;
    (void)type;
    return ORTE_SUCCESS;
}

48
orte/mca/state/base/state_base_select.c Обычный файл
Просмотреть файл

@ -0,0 +1,48 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/class/opal_list.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_component_repository.h"
#include "orte/mca/state/base/base.h"
extern opal_list_t orte_state_base_components_available;
int
orte_state_base_select(void)
{
orte_state_base_component_t *best_component = NULL;
orte_state_base_module_t *best_module = NULL;
/*
* Select the best component
*/
if( OPAL_SUCCESS != mca_base_select("state", orte_state_base_output,
&orte_state_base_components_available,
(mca_base_module_t **) &best_module,
(mca_base_component_t **) &best_component) ) {
/* It is okay to not select a component - default
* to using the base NULL component
*/
return ORTE_SUCCESS;
}
/* Save and init the winner */
orte_state = *best_module;
if (NULL != orte_state.init) {
orte_state.init();
}
return ORTE_SUCCESS;
}

34
orte/mca/state/db/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,34 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Sources of the "db" state component
sources = \
state_db.h \
state_db_component.c \
state_db.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_state_db_DSO
component_noinst =
component_install = mca_state_db.la
else
component_noinst = libmca_state_db.la
component_install =
endif
# NOTE(review): no library is linked for dbopen(); presumably it is expected
# to come from libc - confirm on platforms where it lives in a separate library.
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_state_db_la_SOURCES = $(sources)
mca_state_db_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_state_db_la_SOURCES =$(sources)
libmca_state_db_la_LDFLAGS = -module -avoid-version

16
orte/mca/state/db/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,16 @@
dnl -*- shell-script -*-
dnl
dnl Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl
# MCA_state_db_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Decides whether the db component is built. Only the presence of the
# header is tested here.
# NOTE(review): availability of the dbopen function is not part of this
# decision - confirm that is intended.
AC_DEFUN([MCA_state_db_CONFIG], [
# only build if db.h is found
AC_CHECK_HEADERS([db.h], [$1], [$2], [AC_INCLUDES_DEFAULT])
])dnl

11
orte/mca/state/db/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,11 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
PARAM_CONFIG_FILES="Makefile"

248
orte/mca/state/db/state_db.c Обычный файл
Просмотреть файл

@ -0,0 +1,248 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <string.h>
#include <sys/types.h>
#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif
#include <stdio.h>
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <db.h>
#include "opal/dss/dss_types.h"
#include "opal/util/os_dirpath.h"
#include "opal/util/os_path.h"
#include "opal/util/output.h"
#include "opal/util/malloc.h"
#include "opal/util/basename.h"
#include "opal/mca/pstat/base/base.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/mca/sysinfo/base/base.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/runtime/orte_globals.h"
#include "state_db.h"
/* module entry points, defined below */
static int init(void);
static int finalize(void);
static int save(void *object, opal_data_type_t type);
static int set_recover_source(orte_process_name_t *name);
static int recover(void *object, opal_data_type_t type);
/* Function table handed back by the component's query function; entry
 * order must match struct orte_state_base_module_1_0_0_t
 */
orte_state_base_module_t orte_state_db_module = {
init,
finalize,
save,
set_recover_source,
recover
};
/* local variables */
/* save_db is opened by init() and written by save(); recover_db is opened
 * by set_recover_source() and read by recover()
 */
static DB *save_db=NULL, *recover_db=NULL;
static int init(void)
{
char *path, *name;
/* setup the database */
if (ORTE_SUCCESS != opal_os_dirpath_create(orte_state_db_directory, S_IRWXU)) {
orte_show_help("help-state-db.txt", "cannot-create-dir", true,
orte_state_db_directory);
return ORTE_ERR_FILE_OPEN_FAILURE;
}
orte_util_convert_process_name_to_string(&name, ORTE_PROC_MY_NAME);
path = opal_os_path(false, orte_state_db_directory, name, NULL);
free(name);
if (NULL == (save_db = dbopen(path, O_CREAT | O_RDWR | O_TRUNC, S_IRWXU, DB_HASH, NULL))) {
orte_show_help("help-state-db.txt", "cannot-create-db", true, path);
free(path);
return ORTE_ERR_FILE_OPEN_FAILURE;
}
free(path);
return ORTE_SUCCESS;
}
static int finalize(void)
{
/* if we are normally terminating, remove the recovery file */
return ORTE_SUCCESS;
}
/*
 * Save the state of a job or process object.
 *
 * object - an orte_job_t* when type is ORTE_JOB, an orte_proc_t* when
 *          type is ORTE_PROC
 * type   - ORTE_JOB or ORTE_PROC; anything else is rejected
 *
 * The object's state field is packed into a buffer and stored under the
 * key "JOB:<jobid>" or "PROC:<name>", then the database is synced to disk.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_BAD_PARAM (NULL object or unrecognized
 * type), ORTE_ERR_FILE_OPEN_FAILURE (database not open),
 * ORTE_ERR_OUT_OF_RESOURCE, a pack/unload error, or
 * ORTE_ERR_FILE_WRITE_FAILURE.
 */
static int save(void *object, opal_data_type_t type)
{
    DBT key, data;
    opal_buffer_t buf;
    orte_job_t *jdata;
    orte_proc_t *proc;
    char *name = NULL;
    int rc = ORTE_SUCCESS;
    int32_t size = 0;

    if (NULL == object) {
        return ORTE_ERR_BAD_PARAM;
    }
    /* init() must have opened the database */
    if (NULL == save_db) {
        return ORTE_ERR_FILE_OPEN_FAILURE;
    }

    /* construct the buffer we will use for packing the data */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    key.data = NULL;
    key.size = 0;
    data.data = NULL;
    data.size = 0;

    switch (type) {
    case ORTE_JOB:
        jdata = (orte_job_t*)object;
        rc = opal_dss.pack(&buf, &jdata->state, 1, ORTE_JOB_STATE_T);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
        if (0 > asprintf((char**)&key.data, "JOB:%s", ORTE_JOBID_PRINT(jdata->jobid))) {
            key.data = NULL;   /* contents are undefined after a failed asprintf */
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        break;
    case ORTE_PROC:
        proc = (orte_proc_t*)object;
        rc = opal_dss.pack(&buf, &proc->state, 1, ORTE_PROC_STATE_T);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
        rc = orte_util_convert_process_name_to_string(&name, &proc->name);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
        if (0 > asprintf((char**)&key.data, "PROC:%s", name)) {
            key.data = NULL;
            free(name);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        free(name);
        break;
    default:
        orte_show_help("help-state-db.txt", "unrecognized-type", true, type);
        rc = ORTE_ERR_BAD_PARAM;
        goto cleanup;
    }
    key.size = strlen(key.data);

    /* unload the data - we now own data.data and free it below */
    rc = opal_dss.unload(&buf, (void**)&data.data, &size);
    if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }
    data.size = size;

    /* put the info into the db */
    if (0 > save_db->put(save_db, &key, &data, 0)) {
        orte_show_help("help-state-db.txt", "error-writing-db", true, (char*)key.data, strerror(errno));
        rc = ORTE_ERR_FILE_WRITE_FAILURE;
        goto cleanup;
    }
    /* sync it to force it to disk */
    if (0 > save_db->sync(save_db, 0)) {
        orte_show_help("help-state-db.txt", "error-syncing-db", true, (char*)key.data, strerror(errno));
        rc = ORTE_ERR_FILE_WRITE_FAILURE;
    }

cleanup:
    free(key.data);
    free(data.data);
    /* destruct exactly once, on every path, so the buffer never leaks */
    OBJ_DESTRUCT(&buf);
    return rc;
}
/*
 * Point the module at the database of the process whose state is to be
 * recovered.
 *
 * name - the process whose database (in the state directory, named after
 *        that process) should be opened read-only
 *
 * Any previously opened recovery database is closed first so repeated
 * calls do not leak handles.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_BAD_PARAM (NULL name or no directory),
 * ORTE_ERR_OUT_OF_RESOURCE, the error from the name conversion, or
 * ORTE_ERR_FILE_OPEN_FAILURE.
 */
static int set_recover_source(orte_process_name_t *name)
{
    char *path, *pname = NULL;
    int rc;

    if (NULL == name || NULL == orte_state_db_directory) {
        return ORTE_ERR_BAD_PARAM;
    }

    /* drop any earlier source */
    if (NULL != recover_db) {
        recover_db->close(recover_db);
        recover_db = NULL;
    }

    /* setup the database */
    rc = orte_util_convert_process_name_to_string(&pname, name);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }
    path = opal_os_path(false, orte_state_db_directory, pname, NULL);
    free(pname);
    if (NULL == path) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    recover_db = dbopen(path, O_RDONLY, S_IRWXU, DB_HASH, NULL);
    if (NULL == recover_db) {
        orte_show_help("help-state-db.txt", "cannot-open-db", true, path);
        free(path);
        return ORTE_ERR_FILE_OPEN_FAILURE;
    }
    free(path);

    return ORTE_SUCCESS;
}
/*
 * Recover the saved state of a job or process object.
 *
 * object - an orte_job_t* (type ORTE_JOB) or orte_proc_t* (type
 *          ORTE_PROC); its jobid/name selects the record and its state
 *          field receives the recovered value
 * type   - ORTE_JOB or ORTE_PROC; anything else is rejected
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_NOT_FOUND (no recovery source set),
 * ORTE_ERR_BAD_PARAM, ORTE_ERR_OUT_OF_RESOURCE, ORTE_ERR_FILE_READ_FAILURE
 * (read error or no such record), or an unpack error. The object is only
 * modified on success.
 */
static int recover(void *object, opal_data_type_t type)
{
    DBT key, data;
    opal_buffer_t buf;
    orte_job_t *jdata = NULL;
    orte_proc_t *proc = NULL;
    char *name = NULL;
    void *payload;
    int rc = ORTE_SUCCESS;
    int ret;
    int32_t n;
    orte_job_state_t jstate;
    orte_proc_state_t pstate;

    /* without a source there is nothing to read - do not go on to
     * dereference the NULL handle
     */
    if (NULL == recover_db) {
        orte_show_help("help-state-db.txt", "recover-source-undef", true);
        return ORTE_ERR_NOT_FOUND;
    }
    if (NULL == object) {
        return ORTE_ERR_BAD_PARAM;
    }

    /* construct the buffer we will use for unpacking the data */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    key.data = NULL;
    key.size = 0;
    data.data = NULL;
    data.size = 0;

    switch (type) {
    case ORTE_JOB:
        jdata = (orte_job_t*)object;
        if (0 > asprintf((char**)&key.data, "JOB:%s", ORTE_JOBID_PRINT(jdata->jobid))) {
            key.data = NULL;   /* contents are undefined after a failed asprintf */
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        break;
    case ORTE_PROC:
        proc = (orte_proc_t*)object;
        rc = orte_util_convert_process_name_to_string(&name, &proc->name);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
        if (0 > asprintf((char**)&key.data, "PROC:%s", name)) {
            key.data = NULL;
            free(name);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        free(name);
        break;
    default:
        orte_show_help("help-state-db.txt", "unrecognized-type", true, type);
        rc = ORTE_ERR_BAD_PARAM;
        goto cleanup;
    }
    key.size = strlen(key.data);

    /* get the specified data: negative is an error, positive means the
     * key is not in the database - either way there is nothing to unpack
     */
    ret = recover_db->get(recover_db, &key, &data, 0);
    if (0 != ret || NULL == data.data || 0 == data.size) {
        orte_show_help("help-state-db.txt", "error-reading-db", true, (char*)key.data,
                       (0 > ret) ? strerror(errno) : "key not found");
        rc = ORTE_ERR_FILE_READ_FAILURE;
        goto cleanup;
    }

    /* The record returned by get() belongs to the db library, while a
     * loaded payload is freed by the buffer when it is destructed. Hand
     * the buffer a private copy, and never free data.data ourselves.
     */
    payload = malloc(data.size);
    if (NULL == payload) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }
    memcpy(payload, data.data, data.size);
    rc = opal_dss.load(&buf, payload, (int32_t)data.size);
    if (ORTE_SUCCESS != rc) {
        free(payload);
        goto cleanup;
    }

    /* populate the recovered info - unpack into real storage, and only
     * touch the caller's object if the unpack succeeded
     */
    n = 1;
    switch (type) {
    case ORTE_JOB:
        rc = opal_dss.unpack(&buf, &jstate, &n, ORTE_JOB_STATE_T);
        if (ORTE_SUCCESS == rc) {
            jdata->state = jstate;
        }
        break;
    case ORTE_PROC:
        rc = opal_dss.unpack(&buf, &pstate, &n, ORTE_PROC_STATE_T);
        if (ORTE_SUCCESS == rc) {
            proc->state = pstate;
        }
        break;
    default:
        break;
    }

cleanup:
    free(key.data);
    /* releases the payload copy, if one was loaded */
    OBJ_DESTRUCT(&buf);
    return rc;
}

31
orte/mca/state/db/state_db.h Обычный файл
Просмотреть файл

@ -0,0 +1,31 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef ORTE_STATE_DB_H
#define ORTE_STATE_DB_H
#include "orte/mca/state/state.h"
BEGIN_C_DECLS
/*
 * Component open / close / query entry points
 */
int orte_state_db_component_open(void);
int orte_state_db_component_close(void);
int orte_state_db_component_query(mca_base_module_t **module, int *priority);
/* the component descriptor and the module (function table) it offers */
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_db_component;
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_db_module;
/* directory holding the database files; filled in from the component's
 * "dir" parameter by the query function
 */
extern char *orte_state_db_directory;
END_C_DECLS
#endif /* ORTE_STATE_DB_H */

101
orte/mca/state/db/state_db_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,101 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "orte/mca/state/state.h"
#include "orte/mca/state/base/base.h"
#include "state_db.h"
extern orte_state_base_module_t orte_state_db_module;
/* Directory in which the database files live. state_db.h declares this
 * symbol extern and both this file and state_db.c use it, so it must be
 * defined here (the unused "orte_state_db_filename" that stood in its
 * place left the symbol unresolved at link/load time).
 */
char *orte_state_db_directory = NULL;
/*
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it
 */
orte_state_base_component_t mca_state_db_component = {
{
ORTE_STATE_BASE_VERSION_1_0_0,
/* Component name and version */
"db",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close and query functions */
orte_state_db_component_open,
orte_state_db_component_close,
orte_state_db_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/* Component open: nothing to set up here, so just report success */
int
orte_state_db_component_open(void)
{
return ORTE_SUCCESS;
}
/*
 * Component query: offer the db module only when it has been asked for.
 *
 * "Asked for" means either the component was explicitly required or a
 * directory was supplied through the component's "dir" parameter. In
 * that case the module is offered at priority 1000; otherwise no module
 * is offered and ORTE_ERROR is returned.
 */
int orte_state_db_component_query(mca_base_module_t **module, int *priority)
{
    bool requested = false;

    /* retrieve the name of the directory to be used */
    mca_base_param_reg_string(&mca_state_db_component.base_version, "dir",
                              "Name of directory to be used for storing and recovering state information",
                              false, false, NULL, &orte_state_db_directory);

    /* were we explicitly required? */
    mca_base_is_component_required(&orte_state_base_components_available,
                                   &mca_state_db_component.base_version,
                                   true,
                                   &requested);

    /* not requested in any way - decline */
    if (!requested && NULL == orte_state_db_directory) {
        *priority = 0;
        *module = NULL;
        return ORTE_ERROR;
    }

    *priority = 1000;
    *module = (mca_base_module_t*)&orte_state_db_module;
    return ORTE_SUCCESS;
}
/* Component close: nothing to release here, so just report success */
int
orte_state_db_component_close(void)
{
return ORTE_SUCCESS;
}

34
orte/mca/state/dbm/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,34 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Sources of the "dbm" state component
sources = \
state_dbm.h \
state_dbm_component.c \
state_dbm.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_state_dbm_DSO
component_noinst =
component_install = mca_state_dbm.la
else
component_noinst = libmca_state_dbm.la
component_install =
endif
# NOTE(review): -ldbm is passed via LDFLAGS rather than a LIBADD variable,
# and nothing in configure.m4 checks that a library of that name exists -
# confirm this links on the intended platforms.
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_state_dbm_la_SOURCES = $(sources)
mca_state_dbm_la_LDFLAGS = -module -avoid-version -ldbm
noinst_LTLIBRARIES = $(component_noinst)
libmca_state_dbm_la_SOURCES =$(sources)
libmca_state_dbm_la_LDFLAGS = -module -avoid-version -ldbm

16
orte/mca/state/dbm/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,16 @@
dnl -*- shell-script -*-
dnl
dnl Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl
# MCA_state_dbm_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Decides whether the dbm component is built. Only the presence of the
# header is tested here.
# NOTE(review): availability of the dbm_open function and of the library
# named in Makefile.am is not part of this decision - confirm that is intended.
AC_DEFUN([MCA_state_dbm_CONFIG], [
# only build if ndbm.h is found
AC_CHECK_HEADERS([ndbm.h], [$1], [$2], [AC_INCLUDES_DEFAULT])
])dnl

11
orte/mca/state/dbm/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,11 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
PARAM_CONFIG_FILES="Makefile"

244
orte/mca/state/dbm/state_dbm.c Обычный файл
Просмотреть файл

@ -0,0 +1,244 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <string.h>
#include <sys/types.h>
#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif
#include <stdio.h>
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <ndbm.h>
#include "opal/dss/dss_types.h"
#include "opal/util/os_dirpath.h"
#include "opal/util/os_path.h"
#include "opal/util/output.h"
#include "opal/util/malloc.h"
#include "opal/util/basename.h"
#include "opal/mca/pstat/base/base.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/mca/sysinfo/base/base.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/runtime/orte_globals.h"
#include "state_dbm.h"
/* module entry points, defined below */
static int init(void);
static int finalize(void);
static int save(void *object, opal_data_type_t type);
static int set_recover_source(orte_process_name_t *name);
static int recover(void *object, opal_data_type_t type);
/* Function table handed back by the component's query function; entry
 * order must match struct orte_state_base_module_1_0_0_t
 */
orte_state_base_module_t orte_state_dbm_module = {
init,
finalize,
save,
set_recover_source,
recover
};
/* local variables */
/* save_dbm is opened by init() and written by save(); recover_dbm is
 * opened by set_recover_source() and read by recover()
 */
static DBM *save_dbm=NULL, *recover_dbm=NULL;
static int init(void)
{
char *path, *name;
/* setup the database */
if (ORTE_SUCCESS != opal_os_dirpath_create(orte_state_dbm_directory, S_IRWXU)) {
orte_show_help("help-state-dbm.txt", "cannot-create-dir", true,
orte_state_dbm_directory);
return ORTE_ERR_FILE_OPEN_FAILURE;
}
orte_util_convert_process_name_to_string(&name, ORTE_PROC_MY_NAME);
path = opal_os_path(false, orte_state_dbm_directory, name, NULL);
free(name);
if (NULL == (save_dbm = dbm_open(path, O_CREAT | O_RDWR | O_TRUNC, S_IRWXU))) {
orte_show_help("help-state-dbm.txt", "cannot-create-dbm", true, path);
free(path);
return ORTE_ERR_FILE_OPEN_FAILURE;
}
free(path);
return ORTE_SUCCESS;
}
static int finalize(void)
{
/* if we are normally terminating, remove the recovery file */
return ORTE_SUCCESS;
}
/*
 * Save the state of a job or process object.
 *
 * object - an orte_job_t* when type is ORTE_JOB, an orte_proc_t* when
 *          type is ORTE_PROC
 * type   - ORTE_JOB or ORTE_PROC; anything else is rejected
 *
 * The object's state field is packed into a buffer and stored (replacing
 * any earlier record) under the key "JOB:<jobid>" or "PROC:<name>".
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_BAD_PARAM (NULL object or unrecognized
 * type), ORTE_ERR_FILE_OPEN_FAILURE (database not open),
 * ORTE_ERR_OUT_OF_RESOURCE, a pack/unload error, or
 * ORTE_ERR_FILE_WRITE_FAILURE.
 */
static int save(void *object, opal_data_type_t type)
{
    datum key, data;
    opal_buffer_t buf;
    orte_job_t *jdata;
    orte_proc_t *proc;
    char *name = NULL;
    int rc = ORTE_SUCCESS;
    int32_t size = 0;

    if (NULL == object) {
        return ORTE_ERR_BAD_PARAM;
    }
    /* init() must have opened the database */
    if (NULL == save_dbm) {
        return ORTE_ERR_FILE_OPEN_FAILURE;
    }

    /* construct the buffer we will use for packing the data */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    key.dptr = NULL;
    key.dsize = 0;
    data.dptr = NULL;
    data.dsize = 0;

    switch (type) {
    case ORTE_JOB:
        jdata = (orte_job_t*)object;
        rc = opal_dss.pack(&buf, &jdata->state, 1, ORTE_JOB_STATE_T);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
        if (0 > asprintf((char**)&key.dptr, "JOB:%s", ORTE_JOBID_PRINT(jdata->jobid))) {
            key.dptr = NULL;   /* contents are undefined after a failed asprintf */
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        break;
    case ORTE_PROC:
        proc = (orte_proc_t*)object;
        rc = opal_dss.pack(&buf, &proc->state, 1, ORTE_PROC_STATE_T);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
        rc = orte_util_convert_process_name_to_string(&name, &proc->name);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
        if (0 > asprintf((char**)&key.dptr, "PROC:%s", name)) {
            key.dptr = NULL;
            free(name);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        free(name);
        break;
    default:
        orte_show_help("help-state-dbm.txt", "unrecognized-type", true, type);
        rc = ORTE_ERR_BAD_PARAM;
        goto cleanup;
    }
    key.dsize = strlen(key.dptr);

    /* unload the data - we now own data.dptr and free it below */
    rc = opal_dss.unload(&buf, (void**)&data.dptr, &size);
    if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }
    data.dsize = size;

    /* put the info into the dbm */
    if (0 > dbm_store(save_dbm, key, data, DBM_REPLACE)) {
        orte_show_help("help-state-dbm.txt", "error-writing-dbm", true, (char*)key.dptr, strerror(errno));
        rc = ORTE_ERR_FILE_WRITE_FAILURE;
    }

cleanup:
    free(key.dptr);
    free(data.dptr);
    /* destruct exactly once, on every path, so the buffer never leaks */
    OBJ_DESTRUCT(&buf);
    return rc;
}
/*
 * Point the module at the database of the process whose state is to be
 * recovered.
 *
 * name - the process whose database (in the state directory, named after
 *        that process) should be opened read-only
 *
 * Any previously opened recovery database is closed first so repeated
 * calls do not leak handles.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_BAD_PARAM (NULL name or no directory),
 * ORTE_ERR_OUT_OF_RESOURCE, the error from the name conversion, or
 * ORTE_ERR_FILE_OPEN_FAILURE.
 */
static int set_recover_source(orte_process_name_t *name)
{
    char *path, *pname = NULL;
    int rc;

    if (NULL == name || NULL == orte_state_dbm_directory) {
        return ORTE_ERR_BAD_PARAM;
    }

    /* drop any earlier source */
    if (NULL != recover_dbm) {
        dbm_close(recover_dbm);
        recover_dbm = NULL;
    }

    /* setup the database */
    rc = orte_util_convert_process_name_to_string(&pname, name);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }
    path = opal_os_path(false, orte_state_dbm_directory, pname, NULL);
    free(pname);
    if (NULL == path) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    recover_dbm = dbm_open(path, O_RDONLY, S_IRWXU);
    if (NULL == recover_dbm) {
        orte_show_help("help-state-dbm.txt", "cannot-open-dbm", true, path);
        free(path);
        return ORTE_ERR_FILE_OPEN_FAILURE;
    }
    free(path);

    return ORTE_SUCCESS;
}
/*
 * Recover the saved state of a job or process object.
 *
 * object - an orte_job_t* (type ORTE_JOB) or orte_proc_t* (type
 *          ORTE_PROC); its jobid/name selects the record and its state
 *          field receives the recovered value
 * type   - ORTE_JOB or ORTE_PROC; anything else is rejected
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_NOT_FOUND (no recovery source set),
 * ORTE_ERR_BAD_PARAM, ORTE_ERR_OUT_OF_RESOURCE, ORTE_ERR_FILE_READ_FAILURE
 * (read error or no such record), or an unpack error. The object is only
 * modified on success.
 */
static int recover(void *object, opal_data_type_t type)
{
    datum key, data;
    opal_buffer_t buf;
    orte_job_t *jdata = NULL;
    orte_proc_t *proc = NULL;
    char *name = NULL;
    void *payload;
    int rc = ORTE_SUCCESS;
    int32_t n;
    orte_job_state_t jstate;
    orte_proc_state_t pstate;

    /* without a source there is nothing to read - do not go on to
     * use the NULL handle
     */
    if (NULL == recover_dbm) {
        orte_show_help("help-state-dbm.txt", "recover-source-undef", true);
        return ORTE_ERR_NOT_FOUND;
    }
    if (NULL == object) {
        return ORTE_ERR_BAD_PARAM;
    }

    /* construct the buffer we will use for unpacking the data */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    key.dptr = NULL;
    key.dsize = 0;
    data.dptr = NULL;
    data.dsize = 0;

    switch (type) {
    case ORTE_JOB:
        jdata = (orte_job_t*)object;
        if (0 > asprintf((char**)&key.dptr, "JOB:%s", ORTE_JOBID_PRINT(jdata->jobid))) {
            key.dptr = NULL;   /* contents are undefined after a failed asprintf */
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        break;
    case ORTE_PROC:
        proc = (orte_proc_t*)object;
        rc = orte_util_convert_process_name_to_string(&name, &proc->name);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
        if (0 > asprintf((char**)&key.dptr, "PROC:%s", name)) {
            key.dptr = NULL;
            free(name);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        free(name);
        break;
    default:
        orte_show_help("help-state-dbm.txt", "unrecognized-type", true, type);
        rc = ORTE_ERR_BAD_PARAM;
        goto cleanup;
    }
    key.dsize = strlen(key.dptr);

    /* get the specified data */
    data = dbm_fetch(recover_dbm, key);
    if (NULL == data.dptr || 0 == data.dsize) {
        orte_show_help("help-state-dbm.txt", "error-reading-dbm", true, (char*)key.dptr, strerror(errno));
        rc = ORTE_ERR_FILE_READ_FAILURE;
        goto cleanup;
    }

    /* The record returned by dbm_fetch() belongs to the dbm library,
     * while a loaded payload is freed by the buffer when it is
     * destructed. Hand the buffer a private copy, and never free
     * data.dptr ourselves.
     */
    payload = malloc((size_t)data.dsize);
    if (NULL == payload) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }
    memcpy(payload, data.dptr, (size_t)data.dsize);
    rc = opal_dss.load(&buf, payload, (int32_t)data.dsize);
    if (ORTE_SUCCESS != rc) {
        free(payload);
        goto cleanup;
    }

    /* populate the recovered info - unpack into real storage, and only
     * touch the caller's object if the unpack succeeded
     */
    n = 1;
    switch (type) {
    case ORTE_JOB:
        rc = opal_dss.unpack(&buf, &jstate, &n, ORTE_JOB_STATE_T);
        if (ORTE_SUCCESS == rc) {
            jdata->state = jstate;
        }
        break;
    case ORTE_PROC:
        rc = opal_dss.unpack(&buf, &pstate, &n, ORTE_PROC_STATE_T);
        if (ORTE_SUCCESS == rc) {
            proc->state = pstate;
        }
        break;
    default:
        break;
    }

cleanup:
    free(key.dptr);
    /* releases the payload copy, if one was loaded */
    OBJ_DESTRUCT(&buf);
    return rc;
}

31
orte/mca/state/dbm/state_dbm.h Обычный файл
Просмотреть файл

@ -0,0 +1,31 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef ORTE_STATE_DBM_H
#define ORTE_STATE_DBM_H
#include "orte/mca/state/state.h"
BEGIN_C_DECLS
/*
 * Component open / close / query entry points
 */
int orte_state_dbm_component_open(void);
int orte_state_dbm_component_close(void);
int orte_state_dbm_component_query(mca_base_module_t **module, int *priority);
/* the component descriptor and the module (function table) it offers */
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_dbm_component;
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_dbm_module;
/* directory holding the database files; filled in from the component's
 * "dir" parameter by the query function
 */
extern char *orte_state_dbm_directory;
END_C_DECLS
#endif /* ORTE_STATE_DBM_H */

101
orte/mca/state/dbm/state_dbm_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,101 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "orte/mca/state/state.h"
#include "orte/mca/state/base/base.h"
#include "state_dbm.h"
extern orte_state_base_module_t orte_state_dbm_module;
/* Directory in which the database files live. state_dbm.h declares this
 * symbol extern and both this file and state_dbm.c use it, so it must be
 * defined here (the unused "orte_state_dbm_filename" that stood in its
 * place left the symbol unresolved at link/load time).
 */
char *orte_state_dbm_directory = NULL;
/*
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it
 */
orte_state_base_component_t mca_state_dbm_component = {
{
ORTE_STATE_BASE_VERSION_1_0_0,
/* Component name and version */
"dbm",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close and query functions */
orte_state_dbm_component_open,
orte_state_dbm_component_close,
orte_state_dbm_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/* Component open: nothing to set up here, so just report success */
int
orte_state_dbm_component_open(void)
{
return ORTE_SUCCESS;
}
/*
 * Component query: offer the dbm module only when it has been asked for.
 *
 * "Asked for" means either the component was explicitly required or a
 * directory was supplied through the component's "dir" parameter. In
 * that case the module is offered at priority 1000; otherwise no module
 * is offered and ORTE_ERROR is returned.
 */
int orte_state_dbm_component_query(mca_base_module_t **module, int *priority)
{
    bool requested = false;

    /* retrieve the name of the directory to be used */
    mca_base_param_reg_string(&mca_state_dbm_component.base_version, "dir",
                              "Name of directory to be used for storing and recovering state information",
                              false, false, NULL, &orte_state_dbm_directory);

    /* were we explicitly required? */
    mca_base_is_component_required(&orte_state_base_components_available,
                                   &mca_state_dbm_component.base_version,
                                   true,
                                   &requested);

    /* not requested in any way - decline */
    if (!requested && NULL == orte_state_dbm_directory) {
        *priority = 0;
        *module = NULL;
        return ORTE_ERROR;
    }

    *priority = 1000;
    *module = (mca_base_module_t*)&orte_state_dbm_module;
    return ORTE_SUCCESS;
}
/* Component close: nothing to release here, so just report success */
int
orte_state_dbm_component_close(void)
{
return ORTE_SUCCESS;
}

91
orte/mca/state/state.h Обычный файл
Просмотреть файл

@ -0,0 +1,91 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* The OpenRTE State Save/Recovery Service
*
*/
#ifndef ORTE_STATE_H
#define ORTE_STATE_H
#include "orte_config.h"
#include "orte/types.h"
#include "opal/mca/mca.h"
#include "opal/dss/dss_types.h"
BEGIN_C_DECLS
/*
 * API functions
 *
 * Every function returns an int status code; the modules in this tree
 * return ORTE_SUCCESS or one of the ORTE_ERR_* constants.
 */
/*
 * Initialize the module
 */
typedef int (*orte_state_base_module_init_fn_t)(void);
/*
 * Finalize the module
 */
typedef int (*orte_state_base_module_finalize_fn_t)(void);
/*
 * Save the state of the provided object. "type" tells the module what
 * "object" points to; the db and dbm modules accept ORTE_JOB (an
 * orte_job_t*) and ORTE_PROC (an orte_proc_t*).
 */
typedef int (*orte_state_base_module_save_fn_t)(void *object, opal_data_type_t type);
/*
 * Set the source for recovering state info: the process whose saved
 * state subsequent recover calls should read
 */
typedef int (*orte_state_base_module_set_recover_source_fn_t)(orte_process_name_t *name);
/*
 * Recover the state of an object. "object" identifies the record to look
 * up and receives the recovered state; "type" is interpreted as for save.
 */
typedef int (*orte_state_base_module_recover_fn_t)(void *object, opal_data_type_t type);
/*
 * the standard module data structure; modules are defined with positional
 * initializers, so the field order here is part of the interface
 */
struct orte_state_base_module_1_0_0_t {
orte_state_base_module_init_fn_t init;
orte_state_base_module_finalize_fn_t finalize;
orte_state_base_module_save_fn_t save;
orte_state_base_module_set_recover_source_fn_t set_recover_source;
orte_state_base_module_recover_fn_t recover;
};
typedef struct orte_state_base_module_1_0_0_t orte_state_base_module_1_0_0_t;
typedef struct orte_state_base_module_1_0_0_t orte_state_base_module_t;
/*
 * the standard component data structure: the generic MCA component
 * header followed by the MCA component metadata
 */
struct orte_state_base_component_1_0_0_t {
mca_base_component_t base_version;
mca_base_component_data_t base_data;
};
typedef struct orte_state_base_component_1_0_0_t orte_state_base_component_1_0_0_t;
typedef struct orte_state_base_component_1_0_0_t orte_state_base_component_t;
/*
 * Macro for use in components that are of type state; expands to the
 * leading initializers of a component's base_version (MCA base version,
 * then framework name "state" and framework version 1.0.0)
 */
#define ORTE_STATE_BASE_VERSION_1_0_0 \
MCA_BASE_VERSION_2_0_0, \
"state", 1, 0, 0
/* Global structure for accessing STATE functions */
ORTE_DECLSPEC extern orte_state_base_module_t orte_state; /* holds selected module's function pointers */
END_C_DECLS
#endif