1
1

This commit fixes runs when there is no available CRS component (BLCR is unavailable, and SELF is deactivated). Previously the run would fail out of MPI_INIT since the OPAL CRS framework could not select a component. This is because the framework did not recognize the 'none' component as a full component because it was part of crs/base.

I promoted the ''none'' component to a full component, and updated the other components to reflect this code movement. The ''none'' component is the default component unless the user requests '''-am ft-enable-cr''' to auto-select a component. There is an MCA parameter to show a warning if the application requested an FT enabled job, but the ''none'' component was selected ({{{crs_none_select_warning}}}).

This temporarily fixes the problem mentioned in r18739. The full fix will entail working on ticket #1291.

Thanks to Ethan from Sun for finding this bug.

This commit was SVN r18840.

The following SVN revision numbers were found above:
  r18739 --> open-mpi/ompi@a003fa7a50
Этот коммит содержится в:
Josh Hursey 2008-07-08 20:04:39 +00:00
родитель 9f0280bd55
Коммит c4035d848f
14 изменённых файлов: 453 добавлений и 212 удалений

Просмотреть файл

@ -81,33 +81,6 @@ extern "C" {
OPAL_DECLSPEC extern opal_crs_base_module_t opal_crs;
OPAL_DECLSPEC extern char * opal_crs_base_snapshot_dir;
/**
* 'None' component functions
* These are to be used when no component is selected.
* They just return success, and empty strings as necessary.
*/
int opal_crs_base_none_open(void);
int opal_crs_base_none_close(void);
int opal_crs_base_none_query(mca_base_module_t **module, int *priority);
int opal_crs_base_none_module_init(void);
int opal_crs_base_none_module_finalize(void);
int opal_crs_base_none_checkpoint( pid_t pid, opal_crs_base_snapshot_t *sanpshot, opal_crs_state_type_t *state);
int opal_crs_base_none_restart( opal_crs_base_snapshot_t *snapshot, bool spawn_child, pid_t *child_pid);
int opal_crs_base_none_disable_checkpoint(void);
int opal_crs_base_none_enable_checkpoint(void);
OPAL_DECLSPEC int opal_crs_base_none_prelaunch(int32_t rank,
char *base_snapshot_dir,
char **app,
char **cwd,
char ***argv,
char ***env);
OPAL_DECLSPEC int opal_crs_base_none_reg_thread(void);
/**
* Some utility functions
*/

Просмотреть файл

@ -88,99 +88,7 @@ OBJ_CLASS_INSTANCE(opal_crs_base_snapshot_t,
opal_crs_base_construct,
opal_crs_base_destruct);
/**
 * Component open hook of the built-in 'none' CRS fallback.
 *
 * There is nothing to set up when no real component is in use.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_base_none_open(void)
{
    return OPAL_SUCCESS;
}
/**
 * Component close hook of the built-in 'none' CRS fallback.
 *
 * Nothing was acquired in open, so nothing is released here.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_base_none_close(void)
{
    return OPAL_SUCCESS;
}
/**
 * Component query hook of the built-in 'none' CRS fallback.
 *
 * Reports "no module" with the lowest priority through the output
 * parameters.
 *
 * @param module   [out] set to NULL
 * @param priority [out] set to 0
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_base_none_query(mca_base_module_t **module, int *priority)
{
    *priority = 0;
    *module   = NULL;

    return OPAL_SUCCESS;
}
/**
 * Module initialization of the built-in 'none' CRS fallback.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_base_none_module_init(void)
{
    return OPAL_SUCCESS;
}
/**
 * Module finalization of the built-in 'none' CRS fallback.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_base_none_module_finalize(void)
{
    return OPAL_SUCCESS;
}
/**
 * Checkpoint entry point of the built-in 'none' CRS fallback.
 *
 * No process image is taken. The snapshot is filled with placeholder
 * strings ("none" for the component and reference names, empty strings for
 * the locations), the resulting state is reported as OPAL_CRS_CONTINUE, and
 * the component name "none" is recorded in the snapshot metadata.
 *
 * @param pid       not used
 * @param snapshot  [in,out] snapshot whose fields are overwritten
 * @param state     [out] set to OPAL_CRS_CONTINUE
 *
 * @return OPAL_SUCCESS, or the status returned by
 *         opal_crs_base_metadata_write_token() when it fails.
 */
int opal_crs_base_none_checkpoint(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_state_type_t *state)
{
    int rc;

    *state = OPAL_CRS_CONTINUE;

    snapshot->component_name  = strdup("none");
    snapshot->reference_name  = strdup("none");
    snapshot->local_location  = strdup("");
    snapshot->remote_location = strdup("");
    snapshot->cold_start      = false;

    /* Record which component produced this snapshot. */
    rc = opal_crs_base_metadata_write_token(NULL, CRS_METADATA_COMP, "none");
    if (OPAL_SUCCESS == rc) {
        return OPAL_SUCCESS;
    }

    opal_output(0,
                "crs:none: checkpoint(): Error: Unable to write component name to the directory for (%s).",
                snapshot->reference_name);
    return rc;
}
/**
 * Restart entry point of the built-in 'none' CRS fallback.
 *
 * Nothing is restarted; the calling process reports itself as the
 * "restarted" process.
 *
 * @param snapshot     not used
 * @param spawn_child  not used
 * @param child_pid    [out] receives the PID of the calling process
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_base_none_restart(opal_crs_base_snapshot_t *snapshot, bool spawn_child, pid_t *child_pid)
{
    const pid_t self_pid = getpid();

    *child_pid = self_pid;

    return OPAL_SUCCESS;
}
/**
 * "Disable checkpoints" hook of the built-in 'none' CRS fallback.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_base_none_disable_checkpoint(void)
{
    return OPAL_SUCCESS;
}
/**
 * "Enable checkpoints" hook of the built-in 'none' CRS fallback.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_base_none_enable_checkpoint(void)
{
    return OPAL_SUCCESS;
}
/**
 * Prelaunch hook of the built-in 'none' CRS fallback.
 *
 * Sets the environment form of the "opal_cr_is_tool" MCA parameter to "0"
 * in the environment that is handed in through env, overwriting any
 * existing value.
 *
 * @param rank               not used
 * @param base_snapshot_dir  not used
 * @param app                not used
 * @param cwd                not used
 * @param argv               not used
 * @param env                [in,out] environment to update
 *
 * @return the status reported by opal_setenv() (expected to be
 *         OPAL_SUCCESS when the variable was set -- confirm against
 *         opal/util/opal_environ.h).
 */
int opal_crs_base_none_prelaunch(int32_t rank,
                                 char *base_snapshot_dir,
                                 char **app,
                                 char **cwd,
                                 char ***argv,
                                 char ***env)
{
    int ret;
    char *tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");

    /* Hand a failed environment update back to the caller instead of
     * silently launching with an incomplete environment. */
    ret = opal_setenv(tmp_env_var, "0", true, env);

    free(tmp_env_var);
    tmp_env_var = NULL;

    return ret;
}
/**
 * "Register thread" hook of the built-in 'none' CRS fallback.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_base_none_reg_thread(void)
{
    return OPAL_SUCCESS;
}
/*
* Utility functions

Просмотреть файл

@ -85,7 +85,7 @@ int opal_crs_base_open(void)
mca_base_param_reg_string_name("crs", NULL,
"Which CRS component to use (empty = auto-select)",
false, false,
NULL, &str_value);
"none", &str_value);
/* Open up all available components */
if (OPAL_SUCCESS != (ret = mca_base_components_open("crs",

Просмотреть файл

@ -31,72 +31,11 @@
#include "opal/mca/crs/base/base.h"
#include "opal/runtime/opal_cr.h"
/*
 * Instantiate the public struct with all of our public information
 * and pointer to our public functions in it.
 *
 * This is the descriptor of the built-in 'none' component; its open,
 * close and query hooks are the opal_crs_base_none_* functions.
 */
static opal_crs_base_component_t none_component = {
/* Handle the general mca_component_t struct containing
 * meta information about the component itself
 */
{
OPAL_CRS_BASE_VERSION_1_0_0,
/* Component name and version */
"none",
OMPI_MAJOR_VERSION,
OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION,
/* Component open and close functions */
opal_crs_base_none_open,
opal_crs_base_none_close,
opal_crs_base_none_query
},
/* Next the MCA v1.0.0 component meta data */
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level */
0,
/* opal_output handler (-1: presumably "none opened yet" -- confirm) */
-1,
/* Default priority */
0
};
/*
 * Function table of the built-in 'none' module. Every slot points at the
 * matching opal_crs_base_none_* function.
 */
static opal_crs_base_module_t none_module = {
/** Initialization Function */
opal_crs_base_none_module_init,
/** Finalization Function */
opal_crs_base_none_module_finalize,
/** Checkpoint interface */
opal_crs_base_none_checkpoint,
/** Restart Command Access */
opal_crs_base_none_restart,
/** Disable checkpoints */
opal_crs_base_none_disable_checkpoint,
/** Enable checkpoints */
opal_crs_base_none_enable_checkpoint,
/** Prelaunch */
opal_crs_base_none_prelaunch,
/** Register thread */
opal_crs_base_none_reg_thread
};
int opal_crs_base_select(void)
{
int ret, exit_status = OPAL_SUCCESS;
opal_crs_base_component_t *best_component = NULL;
opal_crs_base_module_t *best_module = NULL;
char *include_list = NULL;
int int_value = 0;
/*
@ -122,28 +61,6 @@ int opal_crs_base_select(void)
return OPAL_SUCCESS;
}
/*
* Register the framework MCA param and look up include list
*/
mca_base_param_reg_string_name("crs", NULL,
"Which CRS component to use (empty = auto-select)",
false, false,
NULL, &include_list);
if(NULL != include_list && 0 == strncmp(include_list, "none", strlen("none")) ){
opal_output_verbose(10, opal_crs_base_output,
"crs:select: Using %s component",
include_list);
best_component = &none_component;
best_module = &none_module;
/* JJH: Todo: Check if none is in the list */
/* Close all components since none will be used */
mca_base_components_close(0, /* Pass 0 to keep this from closing the output handle */
&opal_crs_base_components_available,
NULL);
goto skip_select;
}
/*
* Select the best component
*/
@ -156,7 +73,6 @@ int opal_crs_base_select(void)
goto cleanup;
}
skip_select:
/* Save the winner */
opal_crs_base_selected_component = *best_component;
opal_crs = *best_module;
@ -170,10 +86,5 @@ int opal_crs_base_select(void)
}
cleanup:
if( NULL != include_list ) {
free(include_list);
include_list = NULL;
}
return exit_status;
}

Просмотреть файл

@ -71,6 +71,13 @@ extern "C" {
int opal_crs_blcr_disable_checkpoint(void);
int opal_crs_blcr_enable_checkpoint(void);
int opal_crs_blcr_prelaunch(int32_t rank,
char *base_snapshot_dir,
char **app,
char **cwd,
char ***argv,
char ***env);
int opal_crs_blcr_reg_thread(void);
#if defined(c_plusplus) || defined(__cplusplus)

Просмотреть файл

@ -64,7 +64,7 @@ static opal_crs_base_module_t blcr_module = {
opal_crs_blcr_enable_checkpoint,
/** Prelaunch */
opal_crs_base_none_prelaunch,
opal_crs_blcr_prelaunch,
/** Register Thread */
opal_crs_blcr_reg_thread
@ -203,6 +203,25 @@ int opal_crs_blcr_module_init(void)
return OPAL_SUCCESS;
}
/**
 * Prelaunch hook of the BLCR CRS component.
 *
 * Sets the environment form of the "opal_cr_is_tool" MCA parameter to "0"
 * in the environment that is handed in through env, overwriting any
 * existing value.
 *
 * @param rank               not used
 * @param base_snapshot_dir  not used
 * @param app                not used
 * @param cwd                not used
 * @param argv               not used
 * @param env                [in,out] environment to update
 *
 * @return the status reported by opal_setenv() (expected to be
 *         OPAL_SUCCESS when the variable was set -- confirm against
 *         opal/util/opal_environ.h).
 */
int opal_crs_blcr_prelaunch(int32_t rank,
                            char *base_snapshot_dir,
                            char **app,
                            char **cwd,
                            char ***argv,
                            char ***env)
{
    int ret;
    char *tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");

    /* Hand a failed environment update back to the caller instead of
     * silently launching with an incomplete environment. */
    ret = opal_setenv(tmp_env_var, "0", true, env);

    free(tmp_env_var);
    tmp_env_var = NULL;

    return ret;
}
int opal_crs_blcr_reg_thread(void)
{
cr_client_id_t loc_client_id;

40
opal/mca/crs/none/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,40 @@
#
# Copyright (c) 2004-2008 The Trustees of Indiana University.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Extra preprocessor flags for this component.
AM_CPPFLAGS = \
$(LTDLINCL)
# Help text looked up by the component at run time
# (see opal_show_help("help-opal-crs-none.txt", ...)).
dist_pkgdata_DATA = help-opal-crs-none.txt
# Sources shared by both the DSO and the static variant below.
sources = \
crs_none.h \
crs_none_component.c \
crs_none_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_crs_none_DSO
component_noinst =
component_install = mca_crs_none.la
else
component_noinst = libmca_crs_none.la
component_install =
endif
# DSO variant: installed into $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_crs_none_la_SOURCES = $(sources)
mca_crs_none_la_LDFLAGS = -module -avoid-version
# Static variant: built but not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_crs_none_la_SOURCES = $(sources)
libmca_crs_none_la_LDFLAGS = -module -avoid-version

13
opal/mca/crs/none/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,13 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2008 The Trustees of Indiana University.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
PARAM_INIT_FILE=crs_none_component.c
PARAM_CONFIG_FILES="Makefile"

76
opal/mca/crs/none/crs_none.h Обычный файл
Просмотреть файл

@ -0,0 +1,76 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* NONE CRS component
*
* Simple, braindead implementation.
*/
#ifndef MCA_CRS_NONE_EXPORT_H
#define MCA_CRS_NONE_EXPORT_H
#include "opal_config.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/crs/crs.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
 * Local Component structures
 *
 * The 'none' component type wraps the base CRS component and adds no
 * fields of its own.
 */
struct opal_crs_none_component_t {
opal_crs_base_component_t super; /** Base CRS component */
};
typedef struct opal_crs_none_component_t opal_crs_none_component_t;
OPAL_MODULE_DECLSPEC extern opal_crs_none_component_t mca_crs_none_component;
int opal_crs_none_component_query(mca_base_module_t **module, int *priority);
/*
* Module functions
*/
int opal_crs_none_module_init(void);
int opal_crs_none_module_finalize(void);
/*
* Actual functionality
*/
int opal_crs_none_checkpoint( pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_state_type_t *state);
int opal_crs_none_restart( opal_crs_base_snapshot_t *snapshot, bool spawn_child, pid_t *child_pid);
int opal_crs_none_disable_checkpoint(void);
int opal_crs_none_enable_checkpoint(void);
int opal_crs_none_prelaunch(int32_t rank,
char *base_snapshot_dir,
char **app,
char **cwd,
char ***argv,
char ***env);
int opal_crs_none_reg_thread(void);
extern bool opal_crs_none_select_warning;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* MCA_CRS_NONE_EXPORT_H */

126
opal/mca/crs/none/crs_none_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,126 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "crs_none.h"
/*
* Public string for version number
*/
const char *opal_crs_none_component_version_string =
"OPAL CRS none MCA component version " OMPI_VERSION;
/*
* Local functionality
*/
static int crs_none_open(void);
static int crs_none_close(void);
/*
 * Instantiate the public struct with all of our public information
 * and pointer to our public functions in it
 */
opal_crs_none_component_t mca_crs_none_component = {
/* First do the base component stuff */
{
/* Handle the general mca_component_t struct containing
 * meta information about the component itself
 */
{
OPAL_CRS_BASE_VERSION_1_0_0,
/* Component name and version */
"none",
OMPI_MAJOR_VERSION,
OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION,
/* Component open and close functions */
crs_none_open,
crs_none_close,
opal_crs_none_component_query
},
/* Next the MCA v1.0.0 component meta data */
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level */
0,
/* opal_output handler (-1: presumably "none opened yet" -- confirm) */
-1,
/* Default priority; handed out by opal_crs_none_component_query() */
0
}
};
/*
 * None module
 *
 * Function table returned by opal_crs_none_component_query(). Every slot
 * points at the matching opal_crs_none_* function.
 */
static opal_crs_base_module_t loc_module = {
/** Initialization Function */
opal_crs_none_module_init,
/** Finalization Function */
opal_crs_none_module_finalize,
/** Checkpoint interface */
opal_crs_none_checkpoint,
/** Restart Command Access */
opal_crs_none_restart,
/** Disable checkpoints */
opal_crs_none_disable_checkpoint,
/** Enable checkpoints */
opal_crs_none_enable_checkpoint,
/** Prelaunch */
opal_crs_none_prelaunch,
/** Register Thread */
opal_crs_none_reg_thread
};
/* Cached value of the "select_warning" MCA parameter; set in crs_none_open(). */
bool opal_crs_none_select_warning = false;
/**
 * Component open hook: register the component's MCA parameters.
 *
 * Registers the integer parameter "select_warning" (default 0, i.e.
 * disabled) and caches its value as a boolean in
 * opal_crs_none_select_warning.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
static int crs_none_open(void)
{
    int value = 0;

    /* The help text is built from two adjacent string literals, which the
     * compiler joins verbatim; the second one therefore starts with a
     * space so the sentences do not run together. */
    mca_base_param_reg_int(&mca_crs_none_component.super.base_version,
                           "select_warning",
                           "Enable warning when the 'none' component is selected when checkpoint/restart functionality is requested."
                           " [Default = disabled/no-warning]",
                           false, false,
                           0, /* Disabled */
                           &value);
    opal_crs_none_select_warning = OPAL_INT_TO_BOOL(value);

    return OPAL_SUCCESS;
}
/**
 * Component close hook.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
static int crs_none_close(void)
{
    return OPAL_SUCCESS;
}
/**
 * Component query hook: offer the 'none' module for selection.
 *
 * @param module   [out] receives the address of this component's module
 *                 function table
 * @param priority [out] receives the priority stored in the component
 *                 descriptor
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_none_component_query(mca_base_module_t **module, int *priority)
{
    *priority = mca_crs_none_component.super.priority;
    *module   = (mca_base_module_t *)&loc_module;

    return OPAL_SUCCESS;
}

116
opal/mca/crs/none/crs_none_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,116 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include <sys/types.h>
#if HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/constants.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "opal/runtime/opal_cr.h"
#include "crs_none.h"
/**
 * Module initialization of the 'none' CRS component.
 *
 * Shows the "none:select-warning" help message when the select warning is
 * enabled, the process is not a tool, and checkpoint/restart support was
 * requested. Otherwise does nothing.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_none_module_init(void)
{
    /* Stay quiet unless the warning is switched on and this is a
     * non-tool process that asked for C/R support. */
    if (!opal_crs_none_select_warning ||
        opal_cr_is_tool || !opal_cr_is_enabled) {
        return OPAL_SUCCESS;
    }

    opal_show_help("help-opal-crs-none.txt", "none:select-warning",
                   true);

    return OPAL_SUCCESS;
}
/**
 * Module finalization of the 'none' CRS component.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_none_module_finalize(void)
{
    return OPAL_SUCCESS;
}
/**
 * Checkpoint entry point of the 'none' CRS component.
 *
 * No process image is taken. The snapshot is filled with placeholder
 * strings ("none" for the component and reference names, empty strings for
 * the locations), the resulting state is reported as OPAL_CRS_CONTINUE, and
 * the component name "none" is recorded in the snapshot metadata.
 *
 * @param pid       not used
 * @param snapshot  [in,out] snapshot whose fields are overwritten
 * @param state     [out] set to OPAL_CRS_CONTINUE
 *
 * @return OPAL_SUCCESS, or the status returned by
 *         opal_crs_base_metadata_write_token() when it fails.
 */
int opal_crs_none_checkpoint(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_state_type_t *state)
{
    int rc;

    *state = OPAL_CRS_CONTINUE;

    snapshot->component_name  = strdup("none");
    snapshot->reference_name  = strdup("none");
    snapshot->local_location  = strdup("");
    snapshot->remote_location = strdup("");
    snapshot->cold_start      = false;

    /* Record which component produced this snapshot. */
    rc = opal_crs_base_metadata_write_token(NULL, CRS_METADATA_COMP, "none");
    if (OPAL_SUCCESS == rc) {
        return OPAL_SUCCESS;
    }

    opal_output(0,
                "crs:none: checkpoint(): Error: Unable to write component name to the directory for (%s).",
                snapshot->reference_name);
    return rc;
}
/**
 * Restart entry point of the 'none' CRS component.
 *
 * Nothing is restarted; the calling process reports itself as the
 * "restarted" process.
 *
 * @param base_snapshot  not used
 * @param spawn_child    not used
 * @param child_pid      [out] receives the PID of the calling process
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_none_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
{
    const pid_t self_pid = getpid();

    *child_pid = self_pid;

    return OPAL_SUCCESS;
}
/**
 * "Disable checkpoints" hook of the 'none' CRS component.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_none_disable_checkpoint(void)
{
    return OPAL_SUCCESS;
}
/**
 * "Enable checkpoints" hook of the 'none' CRS component.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_none_enable_checkpoint(void)
{
    return OPAL_SUCCESS;
}
/**
 * Prelaunch hook of the 'none' CRS component.
 *
 * Sets the environment form of the "opal_cr_is_tool" MCA parameter to "0"
 * in the environment that is handed in through env, overwriting any
 * existing value.
 *
 * @param rank               not used
 * @param base_snapshot_dir  not used
 * @param app                not used
 * @param cwd                not used
 * @param argv               not used
 * @param env                [in,out] environment to update
 *
 * @return the status reported by opal_setenv() (expected to be
 *         OPAL_SUCCESS when the variable was set -- confirm against
 *         opal/util/opal_environ.h).
 */
int opal_crs_none_prelaunch(int32_t rank,
                            char *base_snapshot_dir,
                            char **app,
                            char **cwd,
                            char ***argv,
                            char ***env)
{
    int ret;
    char *tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");

    /* Hand a failed environment update back to the caller instead of
     * silently launching with an incomplete environment. */
    ret = opal_setenv(tmp_env_var, "0", true, env);

    free(tmp_env_var);
    tmp_env_var = NULL;

    return ret;
}
/**
 * "Register thread" hook of the 'none' CRS component.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_none_reg_thread(void)
{
    return OPAL_SUCCESS;
}

20
opal/mca/crs/none/help-opal-crs-none.txt Обычный файл
Просмотреть файл

@ -0,0 +1,20 @@
-*- text -*-
#
# Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open PAL CRS framework.
#
[none:select-warning]
Warning: 'none' component selected.
Checkpoint/Restart functionality may not work properly.
Make sure that you have configured with and are using a fully functional
CRS component.
To disable this warning set the following MCA parameter:
--mca crs_none_select_warning 0

Просмотреть файл

@ -81,6 +81,15 @@ extern "C" {
int opal_crs_self_disable_checkpoint(void);
int opal_crs_self_enable_checkpoint(void);
int opal_crs_self_prelaunch(int32_t rank,
char *base_snapshot_dir,
char **app,
char **cwd,
char ***argv,
char ***env);
int opal_crs_self_reg_thread(void);
#if defined(c_plusplus) || defined(__cplusplus)
}

Просмотреть файл

@ -63,10 +63,10 @@ static opal_crs_base_module_t loc_module = {
opal_crs_self_enable_checkpoint,
/** Prelaunch */
opal_crs_base_none_prelaunch,
opal_crs_self_prelaunch,
/** Register Thread */
opal_crs_base_none_reg_thread
opal_crs_self_reg_thread
};
/*
@ -479,6 +479,29 @@ int opal_crs_self_enable_checkpoint(void)
return OPAL_SUCCESS;
}
/**
 * Prelaunch hook of the SELF CRS component.
 *
 * Sets the environment form of the "opal_cr_is_tool" MCA parameter to "0"
 * in the environment that is handed in through env, overwriting any
 * existing value.
 *
 * @param rank               not used
 * @param base_snapshot_dir  not used
 * @param app                not used
 * @param cwd                not used
 * @param argv               not used
 * @param env                [in,out] environment to update
 *
 * @return the status reported by opal_setenv() (expected to be
 *         OPAL_SUCCESS when the variable was set -- confirm against
 *         opal/util/opal_environ.h).
 */
int opal_crs_self_prelaunch(int32_t rank,
                            char *base_snapshot_dir,
                            char **app,
                            char **cwd,
                            char ***argv,
                            char ***env)
{
    int ret;
    char *tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");

    /* Hand a failed environment update back to the caller instead of
     * silently launching with an incomplete environment. */
    ret = opal_setenv(tmp_env_var, "0", true, env);

    free(tmp_env_var);
    tmp_env_var = NULL;

    return ret;
}
/**
 * "Register thread" hook of the SELF CRS component.
 *
 * @return OPAL_SUCCESS, unconditionally.
 */
int opal_crs_self_reg_thread(void)
{
    return OPAL_SUCCESS;
}
/******************
* Local functions
******************/