1
1
openmpi/opal/mca/crs/self/crs_self_module.c
Nathan Hjelm 4d92c9989e more c99 updates
This commit does two things. It removes checks for C99 required
headers (stdlib.h, string.h, signal.h, etc). Additionally it removes
definitions for required C99 types (intptr_t, int64_t, int32_t, etc).

Signed-off-by: Nathan Hjelm <hjelmn@me.com>
2015-06-25 10:14:13 -06:00

756 строки
23 KiB
C

/*
* Copyright (c) 2004-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <string.h>
#ifdef HAVE_DLFCN_H
#include <dlfcn.h>
#endif
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/constants.h"
#include "opal/mca/base/mca_base_var.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "opal/runtime/opal_cr.h"
#include "crs_self.h"
/*
* Self module
*/
/*
 * Function table for the "self" CRS module. A pointer to this object is what
 * opal_crs_self_component_query() hands back through its 'module' argument.
 *
 * The initializer is positional: the entries below must stay in the order in
 * which opal_crs_base_module_t declares its members (that declaration lives
 * in a header that is not part of this file).
 */
static opal_crs_base_module_t loc_module = {
/** Initialization Function */
opal_crs_self_module_init,
/** Finalization Function */
opal_crs_self_module_finalize,
/** Checkpoint interface */
opal_crs_self_checkpoint,
/** Restart Command Access */
opal_crs_self_restart,
/** Disable checkpoints */
opal_crs_self_disable_checkpoint,
/** Enable checkpoints */
opal_crs_self_enable_checkpoint,
/** Prelaunch */
opal_crs_self_prelaunch,
/** Register Thread */
opal_crs_self_reg_thread
};
/*
* Snapshot structure
*/
/*
 * Snapshot object private to the "self" component: the generic CRS snapshot
 * plus the command line that is used to start the application again.
 */
OBJ_CLASS_DECLARATION(opal_crs_self_snapshot_t);
struct opal_crs_self_snapshot_t {
/** Base CRS snapshot type (filled by value from the caller's snapshot in
 *  the checkpoint and restart entry points) */
opal_crs_base_snapshot_t super;
/** Command Line used to restart the app. Heap string owned by the
 *  snapshot; NULL until set, released by opal_crs_self_destruct(). */
char * cmd_line;
};
typedef struct opal_crs_self_snapshot_t opal_crs_self_snapshot_t;
/* Constructor / destructor registered with the class instance below */
static void opal_crs_self_construct(opal_crs_self_snapshot_t *obj);
static void opal_crs_self_destruct( opal_crs_self_snapshot_t *obj);
OBJ_CLASS_INSTANCE(opal_crs_self_snapshot_t,
opal_crs_base_snapshot_t,
opal_crs_self_construct,
opal_crs_self_destruct);
/*
 * Generic function-pointer type that receives the result of a dlsym()
 * lookup before it is cast to the specific user-callback type.
 */
typedef void (*opal_crs_self_dlsym_dummy_fn_t)(void);
/************************************
* Locally Global vars & functions :)
************************************/
/* Look up the symbol "<prefix>_<suffix>" with dlsym() and store it in *fn_ptr */
static int crs_self_find_function(char *prefix, char *suffix,
opal_crs_self_dlsym_dummy_fn_t *fn_ptr);
/* Append the snapshot's restart command line to its metadata file */
static int self_update_snapshot_metadata(opal_crs_self_snapshot_t *snapshot);
/* Export the restart-related MCA parameters to the environment and return
 * a copy of the snapshot's restart command line in *cmd */
static int opal_crs_self_restart_cmd(opal_crs_self_snapshot_t *snapshot, char **cmd);
/* Rebuild component_name and cmd_line of a snapshot from its metadata file */
static int self_cold_start(opal_crs_self_snapshot_t *snapshot);
/*
 * Constructor for opal_crs_self_snapshot_t.
 *
 * A freshly created snapshot has no restart command line yet, so the field
 * is set to NULL; the destructor and the restart path rely on that.
 *
 * The definition repeats 'static' and the parameter name of its prototype so
 * that declaration and definition read the same.
 */
static void opal_crs_self_construct(opal_crs_self_snapshot_t *obj)
{
    obj->cmd_line = NULL;
}
/*
 * Destructor for opal_crs_self_snapshot_t.
 *
 * Releases the restart command line owned by the snapshot. The pointer is
 * reset afterwards so that a stale reference cannot be freed twice.
 */
static void opal_crs_self_destruct(opal_crs_self_snapshot_t *obj)
{
    /* free(NULL) is a no-op, so no guard is required */
    free(obj->cmd_line);
    obj->cmd_line = NULL;
}
/* Resolve the user-level checkpoint/continue/restart callbacks and store
 * them in mca_crs_self_component (defined further down in this file) */
static int opal_crs_self_extract_callbacks(void);
/*
* MCA Functions
*/
/*
 * Component query: decide whether the "self" module can be used and with
 * which priority.
 *
 * module   - out: &loc_module when selectable, NULL otherwise
 * priority - out: 0 for tools, the configured component priority for
 *            applications that can checkpoint, -1 when not selectable
 *
 * Returns OPAL_SUCCESS when a module is offered, OPAL_ERROR otherwise.
 */
int opal_crs_self_component_query(mca_base_module_t **module, int *priority)
{
    int rc;

    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: component_query()");

    /*
     * Tools get the module at the lowest priority. That lets 'mpirun' pick
     * the 'none' component (its priority is higher than 0) while
     * 'opal-restart' can still select this component, because it only ever
     * asks for the specific component named in the snapshot metadata file.
     */
    if (opal_cr_is_tool) {
        *priority = 0;
        *module = (mca_base_module_t *) &loc_module;
        return OPAL_SUCCESS;
    }

    /* Applications: look for the user level callbacks first */
    rc = opal_crs_self_extract_callbacks();
    if (OPAL_SUCCESS == rc && mca_crs_self_component.can_checkpoint) {
        *priority = mca_crs_self_component.super.priority;
        *module = (mca_base_module_t *) &loc_module;
        return OPAL_SUCCESS;
    }

    /* No usable checkpoint callback: do not offer the module */
    *priority = -1;
    *module = NULL;
    return OPAL_ERROR;
}
static int opal_crs_self_extract_callbacks(void)
{
opal_crs_self_dlsym_dummy_fn_t loc_fn;
/*
* Find the function names
*/
crs_self_find_function(mca_crs_self_component.prefix,
SUFFIX_CHECKPOINT,
&loc_fn);
mca_crs_self_component.ucb_checkpoint_fn = (opal_crs_self_checkpoint_callback_fn_t)loc_fn;
crs_self_find_function(mca_crs_self_component.prefix,
SUFFIX_CONTINUE,
&loc_fn);
mca_crs_self_component.ucb_continue_fn = (opal_crs_self_continue_callback_fn_t)loc_fn;
crs_self_find_function(mca_crs_self_component.prefix,
SUFFIX_RESTART,
&loc_fn);
mca_crs_self_component.ucb_restart_fn = (opal_crs_self_restart_callback_fn_t)loc_fn;
/*
* Sanity check
*/
mca_crs_self_component.can_checkpoint = true;
if(NULL == mca_crs_self_component.ucb_checkpoint_fn) {
mca_crs_self_component.can_checkpoint = false;
}
if(NULL == mca_crs_self_component.ucb_continue_fn) {
}
if(NULL == mca_crs_self_component.ucb_restart_fn) {
}
return OPAL_SUCCESS;
}
/*
 * Module initialization.
 *
 * For tools this is a no-op. For applications it re-checks which user
 * callbacks were resolved, optionally prints a help message when any of
 * them is missing, and - if a restart was requested via do_restart - calls
 * the user's restart callback.
 *
 * Always returns OPAL_SUCCESS.
 */
int opal_crs_self_module_init(void)
{
    bool all_found;

    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: module_init()");

    if (opal_cr_is_tool) {
        return OPAL_SUCCESS;
    }

    /* Sanity check: without a checkpoint callback we cannot checkpoint */
    if (NULL == mca_crs_self_component.ucb_checkpoint_fn) {
        mca_crs_self_component.can_checkpoint = false;
    }

    all_found = (NULL != mca_crs_self_component.ucb_checkpoint_fn) &&
                (NULL != mca_crs_self_component.ucb_continue_fn) &&
                (NULL != mca_crs_self_component.ucb_restart_fn);

    /* Only tell the user about missing callbacks when asked to be verbose */
    if (!all_found && 1 <= mca_crs_self_component.super.verbose) {
        opal_show_help("help-opal-crs-self.txt", "self:no_callback", false,
                       "checkpoint", mca_crs_self_component.prefix, SUFFIX_CHECKPOINT,
                       "continue ", mca_crs_self_component.prefix, SUFFIX_CONTINUE,
                       "restart ", mca_crs_self_component.prefix, SUFFIX_RESTART,
                       PREFIX_DEFAULT);
    }

    /* If the user requested that we do_restart, then call their callback */
    if (mca_crs_self_component.do_restart) {
        opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                            "crs:self: module_init: Call their restart function");
        if (NULL != mca_crs_self_component.ucb_restart_fn) {
            mca_crs_self_component.ucb_restart_fn();
        }
    }

    return OPAL_SUCCESS;
}
/*
 * Module finalization. Nothing needs to be torn down; the call is only
 * traced at verbosity level 10.
 *
 * Always returns OPAL_SUCCESS.
 */
int opal_crs_self_module_finalize(void)
{
    opal_output_verbose(10,
                        mca_crs_self_component.super.output_handle,
                        "crs:self: module_finalize()");

    return OPAL_SUCCESS;
}
/*
 * Checkpoint this process through the user supplied callbacks.
 *
 * pid           - process id, only used for tracing
 * base_snapshot - snapshot description supplied by the caller; it is copied
 *                 by value into a local opal_crs_self_snapshot_t
 * options       - checkpoint options; 'stop' is reported as unsupported but
 *                 does not abort the checkpoint
 * state         - out: OPAL_CRS_CONTINUE on success, OPAL_CRS_ERROR on every
 *                 failure after the tool check
 *
 * Steps: write the component name to the metadata file, call the user's
 * checkpoint callback to obtain the restart command, append that command to
 * the metadata file, then call the user's continue callback.
 *
 * Returns OPAL_SUCCESS, OPAL_ERR_NOT_SUPPORTED when called by a tool,
 * OPAL_ERROR or the error returned while updating the metadata.
 *
 * NOTE(review): the local snapshot is a shallow copy of *base_snapshot and
 * is never released here (the original code did not release it either).
 * Releasing it would run the base destructor on pointers that are shared
 * with the caller's snapshot, so the ownership model must be settled first.
 */
int opal_crs_self_checkpoint(pid_t pid,
                             opal_crs_base_snapshot_t *base_snapshot,
                             opal_crs_base_ckpt_options_t *options,
                             opal_crs_state_type_t *state)
{
    opal_crs_self_snapshot_t *snapshot = NULL;
    int ret, exit_status = OPAL_SUCCESS;
    char *restart_cmd = NULL;

    /*
     * This function should never be called by a tool. Check before the
     * snapshot object is created so that nothing is leaked on this path.
     */
    if (opal_cr_is_tool) {
        return OPAL_ERR_NOT_SUPPORTED;
    }

    snapshot = OBJ_NEW(opal_crs_self_snapshot_t);
    if (NULL == snapshot) {
        *state = OPAL_CRS_ERROR;
        return OPAL_ERROR;
    }

    if (options->stop) {
        opal_output(0,
                    "crs:self: checkpoint(): Error: SIGSTOP Not currently supported!");
    }

    /*
     * Setup for snapshot directory creation. This is a shallow copy: the
     * directory and file name strings stay shared with *base_snapshot.
     */
    snapshot->super = *base_snapshot;

    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: checkpoint(%d, ---)", (int) pid);

    if (!mca_crs_self_component.can_checkpoint) {
        opal_show_help("help-opal-crs-self.txt", "self:ckpt_disabled", false);
        *state = OPAL_CRS_ERROR;
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Update the snapshot metadata
     */
    snapshot->super.component_name =
        strdup(mca_crs_self_component.super.base_version.mca_component_name);
    if (NULL == snapshot->super.component_name) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: checkpoint(): Error: Unable to copy the component name");
        *state = OPAL_CRS_ERROR;
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    if (NULL == snapshot->super.metadata) {
        snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a");
        if (NULL == snapshot->super.metadata) {
            opal_output(mca_crs_self_component.super.output_handle,
                        "crs:self: checkpoint(): Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            *state = OPAL_CRS_ERROR;
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }
    fprintf(snapshot->super.metadata, "%s%s\n",
            CRS_METADATA_COMP, snapshot->super.component_name);

    /*
     * Call the user callback function. It hands back the restart command
     * through restart_cmd; that string is freed in the cleanup section.
     */
    if (NULL != mca_crs_self_component.ucb_checkpoint_fn) {
        mca_crs_self_component.ucb_checkpoint_fn(&restart_cmd);
    }

    /*
     * Save the restart command
     */
    if (NULL == restart_cmd) {
        *state = OPAL_CRS_ERROR;
        opal_show_help("help-opal-crs-self.txt", "self:no-restart-cmd",
                       true);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    snapshot->cmd_line = strdup(restart_cmd);
    if (NULL == snapshot->cmd_line) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: checkpoint(): Error: Unable to copy the restart command");
        *state = OPAL_CRS_ERROR;
        exit_status = OPAL_ERROR;
        goto cleanup;
    }
    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: checkpoint: Restart Command (%s)", snapshot->cmd_line);

    /*
     * The best we can do is update the metadata file with the
     * application argv and argc we started with.
     */
    if (OPAL_SUCCESS != (ret = self_update_snapshot_metadata(snapshot))) {
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: checkpoint(): Error: Unable to update metadata for snapshot (%s).",
                    snapshot->super.metadata_filename);
        exit_status = ret;
        goto cleanup;
    }

    *state = OPAL_CRS_CONTINUE;

    /*
     * Call their continue routine for completeness
     */
    if (NULL != mca_crs_self_component.ucb_continue_fn) {
        mca_crs_self_component.ucb_continue_fn();
    }

    /*
     * NOTE(review): the original ended with
     *     base_snapshot = &(snapshot->super);
     * which only reassigns the by-value parameter and is invisible to the
     * caller. It was dropped as a dead store; if the intent was to publish
     * the updated snapshot, the interface has to change.
     */

 cleanup:
    free(restart_cmd);
    restart_cmd = NULL;

    return exit_status;
}
/*
* Notice that the user restart callback is not called here, but always from
* opal_init for the self module.
*/
int opal_crs_self_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
{
opal_crs_self_snapshot_t *snapshot = OBJ_NEW(opal_crs_self_snapshot_t);
char **cr_argv = NULL;
char * cr_cmd = NULL;
int ret;
int exit_status = OPAL_SUCCESS;
int status;
snapshot->super = *base_snapshot;
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
"crs:self: restart(%d)", spawn_child);
/*
* If we need to reconstruct the snapshot
*/
if(snapshot->super.cold_start) {
if( OPAL_SUCCESS != (ret = self_cold_start(snapshot)) ){
exit_status = ret;
opal_output(mca_crs_self_component.super.output_handle,
"crs:blcr: blcr_restart: Unable to reconstruct the snapshot.");
goto cleanup;
}
}
/*
* JJH: Check to make sure the application exists?
*/
/*
* Get the restart command
*/
if ( OPAL_SUCCESS != (ret = opal_crs_self_restart_cmd(snapshot, &cr_cmd)) ) {
exit_status = ret;
goto cleanup;
}
if ( NULL == (cr_argv = opal_argv_split(cr_cmd, ' ')) ) {
exit_status = OPAL_ERROR;
goto cleanup;
}
if (!spawn_child) {
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
"crs:self: self_restart: SELF: exec :(%s, %s):",
strdup(cr_argv[0]),
opal_argv_join(cr_argv, ' '));
status = execvp(strdup(cr_argv[0]), cr_argv);
if(status < 0) {
opal_output(mca_crs_self_component.super.output_handle,
"crs:self: self_restart: SELF: Child failed to execute :(%d):", status);
}
opal_output(mca_crs_self_component.super.output_handle,
"crs:self: self_restart: SELF: execvp returned %d", status);
exit_status = status;
goto cleanup;
}
else {
*child_pid = fork();
if( *child_pid == 0) {
/* Child Process */
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
"crs:self: self_restart: CHILD: exec :(%s, %s):",
strdup(cr_argv[0]),
opal_argv_join(cr_argv, ' '));
status = execvp(strdup(cr_argv[0]), cr_argv);
if(status < 0) {
opal_output(mca_crs_self_component.super.output_handle,
"crs:self: self_restart: CHILD: Child failed to execute :(%d):", status);
}
opal_output(mca_crs_self_component.super.output_handle,
"crs:self: self_restart: CHILD: execvp returned %d", status);
exit_status = status;
goto cleanup;
}
else if(*child_pid > 0) {
/* Parent is done once it is started. */
;
}
else {
opal_output(mca_crs_self_component.super.output_handle,
"crs:self: self_restart: CHILD: fork failed :(%d):", *child_pid);
}
}
cleanup:
if( NULL != cr_cmd)
free(cr_cmd);
if( NULL != cr_argv)
opal_argv_free(cr_argv);
return exit_status;
}
/*
 * Switch checkpointing off for this process by clearing can_checkpoint.
 *
 * Returns OPAL_ERR_NOT_SUPPORTED when called by a tool (nothing is changed
 * in that case), OPAL_SUCCESS otherwise.
 */
int opal_crs_self_disable_checkpoint(void)
{
    if (!opal_cr_is_tool) {
        opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                            "crs:self: disable_checkpoint()");
        mca_crs_self_component.can_checkpoint = false;
        return OPAL_SUCCESS;
    }

    /* This function should never be called by a tool */
    return OPAL_ERR_NOT_SUPPORTED;
}
/*
 * Switch checkpointing on for this process by setting can_checkpoint.
 *
 * Returns OPAL_ERR_NOT_SUPPORTED when called by a tool (nothing is changed
 * in that case), OPAL_SUCCESS otherwise.
 */
int opal_crs_self_enable_checkpoint(void)
{
    if (!opal_cr_is_tool) {
        opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                            "crs:self: enable_checkpoint()");
        mca_crs_self_component.can_checkpoint = true;
        return OPAL_SUCCESS;
    }

    /* This function should never be called by a tool */
    return OPAL_ERR_NOT_SUPPORTED;
}
/*
 * Prelaunch hook: force the "opal_cr_is_tool" MCA parameter to "0" in the
 * environment that is passed in through 'env'.
 *
 * rank, base_snapshot_dir, app, cwd and argv are not used by this component.
 *
 * Returns OPAL_ERR_NOT_SUPPORTED when called by a tool, OPAL_SUCCESS
 * otherwise.
 */
int opal_crs_self_prelaunch(int32_t rank,
                            char *base_snapshot_dir,
                            char **app,
                            char **cwd,
                            char ***argv,
                            char ***env)
{
    char *env_name = NULL;

    /* This function should never be called by a tool */
    if (opal_cr_is_tool) {
        return OPAL_ERR_NOT_SUPPORTED;
    }

    /* Translate the parameter name into its environment variable name,
     * set it (overwriting any previous value) and drop the name again */
    (void) mca_base_var_env_name("opal_cr_is_tool", &env_name);
    opal_setenv(env_name, "0", true, env);
    free(env_name);
    env_name = NULL;

    return OPAL_SUCCESS;
}
/*
 * Thread registration hook. This component has nothing to register.
 *
 * Returns OPAL_ERR_NOT_SUPPORTED when called by a tool (this function
 * should never be called by one), OPAL_SUCCESS otherwise.
 */
int opal_crs_self_reg_thread(void)
{
    return opal_cr_is_tool ? OPAL_ERR_NOT_SUPPORTED : OPAL_SUCCESS;
}
/******************
* Local functions
******************/
/*
 * Look up the user callback named "<prefix>_<suffix>".
 *
 * prefix - first part of the symbol name, must be a non-empty string
 * suffix - second part of the symbol name, must be a non-empty string
 * fn_ptr - out: address of the symbol, or NULL when it was not found or an
 *          error occurred; it is written on every path
 *
 * Returns OPAL_ERROR for a bad prefix/suffix or when the symbol name cannot
 * be built. Returns OPAL_SUCCESS otherwise, including when the symbol does
 * not exist (callers tell that apart through *fn_ptr).
 */
static int crs_self_find_function(char *prefix, char *suffix,
                                  opal_crs_self_dlsym_dummy_fn_t *fn_ptr)
{
    char *func_to_find = NULL;

    if (NULL == prefix || '\0' == prefix[0]) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: crs_self_find_function: Error: prefix is NULL or empty string!");
        *fn_ptr = NULL;
        return OPAL_ERROR;
    }
    if (NULL == suffix || '\0' == suffix[0]) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: crs_self_find_function: Error: suffix is NULL or empty string!");
        *fn_ptr = NULL;
        return OPAL_ERROR;
    }

    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: crs_self_find_function(--, %s, %s)",
                        prefix, suffix);

    /* On failure the content of func_to_find is not usable, so bail out */
    if (0 > asprintf(&func_to_find, "%s_%s", prefix, suffix)) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: crs_self_find_function: Error: unable to build the function name!");
        *fn_ptr = NULL;
        return OPAL_ERROR;
    }

    /* The RTLD_DEFAULT is a special handle that searches the default libraries
     * including the current application for the indicated symbol. This allows
     * us to not have to dlopen/dlclose the executable. A bit of short hand
     * really.
     *
     * The store goes through a void** view of fn_ptr because dlsym() returns
     * an object pointer while the destination is a function pointer.
     */
    *((void **) fn_ptr) = dlsym(RTLD_DEFAULT, func_to_find);

    /* Test the looked-up symbol, not the address of the output slot */
    if (NULL == *fn_ptr) {
        opal_output_verbose(12, mca_crs_self_component.super.output_handle,
                            "crs:self: crs_self_find_function: WARNING: Function \"%s\" not found",
                            func_to_find);
    }
    else {
        opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                            "crs:self: crs_self_find_function: Found function \"%s\"",
                            func_to_find);
    }

    free(func_to_find);

    return OPAL_SUCCESS;
}
/*
* Self is a special case. The 'fname' here is the command line that the user
* wishes to execute. This function takes this command line and adds
* -mca crs_self_do_restart 1
* Which will trigger the restart callback once the program has been run.
*
* For example, The user starts their program with:
* $ my_prog arg1 arg2
*
* They checkpoint it:
* $ opal_checkpoint -mca crs self 1234
*
* They restart it:
* $ opal_restart -mca crs self my_prog arg1 arg2
*
* fname is then:
* fname = "my_prog arg1 arg2"
*
* This funciton translates that to the command:
* cmd = "my_prog arg1 arg2 -mca crs self -mca crs_self_do_restart 1"
*
* Which will cause the program "my_prog" to call their restart function
* upon opal_init time.
*
* Note: The user could bypass the opal_restart routine safely by simply calling
* $ my_prog arg1 arg2 -mca crs self -mca crs_self_do_restart 1
* However, for consistency sake, we should not encourage this as it won't work for
* all of the other checkpointers.
*/
static int opal_crs_self_restart_cmd(opal_crs_self_snapshot_t *snapshot, char **cmd)
{
char * tmp_env_var = NULL;
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
"crs:self: restart_cmd(%s, ---)", snapshot->cmd_line);
(void) mca_base_var_env_name("crs", &tmp_env_var);
opal_setenv(tmp_env_var,
"self",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
(void) mca_base_var_env_name("crs_self_do_restart", &tmp_env_var);
opal_setenv(tmp_env_var,
"1",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
(void) mca_base_var_env_name("crs_self_prefix", &tmp_env_var);
opal_setenv(tmp_env_var,
mca_crs_self_component.prefix,
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/* Instead of adding it to the command line, we should use the environment
* to pass the values. This allow sthe OPAL application to be braindead
* WRT MCA parameters
* add_args = strdup("-mca crs self -mca crs_self_do_restart 1");
*/
asprintf(cmd, "%s", snapshot->cmd_line);
return OPAL_SUCCESS;
}
/*
 * Reconstruct a snapshot from its metadata file ("cold start").
 *
 * Reads the component name and the restart command line from the metadata
 * file, verifies that the snapshot was written by this component, stores
 * both values in the snapshot and clears its cold_start flag.
 *
 * snapshot - snapshot to fill in; metadata_filename must be set. If the
 *            metadata stream is not open yet it is opened here for reading.
 *
 * Returns OPAL_SUCCESS, or an error when the file cannot be opened, the
 * metadata cannot be extracted, the snapshot belongs to another component
 * or memory runs out.
 *
 * NOTE(review): component_name is filled in by
 * opal_crs_base_extract_expected_component(); whether the caller has to
 * free it cannot be seen from this file, so it is left alone as before.
 */
static int self_cold_start(opal_crs_self_snapshot_t *snapshot)
{
    int ret, exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char *component_name = NULL;
    int prev_pid;

    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: cold_start()");

    /*
     * Find the snapshot directory, read the metadata file.
     * The stream is only read from in this function, so it is opened in
     * read mode; an append-only stream cannot be read.
     */
    if (NULL == snapshot->super.metadata) {
        snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "r");
        if (NULL == snapshot->super.metadata) {
            opal_output(mca_crs_self_component.super.output_handle,
                        "crs:self: self_cold_start: Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

    ret = opal_crs_base_extract_expected_component(snapshot->super.metadata,
                                                   &component_name, &prev_pid);
    if (OPAL_SUCCESS != ret) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.metadata_filename, ret);
        exit_status = ret;
        goto cleanup;
    }
    if (NULL == component_name) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: No component name found in the local snapshot (%s).",
                    snapshot->super.metadata_filename);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    snapshot->super.component_name = strdup(component_name);
    if (NULL == snapshot->super.component_name) {
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Compare the strings to make sure this is our snapshot before going
     * further. A full comparison is used: comparing only the first
     * strlen(component_name) characters would also accept an empty name or
     * any prefix of our own name.
     */
    if (0 != strcmp(mca_crs_self_component.super.base_version.mca_component_name,
                    component_name)) {
        exit_status = OPAL_ERROR;
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
                    component_name, mca_crs_self_component.super.base_version.mca_component_name);
        goto cleanup;
    }

    /*
     * Restart command
     * JJH: Command lines limited to 256 chars.
     */
    opal_crs_base_metadata_read_token(snapshot->super.metadata, CRS_METADATA_CONTEXT, &tmp_argv);
    if (NULL == tmp_argv || NULL == tmp_argv[0]) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, snapshot->super.snapshot_directory);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* Replace any previous command line with a copy of the first value */
    free(snapshot->cmd_line);
    snapshot->cmd_line = strdup(tmp_argv[0]);
    if (NULL == snapshot->cmd_line) {
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Reset the cold_start flag
     */
    snapshot->super.cold_start = false;

 cleanup:
    if (NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }

    return exit_status;
}
/*
 * Append the restart command line of a snapshot to its metadata file.
 *
 * snapshot - snapshot whose cmd_line is written to the metadata stream as a
 *            CRS_METADATA_CONTEXT record
 *
 * Returns OPAL_ERROR (after showing the "self:no-restart-cmd" help text)
 * when the snapshot has no command line, OPAL_SUCCESS otherwise.
 */
static int self_update_snapshot_metadata(opal_crs_self_snapshot_t *snapshot)
{
    /* Nothing to record without a restart command */
    if (NULL == snapshot->cmd_line) {
        opal_show_help("help-opal-crs-self.txt", "self:no-restart-cmd",
                       true);
        return OPAL_ERROR;
    }

    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: update_snapshot_metadata(%s)",
                        snapshot->super.metadata_filename);

    /*
     * Append to the metadata file the command line to restart with
     *  - How user wants us to restart
     */
    fprintf(snapshot->super.metadata, "%s%s\n",
            CRS_METADATA_CONTEXT, snapshot->cmd_line);

    return OPAL_SUCCESS;
}