Fix the 'self' CRS component.
Due to the visibility patch to libltdl in r21731, this module can no longer access or use the libltdl interfaces directly. Instead just use the dlopen/dlsym/dlclose functions directly. There is a portability implication here, but for the moment it does not seem to bite us. Also in this patch, clean up some of the 'self' specific code paths. * opal-restart need not special case the 'self' component since it can now interact with it as if it were a normal component. * Clean up the initialization of the cmd line arguments in opal-restart. * Make sure to mark opal-restart as a 'tool', but do so by setting the global variable directly instead of setting the environment variable, which could be inherited by the application. * Most of the functions in the 'self' component should not be used by a command line tool (exception being 'restart'), so make sure that if we accidentally call them then errors are returned. * Increase the priority of the 'none' component to be above that of 'self' when being selected in a command line tool. This allows for both mpirun and opal-restart to work correctly with the 'self' module. This commit was SVN r21766. The following SVN revision numbers were found above: r21731 --> open-mpi/ompi@0278b86456
Этот коммит содержится в:
родитель
cf8bd2142a
Коммит
91e52d062b
@ -61,7 +61,7 @@ opal_crs_none_component_t mca_crs_none_component = {
|
||||
/* opal_output handler */
|
||||
-1,
|
||||
/* Default priority */
|
||||
0
|
||||
1
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -27,8 +27,9 @@
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/libltdl/ltdl.h"
|
||||
#ifdef HAVE_DLFCN_H
|
||||
#include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/output.h"
|
||||
@ -97,8 +98,7 @@ OBJ_CLASS_INSTANCE(opal_crs_self_snapshot_t,
|
||||
/************************************
|
||||
* Locally Global vars & functions :)
|
||||
************************************/
|
||||
static lt_ptr
|
||||
crs_self_find_function(lt_dlhandle handle, char *prefix, char *suffix);
|
||||
static void * crs_self_find_function(void * handle, char *prefix, char *suffix);
|
||||
|
||||
static int self_update_snapshot_metadata(opal_crs_self_snapshot_t *snapshot);
|
||||
|
||||
@ -128,6 +128,20 @@ int opal_crs_self_component_query(mca_base_module_t **module, int *priority)
|
||||
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
||||
"crs:self: component_query()");
|
||||
|
||||
/*
|
||||
* If this is a tool, then return a module with the lowest priority.
|
||||
* This allows 'mpirun' to select the 'none' component since it has
|
||||
* a priority higher than 0.
|
||||
* But also allows 'opal-restart' to select this component if needed
|
||||
* since it only ever requests that a specific component be opened
|
||||
* that is defined in the snapshot metadata file.
|
||||
*/
|
||||
if( opal_cr_is_tool ) {
|
||||
*priority = 0;
|
||||
*module = (mca_base_module_t *)&loc_module;
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Extract the user level callbacks if they exist
|
||||
*/
|
||||
@ -149,16 +163,12 @@ int opal_crs_self_component_query(mca_base_module_t **module, int *priority)
|
||||
static int opal_crs_self_extract_callbacks(void)
|
||||
{
|
||||
bool callback_matched = true;
|
||||
lt_dlhandle executable;
|
||||
void * executable = NULL;
|
||||
|
||||
if( opal_cr_is_tool ) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Open the executable so that we can lookup the necessary symbols
|
||||
*/
|
||||
executable = lt_dlopen(NULL);
|
||||
executable = dlopen(NULL, RTLD_LOCAL|RTLD_LAZY);
|
||||
if ( NULL == executable) {
|
||||
opal_show_help("help-opal-crs-self.txt", "self:lt_dlopen",
|
||||
true);
|
||||
@ -186,7 +196,7 @@ static int opal_crs_self_extract_callbacks(void)
|
||||
/*
|
||||
* Done with executable, close it
|
||||
*/
|
||||
lt_dlclose(executable);
|
||||
dlclose(executable);
|
||||
|
||||
/*
|
||||
* Sanity check
|
||||
@ -269,6 +279,13 @@ int opal_crs_self_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
||||
int ret, exit_status = OPAL_SUCCESS;
|
||||
char * restart_cmd = NULL;
|
||||
|
||||
/*
|
||||
* This function should never be called by a tool
|
||||
*/
|
||||
if( opal_cr_is_tool ) {
|
||||
return OPAL_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
/*
|
||||
* Setup for snapshot directory creation
|
||||
*/
|
||||
@ -393,6 +410,10 @@ int opal_crs_self_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_ch
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* JJH: Check to make sure the application exists?
|
||||
*/
|
||||
|
||||
/*
|
||||
* Get the restart command
|
||||
*/
|
||||
@ -464,6 +485,13 @@ int opal_crs_self_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_ch
|
||||
|
||||
int opal_crs_self_disable_checkpoint(void)
|
||||
{
|
||||
/*
|
||||
* This function should never be called by a tool
|
||||
*/
|
||||
if( opal_cr_is_tool ) {
|
||||
return OPAL_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
||||
"crs:self: disable_checkpoint()");
|
||||
|
||||
@ -474,6 +502,13 @@ int opal_crs_self_disable_checkpoint(void)
|
||||
|
||||
int opal_crs_self_enable_checkpoint(void)
|
||||
{
|
||||
/*
|
||||
* This function should never be called by a tool
|
||||
*/
|
||||
if( opal_cr_is_tool ) {
|
||||
return OPAL_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
||||
"crs:self: enable_checkpoint()");
|
||||
|
||||
@ -491,6 +526,13 @@ int opal_crs_self_prelaunch(int32_t rank,
|
||||
{
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
/*
|
||||
* This function should never be called by a tool
|
||||
*/
|
||||
if( opal_cr_is_tool ) {
|
||||
return OPAL_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
||||
opal_setenv(tmp_env_var,
|
||||
"0", true, env);
|
||||
@ -502,16 +544,22 @@ int opal_crs_self_prelaunch(int32_t rank,
|
||||
|
||||
int opal_crs_self_reg_thread(void)
|
||||
{
|
||||
/*
|
||||
* This function should never be called by a tool
|
||||
*/
|
||||
if( opal_cr_is_tool ) {
|
||||
return OPAL_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/******************
|
||||
* Local functions
|
||||
******************/
|
||||
static lt_ptr
|
||||
crs_self_find_function(lt_dlhandle handle, char *prefix, char *suffix){
|
||||
static void * crs_self_find_function(void * handle, char *prefix, char *suffix){
|
||||
char *func_to_find = NULL;
|
||||
lt_ptr ptr;
|
||||
void * ptr = NULL;
|
||||
|
||||
if( NULL == prefix || 0 >= strlen(prefix) ) {
|
||||
opal_output(mca_crs_self_component.super.output_handle,
|
||||
@ -530,7 +578,7 @@ crs_self_find_function(lt_dlhandle handle, char *prefix, char *suffix){
|
||||
|
||||
asprintf(&func_to_find, "%s_%s", prefix, suffix);
|
||||
|
||||
ptr = lt_dlsym(handle, func_to_find);
|
||||
ptr = dlsym(handle, func_to_find);
|
||||
if( NULL == ptr) {
|
||||
opal_output_verbose(12, mca_crs_self_component.super.output_handle,
|
||||
"crs:self: crs_self_find_function: WARNING: Function \"%s\" not found",
|
||||
|
@ -20,7 +20,7 @@
|
||||
#
|
||||
[self:lt_dlopen]
|
||||
Error: We were unable to open the executable with libtool.
|
||||
We needed to look for the inclusion of user defined callbacks.
|
||||
We needed to look for the inclusion of user defined callbacks.
|
||||
|
||||
Make sure that the linker has exported all of the symbols.
|
||||
This is usally enabled with a flag such as '-export'.
|
||||
|
@ -89,7 +89,6 @@ typedef struct {
|
||||
char *filename;
|
||||
bool verbose;
|
||||
bool forked;
|
||||
bool self_case;
|
||||
char *snapshot_loc;
|
||||
int output;
|
||||
} opal_restart_globals_t;
|
||||
@ -124,19 +123,6 @@ opal_cmd_line_init_t cmd_line_opts[] = {
|
||||
"detected, however if a custom location was specified to opal-checkpoint "
|
||||
"then this argument is meant to match it."},
|
||||
|
||||
/*
|
||||
* We do this instead of using the '-mca crs self' convention as to not
|
||||
* influence the user into thinking that they need to do this for all of the
|
||||
* checkpointers. And to reinforce that the 'self' module is an exception, and
|
||||
* all other modules are automaticly detected.
|
||||
*/
|
||||
{ NULL, NULL, NULL,
|
||||
's', NULL, "self",
|
||||
0,
|
||||
&opal_restart_globals.self_case, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Is this a restart using the 'self' module. This is a special case as all "
|
||||
"other modules are automaticly detected" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, NULL, NULL,
|
||||
'\0', NULL, NULL,
|
||||
@ -252,9 +238,6 @@ main(int argc, char *argv[])
|
||||
"\t Exec in self");
|
||||
}
|
||||
|
||||
/* JJH: Do not unsetenv(opal_cr_is_tool) here, as it will impact the
|
||||
* JJH: application improperly. */
|
||||
|
||||
snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
|
||||
snapshot->cold_start = true;
|
||||
snapshot->reference_name = strdup(opal_restart_globals.filename);
|
||||
@ -363,6 +346,10 @@ static int initialize(int argc, char *argv[])
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Mark this process as a tool
|
||||
*/
|
||||
opal_cr_is_tool = true;
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
@ -384,9 +371,13 @@ static int parse_args(int argc, char *argv[])
|
||||
int i, ret, len;
|
||||
opal_cmd_line_t cmd_line;
|
||||
char **app_env = NULL, **global_env = NULL;
|
||||
opal_restart_globals_t tmp = { false, NULL, false, false, false, NULL, 0 };
|
||||
|
||||
opal_restart_globals = tmp;
|
||||
opal_restart_globals.help = false;
|
||||
opal_restart_globals.filename = NULL;
|
||||
opal_restart_globals.verbose = false;
|
||||
opal_restart_globals.forked = false;
|
||||
opal_restart_globals.snapshot_loc = NULL;
|
||||
opal_restart_globals.output = 0;
|
||||
|
||||
/* Parse the command line options */
|
||||
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
||||
@ -410,9 +401,6 @@ static int parse_args(int argc, char *argv[])
|
||||
putenv(global_env[i]);
|
||||
}
|
||||
|
||||
/* JJH: Do not setenv(opal_cr_is_tool, 1) here, as it will impact the
|
||||
* JJH: application improperly. */
|
||||
|
||||
/**
|
||||
* Now start parsing our specific arguments
|
||||
*/
|
||||
@ -453,21 +441,6 @@ static int parse_args(int argc, char *argv[])
|
||||
if(argc > 1) {
|
||||
opal_restart_globals.filename = strdup(opal_argv_join(argv, ' '));
|
||||
}
|
||||
|
||||
/*
|
||||
* Due to the special nature of the 'self' module, we need to know if that is the
|
||||
* requested module.
|
||||
* If it is then we don't need to do the 'snapshot reference' handling,
|
||||
* If it is NOT then we need to extract the requested CRS module from the
|
||||
* metadata file, and use that.
|
||||
* Should note that the 'self' module is the only one that needs to be specified
|
||||
* to opal_restart, all others are detected dynamicly.
|
||||
*/
|
||||
if( opal_restart_globals.self_case ) {
|
||||
/* They are not required to explicitly use the '-mca crs self' convention,
|
||||
* so set the environment var for them */
|
||||
expected_crs_comp = strdup("self");
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
@ -486,24 +459,10 @@ static int check_file(char *given_filename)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* If this is the self case then we need to check that the application
|
||||
* exists
|
||||
/*
|
||||
* Check for the existance of the snapshot handle in the snapshot directory
|
||||
*/
|
||||
if(opal_restart_globals.self_case) {
|
||||
if( NULL == (argv = opal_argv_split(given_filename, ' ')) ) {
|
||||
exit_status = OPAL_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Extract just the application name */
|
||||
path_to_check = strdup(argv[0]);
|
||||
}
|
||||
/* Otherwise we are checking for the existance of the snapshot handle
|
||||
* in the snapshot directory
|
||||
*/
|
||||
else {
|
||||
path_to_check = opal_crs_base_get_snapshot_directory(given_filename);
|
||||
}
|
||||
path_to_check = opal_crs_base_get_snapshot_directory(given_filename);
|
||||
|
||||
/* Do the check */
|
||||
opal_output_verbose(10, opal_restart_globals.output,
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user