1
1

Fix a SIGPIPE that may occur when checkpointing a restarted process. The SIGPIPE was a result of calling system() in the BLCR CRS. After inspection and testing, it was determined that the operation was no longer necessary, so the call was removed, which fixes the bug.

This commit was SVN r19601.
Этот коммит содержится в:
Josh Hursey 2008-09-22 16:49:56 +00:00
родитель 8eccda391a
Коммит 0cd65bfaa8
5 изменённых файлов: 115 добавлений и 53 удалений

Просмотреть файл

@ -93,7 +93,6 @@ OBJ_CLASS_INSTANCE(opal_crs_blcr_snapshot_t,
/******************
* Local Functions
******************/
static int blcr_chmod(char *dest);
static int blcr_checkpoint_peer(pid_t pid, char * local_dir, char ** fname);
static int blcr_get_checkpoint_filename(char **fname, pid_t pid);
static int opal_crs_blcr_thread_callback(void *arg);
@ -273,7 +272,6 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
{
int ret, exit_status = OPAL_SUCCESS;
opal_crs_blcr_snapshot_t *snapshot = OBJ_NEW(opal_crs_blcr_snapshot_t);
char * tmp_str = NULL;
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
"crs:blcr: checkpoint(%d, ---)", pid);
@ -355,23 +353,6 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
}
if(*state == OPAL_CRS_CONTINUE) {
/*
* Make sure the snapshot is in the proper directory
* Update the snapshot structure
*/
asprintf(&tmp_str, "%s/%s", snapshot->super.local_location, snapshot->context_filename);
if (0 != (ret = blcr_chmod(tmp_str))) {
*state = OPAL_CRS_ERROR;
opal_output(mca_crs_blcr_component.super.output_handle,
"crs:blcr: checkpoint(): Error: Unable to chmod the checkpoint file (%s in the directory (%s) :[%d].",
snapshot->context_filename, tmp_str, ret);
perror("crs:blcr: checkpoint");
free(tmp_str);
exit_status = ret;
goto cleanup;
}
/*
* Update the metadata file
*/
@ -391,11 +372,6 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
base_snapshot = &(snapshot->super);
cleanup:
if(NULL != tmp_str) {
free(tmp_str);
tmp_str = NULL;
}
return exit_status;
}
@ -860,28 +836,3 @@ static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot) {
return exit_status;
}
/*
* Change permissions on the file so we can read it
*/
static int blcr_chmod(char *dest) {
char * command = NULL;
int ret = OPAL_SUCCESS;
/* JJH: Assume 'chmod' is in the path */
asprintf(&command, "chmod u+rwX %s", dest);
if (0 != (ret = system(command) )) {
opal_output(mca_crs_blcr_component.super.output_handle,
"crs:blcr: move(): Error: Unable to execute the command <%s> :[%d].",
command, ret);
perror("crs:blcr chmod");
goto error;
}
error:
if( NULL != command ) {
free(command);
}
return ret;
}

Просмотреть файл

@ -270,7 +270,7 @@ int opal_cr_init(void )
"opal_cr: init: Debug SIGPIPE: %d (%s)",
val, (opal_cr_debug_sigpipe ? "True" : "False"));
if( opal_cr_debug_sigpipe ) {
if( opal_cr_debug_sigpipe && !opal_cr_thread_use_if_avail ) {
if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
;
}
@ -727,6 +727,12 @@ static void* opal_cr_thread_fn(opal_object_t *obj)
return NULL;
}
if( opal_cr_debug_sigpipe ) {
if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
;
}
}
/*
* Register this thread with the OPAL CRS
*/

Просмотреть файл

@ -102,6 +102,8 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
int orte_snapc_full_setup_job(orte_jobid_t jobid);
int orte_snapc_full_release_job(orte_jobid_t jobid);
int orte_snapc_full_ft_event(int state);
/*
* Global Coordinator Functionality
*/
@ -132,6 +134,7 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
*/
int app_coord_init(void);
int app_coord_finalize(void);
int app_coord_ft_event(int state);
END_C_DECLS

Просмотреть файл

@ -34,7 +34,7 @@
#include <signal.h>
#endif
#include "opal/runtime/opal_cr.h"
#include "orte/runtime/orte_cr.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/mca/mca.h"
@ -198,13 +198,15 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
* - Init the checkpoint metadata file
*/
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
"App) notify_response: Start checkpoint..."));
"App) notify_response: Init checkpoint directory..."));
if( OPAL_SUCCESS != (ret = opal_crs_base_init_snapshot_directory(local_snapshot) ) ) {
opal_output(0, "App) Error: Unable to initalize the snapshot directory!\n");
exit_status = ret;
goto ckpt_cleanup;
}
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
"App) notify_response: Start checkpoint..."));
STAGE_1:
opal_cr_currently_stalled = false;
@ -236,6 +238,9 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
goto ckpt_cleanup;
}
else if(cr_state == OPAL_CRS_CONTINUE) {
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
"App) notify_response: Continuing...(%d)\n",
getpid()));
; /* Don't need to do anything here */
}
else if(cr_state == OPAL_CRS_TERM ) {
@ -264,6 +269,8 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
close(app_comm_pipe_r_fd);
remove(app_comm_pipe_r);
remove(app_comm_pipe_w);
app_comm_pipe_r_fd = -1;
app_comm_pipe_w_fd = -1;
if(app_term) {
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
@ -527,3 +534,81 @@ static int snapc_full_app_ckpt_handshake_end(int cr_state)
cleanup:
return exit_status;
}
/*
 * Fault tolerance event hook for the application coordinator.
 *
 * Only OPAL_CRS_RESTART does any work: it closes any open pipe descriptors,
 * removes and frees the old named pipe paths, rebuilds both paths from the
 * current PID, and registers snapc_full_app_signal_handler for
 * opal_cr_entry_point_signal. Every other state is a no-op.
 *
 * state: the OPAL_CRS_* state being entered.
 *
 * Returns ORTE_SUCCESS, or OPAL_ERROR if a pipe path could not be allocated
 * or the signal handler could not be registered.
 */
int app_coord_ft_event(int state) {
    int exit_status = ORTE_SUCCESS;
    char *tmp_pid = NULL;

    /******** Checkpoint Prep ********/
    if(OPAL_CRS_CHECKPOINT == state) {
        ; /* Nothing */
    }
    /******** Continue Recovery ********/
    else if (OPAL_CRS_CONTINUE == state ) {
        ; /* Nothing */
    }
    /******** Restart Pre-Recovery ********/
    else if (OPAL_CRS_RESTART_PRE == state ) {
        ;
    }
    /******** Restart Recovery ********/
    else if (OPAL_CRS_RESTART == state ) {
        OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle,
                             "App) Initalized for Application %s (Restart)\n",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* Drop the descriptors and paths carried over in the restarted image */
        if( 0 <= app_comm_pipe_r_fd ) {
            close(app_comm_pipe_r_fd);
            app_comm_pipe_r_fd = -1;
        }
        if( 0 <= app_comm_pipe_w_fd ) {
            close(app_comm_pipe_w_fd);
            app_comm_pipe_w_fd = -1;
        }

        if( NULL != app_comm_pipe_r ) {
            remove(app_comm_pipe_r);
            free(app_comm_pipe_r);
            app_comm_pipe_r = NULL;
        }
        if( NULL != app_comm_pipe_w ) {
            remove(app_comm_pipe_w);
            free(app_comm_pipe_w);
            app_comm_pipe_w = NULL;
        }

        /*
         * Rebuild the pipe names from the current PID.
         * asprintf leaves its output pointer unspecified on failure, so each
         * pointer is reset to NULL before bailing out.
         */
        /* String representation of the PID */
        if( 0 > asprintf(&tmp_pid, "%d", getpid()) ) {
            tmp_pid = NULL;
            opal_output(mca_snapc_full_component.super.output_handle,
                        "App) ft_event: Error: Failed to allocate the PID string\n");
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
        if( 0 > asprintf(&app_comm_pipe_r, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_R, tmp_pid) ) {
            app_comm_pipe_r = NULL;
            opal_output(mca_snapc_full_component.super.output_handle,
                        "App) ft_event: Error: Failed to allocate the read pipe name\n");
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
        if( 0 > asprintf(&app_comm_pipe_w, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_W, tmp_pid) ) {
            app_comm_pipe_w = NULL;
            opal_output(mca_snapc_full_component.super.output_handle,
                        "App) ft_event: Error: Failed to allocate the write pipe name\n");
            exit_status = OPAL_ERROR;
            goto cleanup;
        }

        /*
         * Setup a signal handler to catch and start the proper thread
         * to handle the checkpoint
         */
        if( SIG_ERR == signal(opal_cr_entry_point_signal, snapc_full_app_signal_handler) ) {
            opal_output(mca_snapc_full_component.super.output_handle,
                        "App) init: Error: Failed to register signal %d\n",
                        opal_cr_entry_point_signal);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }

        OPAL_OUTPUT_VERBOSE((15, mca_snapc_full_component.super.output_handle,
                             "App) Named Pipes (%s) (%s), Signal (%d)",
                             app_comm_pipe_r, app_comm_pipe_w, opal_cr_entry_point_signal));
    }
    /******** Termination ********/
    else if (OPAL_CRS_TERM == state ) {
        ; /* Nothing */
    }
    /******** Error State ********/
    else {
        ; /* Nothing */
    }

 cleanup:
    /* The PID string is only a scratch buffer; release it on every path */
    free(tmp_pid);
    tmp_pid = NULL;

    return exit_status;
}

Просмотреть файл

@ -46,7 +46,7 @@ static orte_snapc_base_module_t loc_module = {
orte_snapc_full_module_finalize,
orte_snapc_full_setup_job,
orte_snapc_full_release_job,
orte_snapc_base_none_ft_event
orte_snapc_full_ft_event
};
/*
@ -265,6 +265,23 @@ int orte_snapc_full_release_job(orte_jobid_t jobid) {
return exit_status;
}
/*
 * Fault tolerance event entry point for the "full" snapshot coordinator.
 *
 * Forwards the event to app_coord_ft_event() when this process is an
 * application coordinator. For every other coordinator type (global,
 * local, or unrecognized) nothing is done and ORTE_SUCCESS is returned.
 */
int orte_snapc_full_ft_event(int state) {
    int rc = ORTE_SUCCESS;

    /* Only the application coordinator reacts to this event */
    if (ORTE_SNAPC_APP_COORD_TYPE == orte_snapc_coord_type) {
        rc = app_coord_ft_event(state);
    }

    return rc;
}
/******************
* Local functions
******************/