1
1

Remove an old workaround in which we had to 'mv' the checkpoint file after it

was taken from the $CWD to the storage directory. Now we just store directly
to the storage directory which can reduce NFS traffic if working in that mode.

A slight performance boost, but at the point you are using NFS you are paying
a penalty anyway. Now you just don't have to pay it twice :)

This commit was SVN r16099.
Этот коммит содержится в:
Josh Hursey 2007-09-12 15:03:21 +00:00
родитель f80ea093a2
Коммит b4735c9719

Просмотреть файл

@ -85,13 +85,13 @@ OBJ_CLASS_INSTANCE(opal_crs_blcr_snapshot_t,
/****************** /******************
* Local Functions * Local Functions
******************/ ******************/
static int blcr_move(char *src, char *dest); static int blcr_chmod(char *dest);
static int blcr_checkpoint_peer(pid_t pid, char ** fname); static int blcr_checkpoint_peer(pid_t pid, char * local_dir, char ** fname);
static int blcr_get_checkpoint_filename(char **fname, pid_t pid); static int blcr_get_checkpoint_filename(char **fname, pid_t pid);
static int opal_crs_blcr_thread_callback(void *arg); static int opal_crs_blcr_thread_callback(void *arg);
static int opal_crs_blcr_signal_callback(void *arg); static int opal_crs_blcr_signal_callback(void *arg);
static int opal_crs_blcr_checkpoint_cmd(pid_t pid, char **fname, char **cmd); static int opal_crs_blcr_checkpoint_cmd(pid_t pid, char * local_dir, char **fname, char **cmd);
static int opal_crs_blcr_restart_cmd(char *fname, char **cmd); static int opal_crs_blcr_restart_cmd(char *fname, char **cmd);
static int blcr_update_snapshot_metadata(opal_crs_blcr_snapshot_t *snapshot); static int blcr_update_snapshot_metadata(opal_crs_blcr_snapshot_t *snapshot);
@ -293,7 +293,7 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
* Checkpointing another process * Checkpointing another process
*/ */
{ {
ret = blcr_checkpoint_peer(pid, &(snapshot->context_filename)); ret = blcr_checkpoint_peer(pid, snapshot->super.local_location, &(snapshot->context_filename));
if(OPAL_SUCCESS != ret) { if(OPAL_SUCCESS != ret) {
*state = OPAL_CRS_ERROR; *state = OPAL_CRS_ERROR;
@ -312,10 +312,10 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
* Update the snapshot structure * Update the snapshot structure
*/ */
asprintf(&tmp_str, "%s/%s", snapshot->super.local_location, snapshot->context_filename); asprintf(&tmp_str, "%s/%s", snapshot->super.local_location, snapshot->context_filename);
if (0 != (ret = blcr_move(snapshot->context_filename, tmp_str))) { if (0 != (ret = blcr_chmod(tmp_str))) {
*state = OPAL_CRS_ERROR; *state = OPAL_CRS_ERROR;
opal_output(mca_crs_blcr_component.super.output_handle, opal_output(mca_crs_blcr_component.super.output_handle,
"crs:blcr: checkpoint(): Error: Unable to move the checkpoint file (%s to the approprate directory (%s) :[%d].", "crs:blcr: checkpoint(): Error: Unable to chmod the checkpoint file (%s in the directory (%s) :[%d].",
snapshot->context_filename, tmp_str, ret); snapshot->context_filename, tmp_str, ret);
perror("crs:blcr: checkpoint"); perror("crs:blcr: checkpoint");
free(tmp_str); free(tmp_str);
@ -484,7 +484,7 @@ int opal_crs_blcr_enable_checkpoint(void)
/***************************** /*****************************
* Local Function Definitions * Local Function Definitions
*****************************/ *****************************/
static int blcr_checkpoint_peer(pid_t pid, char ** fname) static int blcr_checkpoint_peer(pid_t pid, char * local_dir, char ** fname)
{ {
char **cr_argv = NULL; char **cr_argv = NULL;
char *cr_cmd = NULL; char *cr_cmd = NULL;
@ -499,7 +499,7 @@ static int blcr_checkpoint_peer(pid_t pid, char ** fname)
/* /*
* Get the checkpoint command * Get the checkpoint command
*/ */
if ( OPAL_SUCCESS != (ret = opal_crs_blcr_checkpoint_cmd(pid, fname, &cr_cmd)) ) { if ( OPAL_SUCCESS != (ret = opal_crs_blcr_checkpoint_cmd(pid, local_dir, fname, &cr_cmd)) ) {
opal_output(mca_crs_blcr_component.super.output_handle, opal_output(mca_crs_blcr_component.super.output_handle,
"crs:blcr: checkpoint_peer: Failed to generate checkpoint command :(%d):", ret); "crs:blcr: checkpoint_peer: Failed to generate checkpoint command :(%d):", ret);
exit_status = ret; exit_status = ret;
@ -620,18 +620,21 @@ static int opal_crs_blcr_signal_callback(void *arg) {
return 0; return 0;
} }
static int opal_crs_blcr_checkpoint_cmd(pid_t pid, char **fname, char **cmd) static int opal_crs_blcr_checkpoint_cmd(pid_t pid, char * local_dir, char **fname, char **cmd)
{ {
char **cr_argv = NULL; char **cr_argv = NULL;
int argc = 0, ret; int argc = 0, ret;
char * pid_str; char * pid_str;
int exit_status = OPAL_SUCCESS; int exit_status = OPAL_SUCCESS;
char * loc_fname = NULL;
blcr_get_checkpoint_filename(fname, pid); blcr_get_checkpoint_filename(fname, pid);
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle, opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
"crs:blcr: checkpoint_cmd(%d)", pid); "crs:blcr: checkpoint_cmd(%d)", pid);
asprintf(&loc_fname, "%s/%s", local_dir, *fname);
/* /*
* Build the command * Build the command
*/ */
@ -656,7 +659,7 @@ static int opal_crs_blcr_checkpoint_cmd(pid_t pid, char **fname, char **cmd)
goto cleanup; goto cleanup;
} }
if (OPAL_SUCCESS != (ret = opal_argv_append(&argc, &cr_argv, strdup(*fname)))) { if (OPAL_SUCCESS != (ret = opal_argv_append(&argc, &cr_argv, strdup(loc_fname)))) {
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -671,6 +674,8 @@ static int opal_crs_blcr_checkpoint_cmd(pid_t pid, char **fname, char **cmd)
free(pid_str); free(pid_str);
if( NULL != cr_argv) if( NULL != cr_argv)
opal_argv_free(cr_argv); opal_argv_free(cr_argv);
if(NULL != loc_fname)
free(loc_fname);
return exit_status; return exit_status;
} }
@ -806,26 +811,12 @@ static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot) {
} }
/* /*
* As it turns out 'rename' doesn't work across filesystems :( * Change permissions on the file so we can read it
* So just do the system call to mv for the moment,
*/ */
static int blcr_move(char *src, char *dest) { static int blcr_chmod(char *dest) {
char * command = NULL; char * command = NULL;
int ret = OPAL_SUCCESS; int ret = OPAL_SUCCESS;
/* JJH: Assume 'mv' is in the path */
asprintf(&command, "mv %s %s", src, dest);
if (0 != (ret = system(command) )) {
opal_output(mca_crs_blcr_component.super.output_handle,
"crs:blcr: move(): Error: Unable to move the file (%s) to (%s) :[%d].",
src, dest, ret);
perror("crs:blcr move");
goto error;
}
free(command);
/* JJH: Assume 'chmod' is in the path */ /* JJH: Assume 'chmod' is in the path */
asprintf(&command, "chmod u+rwX %s", dest); asprintf(&command, "chmod u+rwX %s", dest);
@ -838,6 +829,8 @@ static int blcr_move(char *src, char *dest) {
} }
error: error:
if( NULL != command ) {
free(command); free(command);
}
return ret; return ret;
} }