1
1

This commit replaces the deprecated cr_request_file() call with the cr_request_checkpoint() interface to BLCR. Additional configure checks are added to select the best checkpointing interface provided by the BLCR installed on the system (default: cr_request_checkpoint()).

This commit fixes trac:1691

Thanks to Matthias Hovestadt for identifying this issue.

This commit was SVN r20114.

The following Trac tickets were found above:
  Ticket 1691 --> https://svn.open-mpi.org/trac/ompi/ticket/1691
Этот коммит содержится в:
Josh Hursey 2008-12-11 00:08:34 +00:00
родитель 6a5454b76a
Коммит ce8d18bfda
2 изменённых файлов: 104 добавлений и 18 удалений

Просмотреть файл

@ -106,28 +106,51 @@ AC_DEFUN([MCA_crs_blcr_CONFIG],[
#
# Check for version >= 0.6.0 which has:
# Check for version difference which may have:
# - working cr_request_file
# - working cr_request_checkpoint (which should be used instead of cr_request_file)
# - 'requester' parameter to checkpoint_info
#
AS_IF([test "$check_crs_blcr_good" != "yes"], [$2], [
#
# First look for the cr_request_file function
#
crs_blcr_have_working_cr_request=0
AC_MSG_CHECKING(for BLCR working cr_request)
AC_TRY_COMPILE([#include <libcr.h>],
[#if CR_RELEASE_MAJOR <= 0 && CR_RELEASE_MINOR < 6
#error Version earlier than 0.6.0
#endif
],
[crs_blcr_have_working_cr_request=1
AC_MSG_RESULT([yes])],
OMPI_CHECK_FUNC_LIB([cr_request_file],[cr],
[AC_TRY_COMPILE([#include <libcr.h>],
[#if CR_RELEASE_MAJOR <= 0 && CR_RELEASE_MINOR < 6
#error Version earlier than 0.6.0
#endif
],
[crs_blcr_have_working_cr_request=1
],
[crs_blcr_have_working_cr_request=0
AC_MSG_WARN([This BLCR version does not contain a known working version of cr_request_file])
])],
[crs_blcr_have_working_cr_request=0
AC_MSG_RESULT([no])
AC_MSG_WARN([This BLCR version does not contain a known working version of cr_request])
])
AC_MSG_WARN([This BLCR version does not contain the cr_request_file function])
])
AC_DEFINE_UNQUOTED([CRS_BLCR_HAVE_CR_REQUEST], [$crs_blcr_have_working_cr_request],
[BLCR cr_request check])
[BLCR cr_request_file check])
#
# Look for the cr_request_checkpoint function
#
crs_blcr_have_cr_request_checkpoint=0
AC_MSG_CHECKING(for BLCR cr_request_checkpoint)
OMPI_CHECK_FUNC_LIB([cr_request_checkpoint],[cr],
[crs_blcr_have_cr_request_checkpoint=1
],
[crs_blcr_have_cr_request_checkpoint=0
AC_MSG_WARN([This BLCR version does not contain the cr_request_checkpoint function])
])
AC_DEFINE_UNQUOTED([CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT], [$crs_blcr_have_cr_request_checkpoint],
[BLCR cr_request_checkpoint check])
#
# Look for the cr_checkpoint_info.requester member
#
crs_blcr_have_info_requester=0
AC_CHECK_MEMBER([struct cr_checkpoint_info.requester],
[crs_blcr_have_info_requester=1],

Просмотреть файл

@ -25,6 +25,8 @@
#include <errno.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "opal/util/output.h"
#include "opal/util/show_help.h"
@ -272,6 +274,10 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
{
int ret, exit_status = OPAL_SUCCESS;
opal_crs_blcr_snapshot_t *snapshot = OBJ_NEW(opal_crs_blcr_snapshot_t);
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
cr_checkpoint_args_t cr_args;
static cr_checkpoint_handle_t cr_handle = (cr_checkpoint_handle_t)(-1);
#endif
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
"crs:blcr: checkpoint(%d, ---)", pid);
@ -301,13 +307,10 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
}
/*
* If we can checkpointing ourselves do so
* Note:
* If threading based checkpoint is enabled we cannot use the cr_request()
* function to checkpoint ourselves. If we are a thread, then it is likely
* that we have not properly initalized this module.
* If we can checkpointing ourselves do so:
* use cr_request_checkpoint() if available, and cr_request_file() if not
*/
#if CRS_BLCR_HAVE_CR_REQUEST == 1
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1 || CRS_BLCR_HAVE_CR_REQUEST == 1
if( pid == my_pid ) {
char *loc_fname = NULL;
@ -318,6 +321,65 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
"crs:blcr: checkpoint SELF <%s>",
loc_fname);
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
{
int fd = 0;
fd = open(loc_fname,
O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE,
S_IRUSR | S_IWUSR);
if( fd < 0 ) {
*state = OPAL_CRS_ERROR;
opal_output(mca_crs_blcr_component.super.output_handle,
"crs:blcr: checkpoint(): Error: Unable to open checkpoint file (%s) for pid (%d)",
loc_fname, pid);
exit_status = ret;
goto cleanup;
}
cr_initialize_checkpoint_args_t(&cr_args);
cr_args.cr_scope = CR_SCOPE_PROC;
cr_args.cr_fd = fd;
ret = cr_request_checkpoint(&cr_args, &cr_handle);
if( ret < 0 ) {
close(cr_args.cr_fd);
*state = OPAL_CRS_ERROR;
opal_output(mca_crs_blcr_component.super.output_handle,
"crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s)",
pid, loc_fname);
exit_status = ret;
goto cleanup;
}
/* Wait for checkpoint to finish */
do {
ret = cr_poll_checkpoint(&cr_handle, NULL);
if( ret < 0 ) {
/* Check if restarting. This is not an error. */
if( (ret == CR_POLL_CHKPT_ERR_POST) && (errno == CR_ERESTARTED) ) {
ret = 0;
break;
}
/* If Call was interrupted by a signal, retry the call */
else if (errno == EINTR) {
;
}
/* Otherwise this is a real error that we need to deal with */
else {
*state = OPAL_CRS_ERROR;
opal_output(mca_crs_blcr_component.super.output_handle,
"crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s) - poll failed with (%d)",
pid, loc_fname, ret);
exit_status = ret;
goto cleanup;
}
}
} while( ret < 0 );
/* Close the file */
close(cr_args.cr_fd);
}
#else
/* Request a checkpoint be taken of the current process.
* Since we are not guaranteed to finish the checkpoint before this
* returns, we also need to wait for it.
@ -328,6 +390,7 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
do {
usleep(1000); /* JJH Do we really want to sleep? */
} while(CR_STATE_IDLE != cr_status());
#endif
*state = blcr_current_state;
free(loc_fname);