This commit replaces the use of the deprecated cr_request_file() with the cr_request_checkpoint() interface to BLCR. Additional configure checks are added to select the best checkpointing interface available in the BLCR installed on the system (default: cr_request_checkpoint()).
This commit fixes trac:1691. Thanks to Matthias Hovestadt for identifying this issue. This commit was SVN r20114. The following Trac tickets were found above: Ticket 1691 --> https://svn.open-mpi.org/trac/ompi/ticket/1691
Этот коммит содержится в:
родитель
6a5454b76a
Коммит
ce8d18bfda
@ -106,28 +106,51 @@ AC_DEFUN([MCA_crs_blcr_CONFIG],[
|
||||
|
||||
|
||||
#
|
||||
# Check for version >= 0.6.0 which has:
|
||||
# Check for version difference which may have:
|
||||
# - working cr_request_file
|
||||
# - working cr_request_checkpoint (which should be used instead of cr_request_file)
|
||||
# - 'requester' parameter to checkpoint_info
|
||||
#
|
||||
AS_IF([test "$check_crs_blcr_good" != "yes"], [$2], [
|
||||
#
|
||||
# First look for the cr_request_file function
|
||||
#
|
||||
crs_blcr_have_working_cr_request=0
|
||||
AC_MSG_CHECKING(for BLCR working cr_request)
|
||||
AC_TRY_COMPILE([#include <libcr.h>],
|
||||
[#if CR_RELEASE_MAJOR <= 0 && CR_RELEASE_MINOR < 6
|
||||
#error Version earlier than 0.6.0
|
||||
#endif
|
||||
],
|
||||
[crs_blcr_have_working_cr_request=1
|
||||
AC_MSG_RESULT([yes])],
|
||||
OMPI_CHECK_FUNC_LIB([cr_request_file],[cr],
|
||||
[AC_TRY_COMPILE([#include <libcr.h>],
|
||||
[#if CR_RELEASE_MAJOR <= 0 && CR_RELEASE_MINOR < 6
|
||||
#error Version earlier than 0.6.0
|
||||
#endif
|
||||
],
|
||||
[crs_blcr_have_working_cr_request=1
|
||||
],
|
||||
[crs_blcr_have_working_cr_request=0
|
||||
AC_MSG_WARN([This BLCR version does not contain a known working version of cr_request_file])
|
||||
])],
|
||||
[crs_blcr_have_working_cr_request=0
|
||||
AC_MSG_RESULT([no])
|
||||
AC_MSG_WARN([This BLCR version does not contain a known working version of cr_request])
|
||||
])
|
||||
AC_MSG_WARN([This BLCR version does not contain the cr_request_file function])
|
||||
])
|
||||
AC_DEFINE_UNQUOTED([CRS_BLCR_HAVE_CR_REQUEST], [$crs_blcr_have_working_cr_request],
|
||||
[BLCR cr_request check])
|
||||
[BLCR cr_request_file check])
|
||||
|
||||
#
|
||||
# Look for the cr_request_checkpoint function
|
||||
#
|
||||
crs_blcr_have_cr_request_checkpoint=0
|
||||
AC_MSG_CHECKING(for BLCR cr_request_checkpoint)
|
||||
OMPI_CHECK_FUNC_LIB([cr_request_checkpoint],[cr],
|
||||
[crs_blcr_have_cr_request_checkpoint=1
|
||||
],
|
||||
[crs_blcr_have_cr_request_checkpoint=0
|
||||
AC_MSG_WARN([This BLCR version does not contain the cr_request_checkpoint function])
|
||||
])
|
||||
AC_DEFINE_UNQUOTED([CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT], [$crs_blcr_have_cr_request_checkpoint],
|
||||
[BLCR cr_request_checkpoint check])
|
||||
|
||||
#
|
||||
# Look for the cr_checkpoint_info.requester member
|
||||
#
|
||||
crs_blcr_have_info_requester=0
|
||||
AC_CHECK_MEMBER([struct cr_checkpoint_info.requester],
|
||||
[crs_blcr_have_info_requester=1],
|
||||
|
@ -25,6 +25,8 @@
|
||||
#include <errno.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/show_help.h"
|
||||
@ -272,6 +274,10 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
||||
{
|
||||
int ret, exit_status = OPAL_SUCCESS;
|
||||
opal_crs_blcr_snapshot_t *snapshot = OBJ_NEW(opal_crs_blcr_snapshot_t);
|
||||
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
|
||||
cr_checkpoint_args_t cr_args;
|
||||
static cr_checkpoint_handle_t cr_handle = (cr_checkpoint_handle_t)(-1);
|
||||
#endif
|
||||
|
||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
||||
"crs:blcr: checkpoint(%d, ---)", pid);
|
||||
@ -301,13 +307,10 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
||||
}
|
||||
|
||||
/*
|
||||
* If we can checkpointing ourselves do so
|
||||
* Note:
|
||||
* If threading based checkpoint is enabled we cannot use the cr_request()
|
||||
* function to checkpoint ourselves. If we are a thread, then it is likely
|
||||
* that we have not properly initalized this module.
|
||||
* If we can checkpointing ourselves do so:
|
||||
* use cr_request_checkpoint() if available, and cr_request_file() if not
|
||||
*/
|
||||
#if CRS_BLCR_HAVE_CR_REQUEST == 1
|
||||
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1 || CRS_BLCR_HAVE_CR_REQUEST == 1
|
||||
if( pid == my_pid ) {
|
||||
char *loc_fname = NULL;
|
||||
|
||||
@ -318,6 +321,65 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
||||
"crs:blcr: checkpoint SELF <%s>",
|
||||
loc_fname);
|
||||
|
||||
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
|
||||
{
|
||||
int fd = 0;
|
||||
fd = open(loc_fname,
|
||||
O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE,
|
||||
S_IRUSR | S_IWUSR);
|
||||
if( fd < 0 ) {
|
||||
*state = OPAL_CRS_ERROR;
|
||||
opal_output(mca_crs_blcr_component.super.output_handle,
|
||||
"crs:blcr: checkpoint(): Error: Unable to open checkpoint file (%s) for pid (%d)",
|
||||
loc_fname, pid);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cr_initialize_checkpoint_args_t(&cr_args);
|
||||
cr_args.cr_scope = CR_SCOPE_PROC;
|
||||
cr_args.cr_fd = fd;
|
||||
|
||||
ret = cr_request_checkpoint(&cr_args, &cr_handle);
|
||||
if( ret < 0 ) {
|
||||
close(cr_args.cr_fd);
|
||||
*state = OPAL_CRS_ERROR;
|
||||
opal_output(mca_crs_blcr_component.super.output_handle,
|
||||
"crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s)",
|
||||
pid, loc_fname);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Wait for checkpoint to finish */
|
||||
do {
|
||||
ret = cr_poll_checkpoint(&cr_handle, NULL);
|
||||
if( ret < 0 ) {
|
||||
/* Check if restarting. This is not an error. */
|
||||
if( (ret == CR_POLL_CHKPT_ERR_POST) && (errno == CR_ERESTARTED) ) {
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
/* If Call was interrupted by a signal, retry the call */
|
||||
else if (errno == EINTR) {
|
||||
;
|
||||
}
|
||||
/* Otherwise this is a real error that we need to deal with */
|
||||
else {
|
||||
*state = OPAL_CRS_ERROR;
|
||||
opal_output(mca_crs_blcr_component.super.output_handle,
|
||||
"crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s) - poll failed with (%d)",
|
||||
pid, loc_fname, ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
} while( ret < 0 );
|
||||
|
||||
/* Close the file */
|
||||
close(cr_args.cr_fd);
|
||||
}
|
||||
#else
|
||||
/* Request a checkpoint be taken of the current process.
|
||||
* Since we are not guaranteed to finish the checkpoint before this
|
||||
* returns, we also need to wait for it.
|
||||
@ -328,6 +390,7 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
||||
do {
|
||||
usleep(1000); /* JJH Do we really want to sleep? */
|
||||
} while(CR_STATE_IDLE != cr_status());
|
||||
#endif
|
||||
|
||||
*state = blcr_current_state;
|
||||
free(loc_fname);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user