This commit replaces the use of the deprecated cr_request_file() with the cr_request_checkpoint() interface to BLCR. Additional configure checks are added so that the best checkpointing interface provided by the BLCR installed on the system is used (default: cr_request_checkpoint()).
This commit fixes trac:1691. Thanks to Matthias Hovestadt for identifying this issue. This commit was SVN r20114. The following Trac tickets were found above: Ticket 1691 --> https://svn.open-mpi.org/trac/ompi/ticket/1691
Этот коммит содержится в:
родитель
6a5454b76a
Коммит
ce8d18bfda
@ -106,28 +106,51 @@ AC_DEFUN([MCA_crs_blcr_CONFIG],[
|
|||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Check for version >= 0.6.0 which has:
|
# Check for version difference which may have:
|
||||||
# - working cr_request_file
|
# - working cr_request_file
|
||||||
|
# - working cr_request_checkpoint (which should be used instead of cr_request_file)
|
||||||
# - 'requester' parameter to checkpoint_info
|
# - 'requester' parameter to checkpoint_info
|
||||||
#
|
#
|
||||||
AS_IF([test "$check_crs_blcr_good" != "yes"], [$2], [
|
AS_IF([test "$check_crs_blcr_good" != "yes"], [$2], [
|
||||||
|
#
|
||||||
|
# First look for the cr_request_file function
|
||||||
|
#
|
||||||
crs_blcr_have_working_cr_request=0
|
crs_blcr_have_working_cr_request=0
|
||||||
AC_MSG_CHECKING(for BLCR working cr_request)
|
AC_MSG_CHECKING(for BLCR working cr_request)
|
||||||
AC_TRY_COMPILE([#include <libcr.h>],
|
OMPI_CHECK_FUNC_LIB([cr_request_file],[cr],
|
||||||
[#if CR_RELEASE_MAJOR <= 0 && CR_RELEASE_MINOR < 6
|
[AC_TRY_COMPILE([#include <libcr.h>],
|
||||||
#error Version earlier than 0.6.0
|
[#if CR_RELEASE_MAJOR <= 0 && CR_RELEASE_MINOR < 6
|
||||||
#endif
|
#error Version earlier than 0.6.0
|
||||||
],
|
#endif
|
||||||
[crs_blcr_have_working_cr_request=1
|
],
|
||||||
AC_MSG_RESULT([yes])],
|
[crs_blcr_have_working_cr_request=1
|
||||||
|
],
|
||||||
|
[crs_blcr_have_working_cr_request=0
|
||||||
|
AC_MSG_WARN([This BLCR version does not contain a known working version of cr_request_file])
|
||||||
|
])],
|
||||||
[crs_blcr_have_working_cr_request=0
|
[crs_blcr_have_working_cr_request=0
|
||||||
AC_MSG_RESULT([no])
|
AC_MSG_WARN([This BLCR version does not contain the cr_request_file function])
|
||||||
AC_MSG_WARN([This BLCR version does not contain a known working version of cr_request])
|
])
|
||||||
])
|
|
||||||
AC_DEFINE_UNQUOTED([CRS_BLCR_HAVE_CR_REQUEST], [$crs_blcr_have_working_cr_request],
|
AC_DEFINE_UNQUOTED([CRS_BLCR_HAVE_CR_REQUEST], [$crs_blcr_have_working_cr_request],
|
||||||
[BLCR cr_request check])
|
[BLCR cr_request_file check])
|
||||||
|
|
||||||
|
#
|
||||||
|
# Look for the cr_request_checkpoint function
|
||||||
|
#
|
||||||
|
crs_blcr_have_cr_request_checkpoint=0
|
||||||
|
AC_MSG_CHECKING(for BLCR cr_request_checkpoint)
|
||||||
|
OMPI_CHECK_FUNC_LIB([cr_request_checkpoint],[cr],
|
||||||
|
[crs_blcr_have_cr_request_checkpoint=1
|
||||||
|
],
|
||||||
|
[crs_blcr_have_cr_request_checkpoint=0
|
||||||
|
AC_MSG_WARN([This BLCR version does not contain the cr_request_checkpoint function])
|
||||||
|
])
|
||||||
|
AC_DEFINE_UNQUOTED([CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT], [$crs_blcr_have_cr_request_checkpoint],
|
||||||
|
[BLCR cr_request_checkpoint check])
|
||||||
|
|
||||||
|
#
|
||||||
|
# Look for the cr_checkpoint_info.requester member
|
||||||
|
#
|
||||||
crs_blcr_have_info_requester=0
|
crs_blcr_have_info_requester=0
|
||||||
AC_CHECK_MEMBER([struct cr_checkpoint_info.requester],
|
AC_CHECK_MEMBER([struct cr_checkpoint_info.requester],
|
||||||
[crs_blcr_have_info_requester=1],
|
[crs_blcr_have_info_requester=1],
|
||||||
|
@ -25,6 +25,8 @@
|
|||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include <sys/wait.h>
|
#include <sys/wait.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/util/show_help.h"
|
#include "opal/util/show_help.h"
|
||||||
@ -272,6 +274,10 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
|||||||
{
|
{
|
||||||
int ret, exit_status = OPAL_SUCCESS;
|
int ret, exit_status = OPAL_SUCCESS;
|
||||||
opal_crs_blcr_snapshot_t *snapshot = OBJ_NEW(opal_crs_blcr_snapshot_t);
|
opal_crs_blcr_snapshot_t *snapshot = OBJ_NEW(opal_crs_blcr_snapshot_t);
|
||||||
|
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
|
||||||
|
cr_checkpoint_args_t cr_args;
|
||||||
|
static cr_checkpoint_handle_t cr_handle = (cr_checkpoint_handle_t)(-1);
|
||||||
|
#endif
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
||||||
"crs:blcr: checkpoint(%d, ---)", pid);
|
"crs:blcr: checkpoint(%d, ---)", pid);
|
||||||
@ -301,13 +307,10 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If we can checkpointing ourselves do so
|
* If we can checkpointing ourselves do so:
|
||||||
* Note:
|
* use cr_request_checkpoint() if available, and cr_request_file() if not
|
||||||
* If threading based checkpoint is enabled we cannot use the cr_request()
|
|
||||||
* function to checkpoint ourselves. If we are a thread, then it is likely
|
|
||||||
* that we have not properly initalized this module.
|
|
||||||
*/
|
*/
|
||||||
#if CRS_BLCR_HAVE_CR_REQUEST == 1
|
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1 || CRS_BLCR_HAVE_CR_REQUEST == 1
|
||||||
if( pid == my_pid ) {
|
if( pid == my_pid ) {
|
||||||
char *loc_fname = NULL;
|
char *loc_fname = NULL;
|
||||||
|
|
||||||
@ -318,6 +321,65 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
|||||||
"crs:blcr: checkpoint SELF <%s>",
|
"crs:blcr: checkpoint SELF <%s>",
|
||||||
loc_fname);
|
loc_fname);
|
||||||
|
|
||||||
|
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
|
||||||
|
{
|
||||||
|
int fd = 0;
|
||||||
|
fd = open(loc_fname,
|
||||||
|
O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE,
|
||||||
|
S_IRUSR | S_IWUSR);
|
||||||
|
if( fd < 0 ) {
|
||||||
|
*state = OPAL_CRS_ERROR;
|
||||||
|
opal_output(mca_crs_blcr_component.super.output_handle,
|
||||||
|
"crs:blcr: checkpoint(): Error: Unable to open checkpoint file (%s) for pid (%d)",
|
||||||
|
loc_fname, pid);
|
||||||
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
cr_initialize_checkpoint_args_t(&cr_args);
|
||||||
|
cr_args.cr_scope = CR_SCOPE_PROC;
|
||||||
|
cr_args.cr_fd = fd;
|
||||||
|
|
||||||
|
ret = cr_request_checkpoint(&cr_args, &cr_handle);
|
||||||
|
if( ret < 0 ) {
|
||||||
|
close(cr_args.cr_fd);
|
||||||
|
*state = OPAL_CRS_ERROR;
|
||||||
|
opal_output(mca_crs_blcr_component.super.output_handle,
|
||||||
|
"crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s)",
|
||||||
|
pid, loc_fname);
|
||||||
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Wait for checkpoint to finish */
|
||||||
|
do {
|
||||||
|
ret = cr_poll_checkpoint(&cr_handle, NULL);
|
||||||
|
if( ret < 0 ) {
|
||||||
|
/* Check if restarting. This is not an error. */
|
||||||
|
if( (ret == CR_POLL_CHKPT_ERR_POST) && (errno == CR_ERESTARTED) ) {
|
||||||
|
ret = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
/* If Call was interrupted by a signal, retry the call */
|
||||||
|
else if (errno == EINTR) {
|
||||||
|
;
|
||||||
|
}
|
||||||
|
/* Otherwise this is a real error that we need to deal with */
|
||||||
|
else {
|
||||||
|
*state = OPAL_CRS_ERROR;
|
||||||
|
opal_output(mca_crs_blcr_component.super.output_handle,
|
||||||
|
"crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s) - poll failed with (%d)",
|
||||||
|
pid, loc_fname, ret);
|
||||||
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while( ret < 0 );
|
||||||
|
|
||||||
|
/* Close the file */
|
||||||
|
close(cr_args.cr_fd);
|
||||||
|
}
|
||||||
|
#else
|
||||||
/* Request a checkpoint be taken of the current process.
|
/* Request a checkpoint be taken of the current process.
|
||||||
* Since we are not guaranteed to finish the checkpoint before this
|
* Since we are not guaranteed to finish the checkpoint before this
|
||||||
* returns, we also need to wait for it.
|
* returns, we also need to wait for it.
|
||||||
@ -328,6 +390,7 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
|||||||
do {
|
do {
|
||||||
usleep(1000); /* JJH Do we really want to sleep? */
|
usleep(1000); /* JJH Do we really want to sleep? */
|
||||||
} while(CR_STATE_IDLE != cr_status());
|
} while(CR_STATE_IDLE != cr_status());
|
||||||
|
#endif
|
||||||
|
|
||||||
*state = blcr_current_state;
|
*state = blcr_current_state;
|
||||||
free(loc_fname);
|
free(loc_fname);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user