CRS/CRIU: add code to actually checkpoint a process
This adds the code to actually checkpoint a process using CRIU, together with the variables needed to control its behaviour. Right now only --np 1 and --mca oob tcp are supported. The following parameters are supported:
* crs_criu_log: name of the log file
* crs_criu_log_level: verbosity level in the log file
* crs_criu_tcp_established: C/R established TCP connections
* crs_criu_shell_job: C/R shell jobs
* crs_criu_ext_unix_sk: allow external unix connections
* crs_criu_leave_running: leave tasks in running state after checkpoint
This commit was SVN r30772.
Этот коммит содержится в:
родитель
04172e47c3
Коммит
d7734ac6d8
@ -36,10 +36,25 @@
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
#define LOG_FILE ("criu.log")
|
||||
|
||||
/* Local Component structures */
|
||||
/*
 * State of the CRIU CRS component: the base CRS component followed by the
 * CRIU settings that are registered as MCA variables and handed to the
 * criu_set_*() calls when a checkpoint is taken.
 *
 * Member order matters: the component instance is filled in with a
 * positional initializer.
 */
struct opal_crs_criu_component_t {
|
||||
/** Base CRS component */
|
||||
/* Base CRS component */
|
||||
opal_crs_base_component_t super;
|
||||
|
||||
/* Name of the CRIU log file (MCA variable "log"); passed to criu_set_log_file() */
|
||||
char *log_file;
|
||||
/* CRIU log verbosity (MCA variable "log_level"); passed to criu_set_log_level() */
|
||||
int log_level;
|
||||
/* Checkpoint/restore established TCP connections; passed to criu_set_tcp_established() */
|
||||
bool tcp_established;
|
||||
/* Allow dumping and restoring shell jobs; passed to criu_set_shell_job() */
|
||||
bool shell_job;
|
||||
/* Allow external unix connections; passed to criu_set_ext_unix_sk() */
|
||||
bool ext_unix_sk;
|
||||
/* Leave tasks in running state after the checkpoint; passed to criu_set_leave_running() */
|
||||
bool leave_running;
|
||||
};
|
||||
typedef struct opal_crs_criu_component_t opal_crs_criu_component_t;
|
||||
|
||||
|
@ -64,7 +64,19 @@ opal_crs_criu_component_t mca_crs_criu_component = {
|
||||
0,
|
||||
/* opal_output handler */
|
||||
-1
|
||||
}
|
||||
},
|
||||
/* criu log file */
|
||||
LOG_FILE,
|
||||
/* criu log level */
|
||||
0,
|
||||
/* criu tcp established */
|
||||
true,
|
||||
/* criu shell job */
|
||||
true,
|
||||
/* criu external unix sockets */
|
||||
true,
|
||||
/* criu leave tasks in running state after checkpoint */
|
||||
true
|
||||
};
|
||||
|
||||
static int crs_criu_register(void)
|
||||
@ -90,11 +102,72 @@ static int crs_criu_register(void)
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_crs_criu_component.super.verbose);
|
||||
|
||||
if (0 > ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = mca_base_component_var_register(component, "log", "Name of CRIU logfile (default: criu.log)",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_crs_criu_component.log_file);
|
||||
|
||||
if (0 > ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = mca_base_component_var_register(component, "log_level",
|
||||
"Verbose level for the CRS criu component (default: 0)",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_crs_criu_component.log_level);
|
||||
|
||||
if (0 > ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = mca_base_component_var_register(component, "tcp_established",
|
||||
"Checkpoint/restore established TCP connections (default: true)",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_crs_criu_component.tcp_established);
|
||||
|
||||
if (0 > ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = mca_base_component_var_register(component, "shell_job",
|
||||
"Allow to dump and restore shell jobs (default: true)",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_crs_criu_component.shell_job);
|
||||
|
||||
if (0 > ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = mca_base_component_var_register(component, "ext_unix_sk",
|
||||
"Allow external unix connections (default: true)",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_crs_criu_component.ext_unix_sk);
|
||||
|
||||
if (0 > ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = mca_base_component_var_register(component, "leave_running",
|
||||
"Leave tasks in running state after checkpoint (default: true)",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_crs_criu_component.leave_running);
|
||||
|
||||
return (0 > ret) ? ret : OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static int crs_criu_open(void)
|
||||
{
|
||||
int oh;
|
||||
|
||||
/* If there is a custom verbose level for this component then use it
|
||||
* otherwise take our parents level and output channel
|
||||
*/
|
||||
@ -106,17 +179,27 @@ static int crs_criu_open(void)
|
||||
mca_crs_criu_component.super.output_handle = opal_crs_base_framework.framework_output;
|
||||
}
|
||||
|
||||
oh = mca_crs_criu_component.super.output_handle;
|
||||
/*
|
||||
* Debug output
|
||||
*/
|
||||
opal_output_verbose(10, mca_crs_criu_component.super.output_handle,
|
||||
"crs:criu: open()");
|
||||
opal_output_verbose(20, mca_crs_criu_component.super.output_handle,
|
||||
"crs:criu: open: priority = %d",
|
||||
opal_output_verbose(10, oh, "crs:criu: open()");
|
||||
opal_output_verbose(20, oh, "crs:criu: open: priority = %d",
|
||||
mca_crs_criu_component.super.priority);
|
||||
opal_output_verbose(20, mca_crs_criu_component.super.output_handle,
|
||||
"crs:criu: open: verbosity = %d",
|
||||
opal_output_verbose(20, oh, "crs:criu: open: verbosity = %d",
|
||||
mca_crs_criu_component.super.verbose);
|
||||
opal_output_verbose(20, oh, "crs:criu: open: log_file = %s",
|
||||
mca_crs_criu_component.log_file);
|
||||
opal_output_verbose(20, oh, "crs:criu: open: log_level = %d",
|
||||
mca_crs_criu_component.log_level);
|
||||
opal_output_verbose(20, oh, "crs:criu: open: tcp_established = %d",
|
||||
mca_crs_criu_component.tcp_established);
|
||||
opal_output_verbose(20, oh, "crs:criu: open: shell_job = %d",
|
||||
mca_crs_criu_component.shell_job);
|
||||
opal_output_verbose(20, oh, "crs:criu: open: ext_unix_sk = %d",
|
||||
mca_crs_criu_component.ext_unix_sk);
|
||||
opal_output_verbose(20, oh, "crs:criu: open: leave_running = %d",
|
||||
mca_crs_criu_component.leave_running);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -20,6 +20,11 @@
|
||||
|
||||
#include "opal_config.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
@ -63,7 +68,6 @@ OBJ_CLASS_DECLARATION(opal_crs_criu_snapshot_t);
|
||||
/*
 * CRIU-specific snapshot object; extends the base CRS snapshot so that a
 * pointer to it can be cast to and from opal_crs_base_snapshot_t.
 */
struct opal_crs_criu_snapshot_t {
|
||||
/* Base CRS snapshot type */
|
||||
opal_crs_base_snapshot_t super;
|
||||
/* Heap string owned by the snapshot; the destructor releases it with free().
 * NOTE(review): this view is a diff and the member may be removed by the
 * commit -- confirm against the checked-out file. */
char *context_filename;
|
||||
};
|
||||
typedef struct opal_crs_criu_snapshot_t opal_crs_criu_snapshot_t;
|
||||
|
||||
@ -77,16 +81,11 @@ OBJ_CLASS_INSTANCE(opal_crs_criu_snapshot_t,
|
||||
|
||||
/*
 * Constructor for a CRIU snapshot object: clears context_filename and stores
 * a strdup()'d copy of this component's name in the base snapshot.
 */
void opal_crs_criu_construct(opal_crs_criu_snapshot_t *snapshot)
|
||||
{
|
||||
snapshot->context_filename = NULL;
|
||||
/* heap copy; the strdup() result is not checked for NULL here */
snapshot->super.component_name = strdup(mca_crs_criu_component.super.base_version.mca_component_name);
|
||||
}
|
||||
|
||||
/*
 * Destructor for a CRIU snapshot object: releases context_filename when it
 * is set and resets the pointer to NULL.
 */
void opal_crs_criu_destruct(opal_crs_criu_snapshot_t *snapshot)
|
||||
{
|
||||
if (NULL != snapshot->context_filename) {
|
||||
free(snapshot->context_filename);
|
||||
/* reset so the pointer is not left dangling after the free */
snapshot->context_filename = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
int opal_crs_criu_component_query(mca_base_module_t **module, int *priority)
|
||||
@ -116,24 +115,110 @@ int opal_crs_criu_module_finalize(void)
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Print a human-readable message for a failed CRIU call on output stream 0.
 *
 * ret: negative errno-style value returned by the CRIU call
 * pid: process the request was made for; only used to tag the message
 *
 * Any value that is not one of the handled codes is reported as unknown.
 */
static void criu_error(int ret, pid_t pid)
{
    /* pid_t is not guaranteed to be int, so cast for the %d conversion */
    const int p = (int)pid;

    switch (ret) {
    case -EBADE:
        opal_output(0, "crs:criu:(PID:%d):RPC has returned fail", p);
        break;
    case -ECONNREFUSED:
        opal_output(0, "crs:criu:(PID:%d):Unable to connect to CRIU", p);
        break;
    case -ECOMM:
        opal_output(0, "crs:criu:(PID:%d):Unable to send/recv msg to/from CRIU", p);
        break;
    case -EINVAL:
        /* the trailing space keeps the two concatenated sentences apart */
        opal_output(0, "crs:criu:(PID:%d):CRIU doesn't support this type of request. "
                    "You should probably update CRIU", p);
        break;
    case -EBADMSG:
        opal_output(0, "crs:criu:(PID:%d):Unexpected response from CRIU. "
                    "You should probably update CRIU", p);
        break;
    default:
        opal_output(0, "crs:criu:(PID:%d):Unknown error type code. "
                    "You should probably update CRIU", p);
        break;
    }
}
|
||||
|
||||
int opal_crs_criu_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
||||
opal_crs_base_ckpt_options_t *options,
|
||||
opal_crs_state_type_t *state)
|
||||
{
|
||||
int ret;
|
||||
int fd = 0;
|
||||
int oh = mca_crs_criu_component.super.output_handle;
|
||||
opal_crs_criu_snapshot_t *snapshot = NULL;
|
||||
char *dest = NULL;
|
||||
|
||||
opal_output_verbose(10, mca_crs_criu_component.super.output_handle,
|
||||
"crs:criu: checkpoint(%d, ---)", pid);
|
||||
opal_output_verbose(10, oh, "crs:criu: checkpoint(%d, ---)", pid);
|
||||
|
||||
snapshot = (opal_crs_criu_snapshot_t *)base_snapshot;
|
||||
snapshot->super.component_name = strdup(mca_crs_criu_component.super.base_version.mca_component_name);
|
||||
|
||||
if (NULL == snapshot->super.metadata) {
|
||||
if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a"))) {
|
||||
opal_output(oh, "crs:criu: checkpoint(): Error: Unable to open the file (%s)",
|
||||
snapshot->super.metadata_filename);
|
||||
*state = OPAL_CRS_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name);
|
||||
|
||||
fclose(snapshot->super.metadata);
|
||||
snapshot->super.metadata = NULL;
|
||||
|
||||
ret = criu_init_opts();
|
||||
|
||||
opal_output_verbose(10, mca_crs_criu_component.super.output_handle,
|
||||
"crs:criu: criu_init_opts() returned %d", ret);
|
||||
if (ret < 0) {
|
||||
criu_error(ret, pid);
|
||||
*state = OPAL_CRS_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
opal_output_verbose(10, oh, "crs:criu: criu_init_opts() returned %d", ret);
|
||||
|
||||
dest = snapshot->super.snapshot_directory;
|
||||
opal_output_verbose(10, oh, "crs:criu: opening snapshot directory %s", dest);
|
||||
fd = open(dest, O_DIRECTORY);
|
||||
|
||||
if (fd < 0) {
|
||||
*state = OPAL_CRS_ERROR;
|
||||
opal_output(oh, "crs:criu: checkpoint(): Error: Unable to open checkpoint "
|
||||
"directory (%s) for pid (%d)", dest, pid);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* http://criu.org/C_API */
|
||||
criu_set_images_dir_fd(fd);
|
||||
criu_set_pid(pid);
|
||||
|
||||
criu_set_log_file(mca_crs_criu_component.log_file);
|
||||
criu_set_log_level(mca_crs_criu_component.log_level);
|
||||
criu_set_tcp_established(mca_crs_criu_component.tcp_established);
|
||||
criu_set_shell_job(mca_crs_criu_component.shell_job);
|
||||
criu_set_ext_unix_sk(mca_crs_criu_component.ext_unix_sk);
|
||||
criu_set_leave_running(mca_crs_criu_component.leave_running);
|
||||
ret = criu_dump();
|
||||
|
||||
if (ret < 0) {
|
||||
criu_error(ret, pid);
|
||||
*state = OPAL_CRS_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
*state = OPAL_CRS_CONTINUE;
|
||||
|
||||
cleanup:
|
||||
|
||||
if (fd > 0) {
|
||||
close(fd);
|
||||
}
|
||||
|
||||
if (OPAL_CRS_ERROR == *state) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user