Bring over the jjh-filem branch which contains a non-blocking FileM interface
and implementation. This has shown drastic performance benefit when transferring Many files at roughly the same time. I tested this for many different filem operations and everything was working fine. Let me know if you have any problems with this functionality. Some Notes: - opal-checkpoint now has a 'quiet' flag to keep it from being too verbose. - FileM RSH component is fully non-blocking. - FileM RSH component has incomming connection throttling since by default ssh only allows 10 concurrent scp connections to any single host. This default can be adjusted via an MCA parameter. {{{-mca filem_rsh_max_incomming 10}}} - There is an MCA parameter for max outgoing connections, but it is currently not implemented. If someone needs it then it should not be hard to implement. {{{-mca filem_rsh_max_outgoing 10}}} - Changed the FileM request structure so that it is a bit more explicit and flexible. - Moved the 'preload-binary' and 'preload-files' functionality into odls/base allowing for code reuse in the 'process' and 'default' ODLS components. - Fixed a bug in the process name resolution which broke the 'preload-*' functionality due to GPR table structure changes. - The FileM RSH component might be able to see even more speedup from using a thread pool to operate on the work_pool structures, but that is for future work. - Added a 'opal-show-help' file to ODLS Base This commit was SVN r16252.
Этот коммит содержится в:
родитель
670956e172
Коммит
e10f476c87
@ -92,6 +92,7 @@ typedef struct {
|
||||
int pid;
|
||||
bool term;
|
||||
bool verbose;
|
||||
bool quiet;
|
||||
char *snapshot_name;
|
||||
char *snapshot_loc;
|
||||
int output;
|
||||
@ -112,6 +113,12 @@ opal_cmd_line_init_t cmd_line_opts[] = {
|
||||
&opal_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Be Verbose" },
|
||||
|
||||
{ NULL, NULL, NULL,
|
||||
'q', NULL, "quiet",
|
||||
0,
|
||||
&opal_checkpoint_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Be Super Quiet" },
|
||||
|
||||
{ NULL, NULL, NULL,
|
||||
'\0', NULL, "term",
|
||||
0,
|
||||
@ -175,10 +182,12 @@ main(int argc, char *argv[])
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
opal_output(opal_checkpoint_globals.output,
|
||||
"Local Snapshot Reference = %s\n",
|
||||
fname);
|
||||
|
||||
if( !opal_checkpoint_globals.quiet ) {
|
||||
opal_output(opal_checkpoint_globals.output,
|
||||
"Local Snapshot Reference = %s\n",
|
||||
fname);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
/***************
|
||||
@ -215,6 +224,7 @@ static int initialize(int argc, char *argv[]) {
|
||||
* Setup OPAL Output handle from the verbose argument
|
||||
*/
|
||||
if( opal_checkpoint_globals.verbose ) {
|
||||
opal_checkpoint_globals.quiet = false; /* Automaticly turn off quiet if it is set */
|
||||
opal_checkpoint_globals.output = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(opal_checkpoint_globals.output, 10);
|
||||
} else {
|
||||
@ -364,15 +374,21 @@ notify_process_for_checkpoint(pid_t pid, char **fname, int term, opal_crs_state_
|
||||
*/
|
||||
if( 0 > (ret = access(prog_named_pipe_r, F_OK) )) {
|
||||
/* File doesn't exist yet, keep waiting */
|
||||
opal_output(0, "opal-checkpoint: File does not exit yet: <%s> rtn = %d (waited %d/%d sec)\n",
|
||||
prog_named_pipe_r, ret, s, max_wait_time);
|
||||
if( !opal_checkpoint_globals.quiet &&
|
||||
s >= max_wait_time - 5 ) {
|
||||
opal_output(0, "opal-checkpoint: File does not exist yet: <%s> rtn = %d (waited %d/%d sec)\n",
|
||||
prog_named_pipe_r, ret, s, max_wait_time);
|
||||
}
|
||||
sleep(1);
|
||||
continue;
|
||||
}
|
||||
else if( 0 > (ret = access(prog_named_pipe_w, F_OK) )) {
|
||||
/* File doesn't exist yet, keep waiting */
|
||||
opal_output(0, "opal-checkpoint: File does not exit yet: <%s> rtn = %d (waited %d/%d sec)\n",
|
||||
prog_named_pipe_w, ret, s, max_wait_time);
|
||||
if( !opal_checkpoint_globals.quiet &&
|
||||
s >= max_wait_time - 5 ) {
|
||||
opal_output(0, "opal-checkpoint: File does not exist yet: <%s> rtn = %d (waited %d/%d sec)\n",
|
||||
prog_named_pipe_w, ret, s, max_wait_time);
|
||||
}
|
||||
sleep(1);
|
||||
continue;
|
||||
}
|
||||
|
@ -36,6 +36,12 @@ extern "C" {
|
||||
/**
|
||||
* FileM request object maintenance functions
|
||||
*/
|
||||
ORTE_DECLSPEC void orte_filem_base_process_set_construct(orte_filem_base_process_set_t *obj);
|
||||
ORTE_DECLSPEC void orte_filem_base_process_set_destruct( orte_filem_base_process_set_t *obj);
|
||||
|
||||
ORTE_DECLSPEC void orte_filem_base_file_set_construct(orte_filem_base_file_set_t *obj);
|
||||
ORTE_DECLSPEC void orte_filem_base_file_set_destruct( orte_filem_base_file_set_t *obj);
|
||||
|
||||
ORTE_DECLSPEC void orte_filem_base_construct(orte_filem_base_request_t *obj);
|
||||
ORTE_DECLSPEC void orte_filem_base_destruct( orte_filem_base_request_t *obj);
|
||||
|
||||
@ -89,8 +95,13 @@ extern "C" {
|
||||
int orte_filem_base_module_finalize(void);
|
||||
|
||||
int orte_filem_base_none_put(orte_filem_base_request_t *request);
|
||||
int orte_filem_base_none_put_nb(orte_filem_base_request_t *request);
|
||||
int orte_filem_base_none_get(orte_filem_base_request_t *request);
|
||||
int orte_filem_base_none_get_nb(orte_filem_base_request_t *request);
|
||||
int orte_filem_base_none_rm( orte_filem_base_request_t *request);
|
||||
int orte_filem_base_none_rm_nb( orte_filem_base_request_t *request);
|
||||
int orte_filem_base_none_wait( orte_filem_base_request_t *request);
|
||||
int orte_filem_base_none_wait_all( opal_list_t *request_list);
|
||||
|
||||
/**
|
||||
* Some utility functions
|
||||
@ -109,6 +120,11 @@ extern "C" {
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata);
|
||||
|
||||
/**
|
||||
* Setup request structure
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_filem_base_prepare_request(orte_filem_base_request_t *request, int move_type);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -48,37 +48,97 @@
|
||||
/******************
|
||||
* Object Stuff
|
||||
******************/
|
||||
ORTE_DECLSPEC OBJ_CLASS_INSTANCE(orte_filem_base_process_set_t,
|
||||
opal_list_item_t,
|
||||
orte_filem_base_process_set_construct,
|
||||
orte_filem_base_process_set_destruct);
|
||||
|
||||
ORTE_DECLSPEC void orte_filem_base_process_set_construct(orte_filem_base_process_set_t *req) {
|
||||
req->source = *ORTE_NAME_INVALID;
|
||||
req->sink = *ORTE_NAME_INVALID;
|
||||
}
|
||||
|
||||
ORTE_DECLSPEC void orte_filem_base_process_set_destruct( orte_filem_base_process_set_t *req) {
|
||||
req->source = *ORTE_NAME_INVALID;
|
||||
req->sink = *ORTE_NAME_INVALID;
|
||||
}
|
||||
|
||||
ORTE_DECLSPEC OBJ_CLASS_INSTANCE(orte_filem_base_file_set_t,
|
||||
opal_list_item_t,
|
||||
orte_filem_base_file_set_construct,
|
||||
orte_filem_base_file_set_destruct);
|
||||
|
||||
ORTE_DECLSPEC void orte_filem_base_file_set_construct(orte_filem_base_file_set_t *req) {
|
||||
req->local_target = NULL;
|
||||
req->remote_target = NULL;
|
||||
req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
|
||||
|
||||
}
|
||||
|
||||
ORTE_DECLSPEC void orte_filem_base_file_set_destruct( orte_filem_base_file_set_t *req) {
|
||||
if( NULL != req->local_target ) {
|
||||
free(req->local_target);
|
||||
req->local_target = NULL;
|
||||
}
|
||||
|
||||
if( NULL != req->remote_target ) {
|
||||
free(req->remote_target);
|
||||
req->remote_target = NULL;
|
||||
}
|
||||
|
||||
req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
ORTE_DECLSPEC OBJ_CLASS_INSTANCE(orte_filem_base_request_t,
|
||||
opal_list_item_t,
|
||||
orte_filem_base_construct,
|
||||
orte_filem_base_destruct);
|
||||
|
||||
ORTE_DECLSPEC void orte_filem_base_construct(orte_filem_base_request_t *req) {
|
||||
req->num_targets = 0;
|
||||
OBJ_CONSTRUCT(&req->process_sets, opal_list_t);
|
||||
OBJ_CONSTRUCT(&req->file_sets, opal_list_t);
|
||||
|
||||
req->local_targets = NULL;
|
||||
req->remote_targets = NULL;
|
||||
req->target_flags = NULL;
|
||||
req->num_mv = 0;
|
||||
|
||||
req->proc_name = NULL;
|
||||
req->num_procs = 0;
|
||||
req->is_done = NULL;
|
||||
req->is_active = NULL;
|
||||
|
||||
req->exit_status = NULL;
|
||||
|
||||
req->movement_type = ORTE_FILEM_MOVE_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
ORTE_DECLSPEC void orte_filem_base_destruct( orte_filem_base_request_t *req) {
|
||||
int i;
|
||||
opal_list_item_t* item = NULL;
|
||||
|
||||
for(i = 0; i < req->num_targets; ++i) {
|
||||
free(req->local_targets[i]);
|
||||
free(req->remote_targets[i]);
|
||||
while( NULL != (item = opal_list_remove_first(&req->process_sets)) ) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
|
||||
free(req->target_flags);
|
||||
|
||||
req->num_targets = 0;
|
||||
|
||||
free(req->proc_name);
|
||||
|
||||
req->num_procs = 0;
|
||||
OBJ_DESTRUCT(&req->process_sets);
|
||||
|
||||
while( NULL != (item = opal_list_remove_first(&req->file_sets)) ) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&req->file_sets);
|
||||
|
||||
req->num_mv = 0;
|
||||
|
||||
if( NULL != req->is_done ) {
|
||||
free(req->is_done);
|
||||
req->is_done = NULL;
|
||||
}
|
||||
|
||||
if( NULL != req->is_active ) {
|
||||
free(req->is_active);
|
||||
req->is_active = NULL;
|
||||
}
|
||||
|
||||
if( NULL != req->exit_status ) {
|
||||
free(req->exit_status);
|
||||
req->exit_status = NULL;
|
||||
}
|
||||
|
||||
req->movement_type = ORTE_FILEM_MOVE_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
/***********************
|
||||
@ -109,16 +169,41 @@ int orte_filem_base_none_put(orte_filem_base_request_t *request )
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_filem_base_none_put_nb(orte_filem_base_request_t *request )
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_filem_base_none_get(orte_filem_base_request_t *request)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_filem_base_none_get_nb(orte_filem_base_request_t *request)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_filem_base_none_rm(orte_filem_base_request_t *request)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_filem_base_none_rm_nb(orte_filem_base_request_t *request)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_filem_base_none_wait(orte_filem_base_request_t *request)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_filem_base_none_wait_all(opal_list_t *request_list)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/********************
|
||||
* Utility functions
|
||||
********************/
|
||||
@ -156,15 +241,7 @@ int orte_filem_base_get_proc_node_name(orte_process_name_t *proc, char **machine
|
||||
/*
|
||||
* Contact GPR and get the 'orte-node-name' for this process
|
||||
*/
|
||||
/* if it is the root then we need a different key :/ */
|
||||
if(proc->jobid == 0 &&
|
||||
proc->vpid == 0) {
|
||||
keys[0] = ORTE_PROC_RML_CONTACT_KEY;
|
||||
}
|
||||
else {
|
||||
keys[0] = ORTE_NODE_NAME_KEY;
|
||||
}
|
||||
|
||||
keys[0] = ORTE_NODE_NAME_KEY;
|
||||
keys[1] = NULL;
|
||||
|
||||
/*
|
||||
@ -223,33 +300,6 @@ int orte_filem_base_get_proc_node_name(orte_process_name_t *proc, char **machine
|
||||
}
|
||||
}
|
||||
|
||||
if (proc->jobid == 0 && proc->vpid == 0) {
|
||||
/* we have contact info -- need an IP address. This assumes
|
||||
that we're using the OOB RML, but that's probably a safe
|
||||
enough assumption in here. */
|
||||
|
||||
char *contact_info = *machine_name;
|
||||
orte_process_name_t peer;
|
||||
char **uris;
|
||||
char *ip, *port;
|
||||
|
||||
*machine_name = NULL;
|
||||
ret = orte_rml_base_parse_uris(contact_info, &peer, &uris);
|
||||
free(contact_info);
|
||||
if (ORTE_SUCCESS == ret) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
ip = strrchr(uris[0], '/') + 1;
|
||||
port = strrchr(uris[0], ':');
|
||||
port[0] = '\0';
|
||||
|
||||
*machine_name = strdup(ip);
|
||||
|
||||
opal_argv_free(uris);
|
||||
}
|
||||
|
||||
if (NULL == *machine_name ){
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
@ -447,3 +497,49 @@ void orte_filem_base_query_callback(int status,
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
int orte_filem_base_prepare_request(orte_filem_base_request_t *request, int move_type)
|
||||
{
|
||||
int num_reqs = 0, i = 0;
|
||||
|
||||
if( ORTE_FILEM_MOVE_TYPE_RM == move_type ) {
|
||||
num_reqs = opal_list_get_size(&request->process_sets);
|
||||
}
|
||||
else {
|
||||
num_reqs = opal_list_get_size(&request->process_sets) * opal_list_get_size(&request->file_sets);
|
||||
}
|
||||
|
||||
if( 0 >= num_reqs ) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
else {
|
||||
if( NULL != request->is_done ) {
|
||||
free(request->is_done);
|
||||
request->is_done = NULL;
|
||||
}
|
||||
|
||||
if( NULL != request->is_active ) {
|
||||
free(request->is_active);
|
||||
request->is_active = NULL;
|
||||
}
|
||||
|
||||
if( NULL != request->exit_status ) {
|
||||
free(request->exit_status);
|
||||
request->exit_status = NULL;
|
||||
}
|
||||
|
||||
request->num_mv = num_reqs;
|
||||
request->is_done = (bool*) malloc(sizeof(bool) * num_reqs);
|
||||
request->is_active = (bool*) malloc(sizeof(bool) * num_reqs);
|
||||
request->exit_status = (int32_t*) malloc(sizeof(int32_t) * num_reqs);
|
||||
for( i = 0; i < num_reqs; ++i) {
|
||||
request->is_done[i] = false;
|
||||
request->is_active[i] = false;
|
||||
request->exit_status[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
request->movement_type = move_type;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -61,7 +61,7 @@ int orte_filem_base_open(void)
|
||||
if(0 != value) {
|
||||
orte_filem_base_output = opal_output_open(NULL);
|
||||
} else {
|
||||
orte_filem_base_output = -1;
|
||||
orte_filem_base_output = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -47,6 +47,63 @@ extern "C" {
|
||||
#define ORTE_FILEM_TYPE_DIR 1
|
||||
#define ORTE_FILEM_TYPE_UNKNOWN 2
|
||||
|
||||
/**
|
||||
* Type of moment
|
||||
*/
|
||||
#define ORTE_FILEM_MOVE_TYPE_PUT 0
|
||||
#define ORTE_FILEM_MOVE_TYPE_GET 1
|
||||
#define ORTE_FILEM_MOVE_TYPE_RM 2
|
||||
#define ORTE_FILEM_MOVE_TYPE_UNKNOWN 3
|
||||
|
||||
/**
|
||||
* Define a Process Set
|
||||
*
|
||||
* Source: A single source of the operation.
|
||||
* Sink: Desitination of the operation.
|
||||
*/
|
||||
struct orte_filem_base_process_set_1_0_0_t {
|
||||
/** This is an object, so must have a super */
|
||||
opal_list_item_t super;
|
||||
|
||||
/** Source Process */
|
||||
orte_process_name_t source;
|
||||
|
||||
/** Sink Process */
|
||||
orte_process_name_t sink;
|
||||
};
|
||||
typedef struct orte_filem_base_process_set_1_0_0_t orte_filem_base_process_set_1_0_0_t;
|
||||
typedef struct orte_filem_base_process_set_1_0_0_t orte_filem_base_process_set_t;
|
||||
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_filem_base_process_set_t);
|
||||
|
||||
/**
|
||||
* Define a File Pair
|
||||
*
|
||||
* Local: Local file reference
|
||||
* Remove: Remote file reference
|
||||
*
|
||||
* Note: If multiple process sinks are used it is assumed that the
|
||||
* file reference is the same for each of the sinks. If this is not
|
||||
* true then more than one filem request needs to be created.
|
||||
*/
|
||||
struct orte_filem_base_file_set_1_0_0_t {
|
||||
/** This is an object, so must have a super */
|
||||
opal_list_item_t super;
|
||||
|
||||
/* Local file reference */
|
||||
char * local_target;
|
||||
|
||||
/* Remove file reference */
|
||||
char * remote_target;
|
||||
|
||||
/* Type of file to move */
|
||||
int target_flag;
|
||||
};
|
||||
typedef struct orte_filem_base_file_set_1_0_0_t orte_filem_base_file_set_1_0_0_t;
|
||||
typedef struct orte_filem_base_file_set_1_0_0_t orte_filem_base_file_set_t;
|
||||
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_filem_base_file_set_t);
|
||||
|
||||
/**
|
||||
* Definition of a file movement request
|
||||
* This will allow:
|
||||
@ -54,33 +111,52 @@ extern "C" {
|
||||
* - to/from one or more processes
|
||||
* in a single call of the API function. Allowing the implementation
|
||||
* to optimize the sending/receiving of data.
|
||||
* Used for the following:
|
||||
*
|
||||
*/
|
||||
struct orte_filem_base_request_1_0_0_t {
|
||||
/** This is an object, so must have a super */
|
||||
opal_list_item_t super;
|
||||
|
||||
/** Number of targets in the:
|
||||
* - local_targets
|
||||
* - remote_targets
|
||||
* - target_flags
|
||||
* arrays. One variable since they
|
||||
* are required to be all the same length
|
||||
|
||||
/*
|
||||
* A list of process sets
|
||||
*/
|
||||
int num_targets;
|
||||
opal_list_t process_sets;
|
||||
|
||||
/** Local mapping of targets */
|
||||
char **local_targets;
|
||||
|
||||
/** Remote mapping of targets */
|
||||
char **remote_targets;
|
||||
/*
|
||||
* A list of file pairings
|
||||
*/
|
||||
opal_list_t file_sets;
|
||||
|
||||
/** For each target, a flag regarding its type */
|
||||
int *target_flags;
|
||||
/*
|
||||
* Internal use:
|
||||
* Number of movements
|
||||
*/
|
||||
int num_mv;
|
||||
|
||||
/** List of processes to send/receive from */
|
||||
orte_process_name_t *proc_name;
|
||||
/** Number of processes in that array */
|
||||
int num_procs;
|
||||
/*
|
||||
* Internal use:
|
||||
* Boolean to indianate if transfer is complete
|
||||
*/
|
||||
bool *is_done;
|
||||
|
||||
/*
|
||||
* Internal use:
|
||||
* Boolean to indianate if transfer is active
|
||||
*/
|
||||
bool *is_active;
|
||||
|
||||
/*
|
||||
* Internal use:
|
||||
* Exit status of the copy command
|
||||
*/
|
||||
int32_t *exit_status;
|
||||
|
||||
/*
|
||||
* Internal use:
|
||||
* Movement type
|
||||
*/
|
||||
int movement_type;
|
||||
};
|
||||
typedef struct orte_filem_base_request_1_0_0_t orte_filem_base_request_1_0_0_t;
|
||||
typedef struct orte_filem_base_request_1_0_0_t orte_filem_base_request_t;
|
||||
@ -125,6 +201,22 @@ typedef int (*orte_filem_base_module_finalize_fn_t)
|
||||
typedef int (*orte_filem_base_put_fn_t)
|
||||
(orte_filem_base_request_t *request);
|
||||
|
||||
/**
|
||||
* Put a file or directory on the remote machine (Async)
|
||||
*
|
||||
* Note: By using a relative path for the remote file/directory, the filem
|
||||
* component will negotiate the correct absolute path for that file/directory
|
||||
* for the remote machine.
|
||||
*
|
||||
* @param request FileM request describing the files/directories to send,
|
||||
* the remote files/directories to use, and the processes to see the change.
|
||||
*
|
||||
* @return ORTE_SUCCESS on successful file transer
|
||||
* @return ORTE_ERROR on failed file transfer
|
||||
*/
|
||||
typedef int (*orte_filem_base_put_nb_fn_t)
|
||||
(orte_filem_base_request_t *request);
|
||||
|
||||
/**
|
||||
* Get a file from the remote machine
|
||||
*
|
||||
@ -141,6 +233,22 @@ typedef int (*orte_filem_base_put_fn_t)
|
||||
typedef int (*orte_filem_base_get_fn_t)
|
||||
(orte_filem_base_request_t *request);
|
||||
|
||||
/**
|
||||
* Get a file from the remote machine (Async)
|
||||
*
|
||||
* Note: By using a relative path for the remote file/directory, the filem
|
||||
* component will negotiate the correct absolute path for that file/directory
|
||||
* for the remote machine.
|
||||
*
|
||||
* @param request FileM request describing the files/directories to receive,
|
||||
* the remote files/directories to use, and the processes to see the change.
|
||||
*
|
||||
* @return ORTE_SUCCESS on successful file transer
|
||||
* @return ORTE_ERROR on failed file transfer
|
||||
*/
|
||||
typedef int (*orte_filem_base_get_nb_fn_t)
|
||||
(orte_filem_base_request_t *request);
|
||||
|
||||
/**
|
||||
* Remove a file from the remote machine
|
||||
*
|
||||
@ -157,6 +265,50 @@ typedef int (*orte_filem_base_get_fn_t)
|
||||
typedef int (*orte_filem_base_rm_fn_t)
|
||||
(orte_filem_base_request_t *request);
|
||||
|
||||
/**
|
||||
* Remove a file from the remote machine (Async)
|
||||
*
|
||||
* Note: By using a relative path for the remote file/directory, the filem
|
||||
* component will negotiate the correct absolute path for that file/directory
|
||||
* for the remote machine.
|
||||
*
|
||||
* @param request FileM request describing the remote files/directories to remove,
|
||||
* the processes to see the change.
|
||||
*
|
||||
* @return ORTE_SUCCESS on success
|
||||
* @return ORTE_ERROR on fail
|
||||
*/
|
||||
typedef int (*orte_filem_base_rm_nb_fn_t)
|
||||
(orte_filem_base_request_t *request);
|
||||
|
||||
/**
|
||||
* Wait for a single file movement request to finish
|
||||
*
|
||||
* @param request FileM request describing the remote files/directories.
|
||||
*
|
||||
* The request must have been passed through one of the non-blocking functions
|
||||
* before calling wait or wait_all otherwise ORTE_ERROR will be returned.
|
||||
*
|
||||
* @return ORTE_SUCCESS on success
|
||||
* @return ORTE_ERROR on fail
|
||||
*/
|
||||
typedef int (*orte_filem_base_wait_fn_t)
|
||||
(orte_filem_base_request_t *request);
|
||||
|
||||
/**
|
||||
* Wait for a multiple file movement requests to finish
|
||||
*
|
||||
* @param request_list opal_list_t of FileM requests describing the remote files/directories.
|
||||
*
|
||||
* The request must have been passed through one of the non-blocking functions
|
||||
* before calling wait or wait_all otherwise ORTE_ERROR will be returned.
|
||||
*
|
||||
* @return ORTE_SUCCESS on success
|
||||
* @return ORTE_ERROR on fail
|
||||
*/
|
||||
typedef int (*orte_filem_base_wait_all_fn_t)
|
||||
(opal_list_t *request_list);
|
||||
|
||||
/**
|
||||
* Structure for FILEM v1.0.0 components.
|
||||
*/
|
||||
@ -190,11 +342,19 @@ struct orte_filem_base_module_1_0_0_t {
|
||||
|
||||
/** Put a file on the remote machine */
|
||||
orte_filem_base_put_fn_t put;
|
||||
orte_filem_base_put_nb_fn_t put_nb;
|
||||
/** Get a file from the remote machine */
|
||||
orte_filem_base_get_fn_t get;
|
||||
orte_filem_base_get_nb_fn_t get_nb;
|
||||
|
||||
/** Remove a file on the remote machine */
|
||||
orte_filem_base_rm_fn_t rm;
|
||||
orte_filem_base_rm_nb_fn_t rm_nb;
|
||||
|
||||
/** Test functions for the non-blocking versions */
|
||||
orte_filem_base_wait_fn_t wait;
|
||||
orte_filem_base_wait_all_fn_t wait_all;
|
||||
|
||||
};
|
||||
typedef struct orte_filem_base_module_1_0_0_t orte_filem_base_module_1_0_0_t;
|
||||
typedef struct orte_filem_base_module_1_0_0_t orte_filem_base_module_t;
|
||||
|
@ -33,6 +33,10 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define ORTE_FILEM_RSH_ASK 0
|
||||
#define ORTE_FILEM_RSH_ALLOW 1
|
||||
#define ORTE_FILEM_RSH_DONE 2
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
@ -49,6 +53,9 @@ extern "C" {
|
||||
typedef struct orte_filem_rsh_component_t orte_filem_rsh_component_t;
|
||||
ORTE_MODULE_DECLSPEC extern orte_filem_rsh_component_t mca_filem_rsh_component;
|
||||
|
||||
extern int orte_filem_rsh_max_incomming;
|
||||
extern int orte_filem_rsh_max_outgoing;
|
||||
|
||||
/*
|
||||
* Module functions
|
||||
*/
|
||||
@ -57,9 +64,17 @@ extern "C" {
|
||||
int orte_filem_rsh_module_init(void);
|
||||
int orte_filem_rsh_module_finalize(void);
|
||||
|
||||
int orte_filem_base_rsh_put(orte_filem_base_request_t *request);
|
||||
int orte_filem_base_rsh_get(orte_filem_base_request_t *request);
|
||||
int orte_filem_base_rsh_rm( orte_filem_base_request_t *request);
|
||||
int orte_filem_rsh_put(orte_filem_base_request_t *request);
|
||||
int orte_filem_rsh_put_nb(orte_filem_base_request_t *request);
|
||||
|
||||
int orte_filem_rsh_get(orte_filem_base_request_t *request);
|
||||
int orte_filem_rsh_get_nb(orte_filem_base_request_t *request);
|
||||
|
||||
int orte_filem_rsh_rm( orte_filem_base_request_t *request);
|
||||
int orte_filem_rsh_rm_nb( orte_filem_base_request_t *request);
|
||||
|
||||
int orte_filem_rsh_wait( orte_filem_base_request_t *request);
|
||||
int orte_filem_rsh_wait_all( opal_list_t *request_list);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
|
@ -34,6 +34,9 @@ const char *orte_filem_rsh_component_version_string =
|
||||
static int filem_rsh_open(void);
|
||||
static int filem_rsh_close(void);
|
||||
|
||||
int orte_filem_rsh_max_incomming = 10;
|
||||
int orte_filem_rsh_max_outgoing = 10;
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
@ -119,6 +122,28 @@ static int filem_rsh_open(void)
|
||||
"rsh",
|
||||
&mca_filem_rsh_component.remote_sh_command);
|
||||
|
||||
mca_base_param_reg_int(&mca_filem_rsh_component.super.filem_version,
|
||||
"max_incomming",
|
||||
"Maximum number of incomming connections",
|
||||
false, false,
|
||||
orte_filem_rsh_max_incomming,
|
||||
&orte_filem_rsh_max_incomming);
|
||||
|
||||
if( orte_filem_rsh_max_incomming <= 0 ) {
|
||||
orte_filem_rsh_max_incomming = 1;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int(&mca_filem_rsh_component.super.filem_version,
|
||||
"max_outgoing",
|
||||
"Maximum number of out going connections (Currently not used)",
|
||||
false, false,
|
||||
orte_filem_rsh_max_outgoing,
|
||||
&orte_filem_rsh_max_outgoing);
|
||||
|
||||
if( orte_filem_rsh_max_outgoing <= 0 ) {
|
||||
orte_filem_rsh_max_outgoing = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Debug Output
|
||||
*/
|
||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -16,6 +16,8 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA += base/help-orte-odls-base.txt
|
||||
|
||||
headers += \
|
||||
base/odls_private.h \
|
||||
base/base.h
|
||||
|
41
orte/mca/odls/base/help-orte-odls-base.txt
Обычный файл
41
orte/mca/odls/base/help-orte-odls-base.txt
Обычный файл
@ -0,0 +1,41 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for Open RTE's ODLS Framework
|
||||
#
|
||||
[orte-odls-base:could-not-preload-binary]
|
||||
WARNING: Could not preload the binary file.
|
||||
|
||||
Binary: %s
|
||||
|
||||
Will continue attempting to launch the process.
|
||||
[orte-odls-base:could-not-preload-files]
|
||||
WARNING: Could not preload the files specified.
|
||||
|
||||
Fileset: %s
|
||||
|
||||
Will continue attempting to launch the process.
|
||||
[orte-odls-base:could-not-preload]
|
||||
WARNING: Could not preload the requested files and directories.
|
||||
|
||||
Fileset: %s
|
||||
Fileset: %s
|
||||
|
||||
Will continue attempting to launch the process.
|
||||
|
||||
[orte-odls-base:preload-file-exists]
|
||||
WARNING: Could not preload specified file: File already exists.
|
||||
|
||||
Fileset: %s
|
||||
Host: %s
|
||||
|
||||
Will continue attempting to launch the process.
|
||||
|
@ -20,6 +20,10 @@
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#ifdef HAVE_SYS_PARAM_H
|
||||
#include <sys/param.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/trace.h"
|
||||
@ -30,6 +34,11 @@
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/dss/dss.h"
|
||||
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
|
||||
@ -121,3 +130,239 @@ int orte_odls_base_report_spawn(opal_list_t *children)
|
||||
/* All done */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Preload all files for a single app context
|
||||
*/
|
||||
static int orte_odls_base_preload_append_binary(orte_app_context_t* context,
|
||||
orte_filem_base_request_t *filem_request);
|
||||
static int orte_odls_base_preload_append_files(orte_app_context_t* context,
|
||||
orte_filem_base_request_t *filem_request);
|
||||
static bool orte_odls_base_is_preload_local_dup(char *local_ref,
|
||||
orte_filem_base_request_t *filem_request);
|
||||
|
||||
int orte_odls_base_preload_files_app_context(orte_app_context_t* app_context)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
orte_filem_base_request_t *filem_request;
|
||||
orte_filem_base_process_set_t *p_set = NULL;
|
||||
|
||||
/* Sanity Check - Make sure there are files to preload */
|
||||
if(!app_context->preload_binary &&
|
||||
NULL == app_context->preload_files) {
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
filem_request = OBJ_NEW(orte_filem_base_request_t);
|
||||
|
||||
/* Define the process set */
|
||||
p_set = OBJ_NEW(orte_filem_base_process_set_t);
|
||||
p_set->source.jobid = orte_process_info.gpr_replica->jobid;
|
||||
p_set->source.vpid = orte_process_info.gpr_replica->vpid;
|
||||
p_set->sink.jobid = orte_process_info.my_name->jobid;
|
||||
p_set->sink.vpid = orte_process_info.my_name->vpid;
|
||||
|
||||
opal_list_append(&(filem_request->process_sets), &(p_set->super) );
|
||||
|
||||
if(app_context->preload_binary) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_odls_globals.output,
|
||||
"%s) Preload Binary...",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
if( ORTE_SUCCESS != (ret = orte_odls_base_preload_append_binary(app_context,
|
||||
filem_request) ) ){
|
||||
opal_show_help("help-orte-odls-base.txt",
|
||||
"orte-odls-base:could-not-preload-binary",
|
||||
true, app_context->app);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
/* Keep accumulating files anyway */
|
||||
}
|
||||
}
|
||||
if( NULL != app_context->preload_files) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_odls_globals.output,
|
||||
"%s) Preload Files... [%s]",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
app_context->preload_files));
|
||||
if( ORTE_SUCCESS != (ret = orte_odls_base_preload_append_files(app_context,
|
||||
filem_request) ) ){
|
||||
opal_show_help("help-orte-odls-base.txt",
|
||||
"orte-odls-base:could-not-preload-files",
|
||||
true, app_context->preload_files);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
/* Keep accumulating files anyway */
|
||||
}
|
||||
}
|
||||
/* Actually bring over the files - One app context at a time
|
||||
* JJH: This could be improved for multiple app contexts by making
|
||||
* this a non-blocking filem get and then waiting on all of
|
||||
* the requests for all app contexts.
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_filem.get(filem_request)) ) {
|
||||
opal_show_help("help-orte-odls-base.txt",
|
||||
"orte-odls-base:could-not-preload",
|
||||
true,
|
||||
(app_context->preload_binary ? app_context->app : ""),
|
||||
(NULL != app_context->preload_files ? app_context->preload_files : ""));
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
if( NULL != filem_request ) {
|
||||
OBJ_RELEASE(filem_request);
|
||||
filem_request = NULL;
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
/*
|
||||
* The difference between preloading a file, and a binary file is that
|
||||
* we may need to update the app_context to reflect the placement of the binary file
|
||||
* on the local machine.
|
||||
*/
|
||||
static int orte_odls_base_preload_append_binary(orte_app_context_t* context,
|
||||
orte_filem_base_request_t *filem_request) {
|
||||
char * local_bin = NULL;
|
||||
orte_filem_base_file_set_t * f_set = NULL;
|
||||
|
||||
f_set = OBJ_NEW(orte_filem_base_file_set_t);
|
||||
|
||||
/* Local Placement */
|
||||
asprintf(&local_bin, "%s/%s", orte_process_info.job_session_dir, opal_basename(context->app));
|
||||
if(orte_odls_base_is_preload_local_dup(local_bin, filem_request) ) {
|
||||
goto cleanup;
|
||||
}
|
||||
f_set->local_target = strdup(local_bin);
|
||||
|
||||
/* Remote reference */
|
||||
f_set->remote_target = strdup(context->app);
|
||||
|
||||
/* Flag as a single file */
|
||||
f_set->target_flag = ORTE_FILEM_TYPE_FILE;
|
||||
|
||||
/* Add to the request list */
|
||||
opal_list_append(&(filem_request->file_sets), &(f_set->super) );
|
||||
|
||||
cleanup:
|
||||
/*
|
||||
* Adjust the process name to point to the new local version
|
||||
*/
|
||||
if( NULL != local_bin ) {
|
||||
if(NULL != context->app) {
|
||||
free(context->app);
|
||||
context->app = NULL;
|
||||
}
|
||||
|
||||
context->app = strdup(local_bin);
|
||||
free(local_bin);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int orte_odls_base_preload_append_files(orte_app_context_t* context,
|
||||
orte_filem_base_request_t *filem_request) {
|
||||
char * local_ref = NULL;
|
||||
int i, remote_argc = 0;
|
||||
char **remote_targets = NULL;
|
||||
char * temp = NULL;
|
||||
orte_filem_base_file_set_t * f_set = NULL;
|
||||
|
||||
remote_targets = opal_argv_split(context->preload_files, ',');
|
||||
remote_argc = opal_argv_count(remote_targets);
|
||||
|
||||
for(i = 0; i < remote_argc; ++i) {
|
||||
if(NULL != context->preload_files_dest_dir) {
|
||||
if(context->preload_files_dest_dir[0] == '.') {
|
||||
asprintf(&local_ref, "%s/%s/%s", context->cwd, context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
|
||||
}
|
||||
else {
|
||||
asprintf(&local_ref, "%s/%s", context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
|
||||
}
|
||||
}
|
||||
else {
|
||||
/*
|
||||
* If the preload_files_dest_dir is not specified
|
||||
* If this is an absolute path, copy it to that path. Otherwise copy it to the cwd.
|
||||
*/
|
||||
if('/' == remote_targets[i][0]) {
|
||||
asprintf(&local_ref, "%s", remote_targets[i]);
|
||||
} else {
|
||||
asprintf(&local_ref, "%s/%s", context->cwd, opal_basename(remote_targets[i]) );
|
||||
}
|
||||
}
|
||||
|
||||
asprintf(&temp, "test -e %s", local_ref);
|
||||
if(0 == system(temp)) {
|
||||
char hostname[MAXHOSTNAMELEN];
|
||||
gethostname(hostname, sizeof(hostname));
|
||||
opal_show_help("help-orte-odls-base.txt",
|
||||
"orte-odls-base:preload-file-exists",
|
||||
true, local_ref, hostname);
|
||||
free(temp);
|
||||
temp = NULL;
|
||||
free(local_ref);
|
||||
local_ref = NULL;
|
||||
continue;
|
||||
}
|
||||
free(temp);
|
||||
temp = NULL;
|
||||
|
||||
/*
|
||||
* Is this a duplicate
|
||||
*/
|
||||
if(orte_odls_base_is_preload_local_dup(local_ref, filem_request) ) {
|
||||
free(local_ref);
|
||||
local_ref = NULL;
|
||||
continue;
|
||||
}
|
||||
|
||||
f_set = OBJ_NEW(orte_filem_base_file_set_t);
|
||||
|
||||
/* Local Placement */
|
||||
f_set->local_target = strdup(local_ref);
|
||||
|
||||
/* Remote reference */
|
||||
f_set->remote_target = strdup(remote_targets[i]);
|
||||
|
||||
/* Flag as unknown, let FileM figure it out */
|
||||
f_set->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
|
||||
|
||||
/* Add to the request list */
|
||||
opal_list_append(&(filem_request->file_sets), &(f_set->super) );
|
||||
|
||||
free(local_ref);
|
||||
local_ref = NULL;
|
||||
}
|
||||
|
||||
if(NULL != local_ref)
|
||||
free(local_ref);
|
||||
if(NULL != remote_targets)
|
||||
opal_argv_free(remote_targets);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Keeps us from transfering the same file more than once.
|
||||
*/
|
||||
static bool orte_odls_base_is_preload_local_dup(char *local_ref,
|
||||
orte_filem_base_request_t *filem_request) {
|
||||
opal_list_item_t *item = NULL;
|
||||
|
||||
for (item = opal_list_get_first( &filem_request->file_sets);
|
||||
item != opal_list_get_end( &filem_request->file_sets);
|
||||
item = opal_list_get_next( item) ) {
|
||||
orte_filem_base_file_set_t * f_set = (orte_filem_base_file_set_t*)item;
|
||||
|
||||
if(0 == strncmp(local_ref, f_set->local_target, strlen(local_ref)+1) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -96,6 +96,11 @@ int orte_odls_size_daemon_cmd(size_t *size, orte_daemon_cmd_flag_t *src, orte_da
|
||||
int orte_odls_unpack_daemon_cmd(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type);
|
||||
|
||||
/*
|
||||
* Preload binary/files functions
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_odls_base_preload_files_app_context(orte_app_context_t* context);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -38,33 +38,6 @@ Errno: %d
|
||||
|
||||
This process may still be running and/or consuming resources.
|
||||
|
||||
[orte-pls-fork:could-not-preload-binary]
|
||||
WARNING: Could not preload the binary file.
|
||||
|
||||
Binary: %s
|
||||
|
||||
Will continue attempting to launch the process.
|
||||
[orte-pls-fork:could-not-preload-files]
|
||||
WARNING: Could not preload the files specified.
|
||||
|
||||
Fileset: %s
|
||||
|
||||
Will continue attempting to launch the process.
|
||||
[orte-pls-fork:could-not-preload]
|
||||
WARNING: Could not preload the requested files and directories.
|
||||
|
||||
Fileset: %s
|
||||
|
||||
Will continue attempting to launch the process.
|
||||
|
||||
[orte-pls-fork:preload-file-exists]
|
||||
WARNING: Could not preload specified file: File already exists.
|
||||
|
||||
Fileset: %s
|
||||
Host: %s
|
||||
|
||||
Will continue attempting to launch the process.
|
||||
|
||||
[orte-odls-default:execv-error]
|
||||
Could not execute the executable "%s": %s
|
||||
|
||||
|
@ -90,8 +90,6 @@
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#endif
|
||||
@ -99,12 +97,6 @@
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "orte/mca/odls/default/odls_default.h"
|
||||
|
||||
static int orte_pls_fork_preload_append_binary(orte_app_context_t* context,
|
||||
orte_filem_base_request_t *filem_request);
|
||||
static int orte_pls_fork_preload_append_files(orte_app_context_t* context,
|
||||
orte_filem_base_request_t *filem_request);
|
||||
static bool is_preload_local_dup(char *local_ref, orte_filem_base_request_t *filem_request);
|
||||
|
||||
/*
|
||||
* External Interface
|
||||
*/
|
||||
@ -1160,7 +1152,6 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data)
|
||||
opal_list_item_t *item, *item2;
|
||||
bool quit_flag;
|
||||
bool node_included;
|
||||
orte_filem_base_request_t *filem_request;
|
||||
char *job_str, *uri_file, *my_uri, *session_dir=NULL, *slot_str;
|
||||
FILE *fp;
|
||||
|
||||
@ -1414,40 +1405,11 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data)
|
||||
item != opal_list_get_end(&app_context_list);
|
||||
item = opal_list_get_next(item)) {
|
||||
app_item = (odls_default_app_context_t*)item;
|
||||
if(app_item->app_context->preload_binary || NULL != app_item->app_context->preload_files) {
|
||||
filem_request = OBJ_NEW(orte_filem_base_request_t);
|
||||
filem_request->num_procs = 1;
|
||||
filem_request->proc_name = (orte_process_name_t*)malloc(sizeof(orte_process_name_t) * filem_request->num_procs);
|
||||
filem_request->proc_name[0].jobid = orte_process_info.gpr_replica->jobid;
|
||||
filem_request->proc_name[0].vpid = orte_process_info.gpr_replica->vpid;
|
||||
if(app_item->app_context->preload_binary) {
|
||||
if( ORTE_SUCCESS != (rc = orte_pls_fork_preload_append_binary(app_item->app_context,
|
||||
filem_request) ) ){
|
||||
opal_show_help("help-orte-odls-default.txt",
|
||||
"orte-odls-default:could-not-preload-binary",
|
||||
true, app_item->app_context->app);
|
||||
ORTE_ERROR_LOG(rc);
|
||||
/* Keep accumulating files anyway */
|
||||
}
|
||||
}
|
||||
if( NULL != app_item->app_context->preload_files) {
|
||||
if( ORTE_SUCCESS != (rc = orte_pls_fork_preload_append_files(app_item->app_context,
|
||||
filem_request) ) ){
|
||||
opal_show_help("help-orte-odls-default.txt",
|
||||
"orte-odls-default:could-not-preload-files",
|
||||
true, app_item->app_context->preload_files);
|
||||
ORTE_ERROR_LOG(rc);
|
||||
/* Keep accumulating files anyway */
|
||||
}
|
||||
}
|
||||
/* Actually bring over the files */
|
||||
if( ORTE_SUCCESS != (rc = orte_filem.get(filem_request)) ) {
|
||||
opal_show_help("help-orte-odls-default.txt",
|
||||
"orte-odls-default:could-not-preload",
|
||||
true, opal_argv_join(filem_request->local_targets, ' '));
|
||||
if(app_item->app_context->preload_binary ||
|
||||
NULL != app_item->app_context->preload_files) {
|
||||
if( ORTE_SUCCESS != (rc = orte_odls_base_preload_files_app_context(app_item->app_context)) ) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
OBJ_DESTRUCT(filem_request);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1745,146 +1707,3 @@ static void set_handler_default(int sig)
|
||||
|
||||
sigaction(sig, &act, (struct sigaction *)0);
|
||||
}
|
||||
|
||||
/*
|
||||
* The difference between preloading a file, and a binary file is that
|
||||
* we may need to update the app_context to reflect the placement of the binary file
|
||||
* on the local machine.
|
||||
*/
|
||||
static int orte_pls_fork_preload_append_binary(orte_app_context_t* context,
|
||||
orte_filem_base_request_t *filem_request) {
|
||||
char * local_bin = NULL;
|
||||
int tmp_argc = 0;
|
||||
/*
|
||||
* Append the local placement
|
||||
*/
|
||||
asprintf(&local_bin, "%s/%s", orte_process_info.job_session_dir, opal_basename(context->app));
|
||||
if(is_preload_local_dup(local_bin, filem_request) ) {
|
||||
goto cleanup;
|
||||
}
|
||||
opal_argv_append(&filem_request->num_targets, &(filem_request->local_targets), local_bin);
|
||||
|
||||
/*
|
||||
* Append the remote file
|
||||
*/
|
||||
tmp_argc = 0;
|
||||
opal_argv_append(&tmp_argc, &filem_request->remote_targets, context->app);
|
||||
|
||||
/*
|
||||
* Append the flag
|
||||
*/
|
||||
filem_request->target_flags = (int *)realloc(filem_request->target_flags,
|
||||
sizeof(int) * (filem_request->num_targets + 1));
|
||||
filem_request->target_flags[filem_request->num_targets-1] = ORTE_FILEM_TYPE_FILE;
|
||||
|
||||
cleanup:
|
||||
/*
|
||||
* Adjust the process name
|
||||
*/
|
||||
if(NULL != context->app)
|
||||
free(context->app);
|
||||
context->app = local_bin;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int orte_pls_fork_preload_append_files(orte_app_context_t* context,
|
||||
orte_filem_base_request_t *filem_request) {
|
||||
char * local_ref = NULL;
|
||||
int i, tmp_argc = 0, remote_argc = 0;
|
||||
char **remote_targets = NULL;
|
||||
char * temp = NULL;
|
||||
|
||||
remote_targets = opal_argv_split(context->preload_files, ',');
|
||||
remote_argc = opal_argv_count(remote_targets);
|
||||
|
||||
for(i = 0; i < remote_argc; ++i) {
|
||||
if(NULL != context->preload_files_dest_dir) {
|
||||
if(context->preload_files_dest_dir[0] == '.') {
|
||||
asprintf(&local_ref, "%s/%s/%s", context->cwd, context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
|
||||
}
|
||||
else {
|
||||
asprintf(&local_ref, "%s/%s", context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
|
||||
}
|
||||
}
|
||||
else {
|
||||
/*
|
||||
* If the preload_files_dest_dir is not specified
|
||||
* If this is an absolute path, copy it to that path. Otherwise copy it to the cwd.
|
||||
*/
|
||||
if('/' == remote_targets[i][0]) {
|
||||
asprintf(&local_ref, "%s", remote_targets[i]);
|
||||
} else {
|
||||
asprintf(&local_ref, "%s/%s", context->cwd, opal_basename(remote_targets[i]) );
|
||||
}
|
||||
}
|
||||
|
||||
asprintf(&temp, "test -e %s", local_ref);
|
||||
if(0 == system(temp)) {
|
||||
char hostname[MAXHOSTNAMELEN];
|
||||
gethostname(hostname, sizeof(hostname));
|
||||
opal_show_help("help-orte-pls-fork.txt",
|
||||
"orte-pls-fork:preload-file-exists",
|
||||
true, local_ref, hostname);
|
||||
free(temp);
|
||||
temp = NULL;
|
||||
free(local_ref);
|
||||
local_ref = NULL;
|
||||
continue;
|
||||
}
|
||||
free(temp);
|
||||
temp = NULL;
|
||||
|
||||
/*
|
||||
* Is this a duplicate
|
||||
*/
|
||||
if(is_preload_local_dup(local_ref, filem_request) ) {
|
||||
free(local_ref);
|
||||
local_ref = NULL;
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Append the local files we want
|
||||
*/
|
||||
opal_argv_append(&filem_request->num_targets, &filem_request->local_targets, local_ref);
|
||||
|
||||
/*
|
||||
* Append the remote files we want
|
||||
*/
|
||||
tmp_argc = filem_request->num_targets - 1;
|
||||
opal_argv_append(&tmp_argc, &filem_request->remote_targets, remote_targets[i]);
|
||||
|
||||
/*
|
||||
* Set the flags
|
||||
*/
|
||||
filem_request->target_flags = (int *)realloc(filem_request->target_flags, sizeof(int) * 1);
|
||||
filem_request->target_flags[filem_request->num_targets-1] = ORTE_FILEM_TYPE_UNKNOWN;
|
||||
|
||||
free(local_ref);
|
||||
local_ref = NULL;
|
||||
}
|
||||
|
||||
if(NULL != local_ref)
|
||||
free(local_ref);
|
||||
if(NULL != remote_targets)
|
||||
opal_argv_free(remote_targets);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Keeps us from transfering the same file more than once.
|
||||
*/
|
||||
static bool is_preload_local_dup(char *local_ref, orte_filem_base_request_t *filem_request) {
|
||||
int i;
|
||||
|
||||
for(i = 0; i < filem_request->num_targets; ++i) {
|
||||
if(0 == strncmp(local_ref, filem_request->local_targets[i], strlen(local_ref)+1) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -58,18 +58,11 @@
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "orte/mca/odls/process/odls_process.h"
|
||||
|
||||
static void set_handler_default(int sig);
|
||||
static bool is_preload_local_dup(char *local_ref, orte_filem_base_request_t *filem_request);
|
||||
static int orte_pls_fork_preload_append_files(orte_app_context_t* context,
|
||||
orte_filem_base_request_t *filem_request);
|
||||
static int orte_pls_fork_preload_append_binary(orte_app_context_t* context,
|
||||
orte_filem_base_request_t *filem_request);
|
||||
|
||||
static int orte_odls_process_get_add_procs_data(orte_gpr_notify_data_t **data,
|
||||
orte_job_map_t *map)
|
||||
@ -775,7 +768,6 @@ static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data)
|
||||
opal_list_item_t *item, *item2;
|
||||
bool quit_flag;
|
||||
bool node_included;
|
||||
orte_filem_base_request_t *filem_request;
|
||||
char *job_str, *uri_file, *my_uri, *session_dir=NULL;
|
||||
FILE *fp;
|
||||
|
||||
@ -1003,40 +995,11 @@ static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data)
|
||||
item != opal_list_get_end(&app_context_list);
|
||||
item = opal_list_get_next(item)) {
|
||||
app_item = (odls_process_app_context_t*)item;
|
||||
if(app_item->app_context->preload_binary || NULL != app_item->app_context->preload_files) {
|
||||
filem_request = OBJ_NEW(orte_filem_base_request_t);
|
||||
filem_request->num_procs = 1;
|
||||
filem_request->proc_name = (orte_process_name_t*)malloc(sizeof(orte_process_name_t) * filem_request->num_procs);
|
||||
filem_request->proc_name[0].jobid = orte_process_info.gpr_replica->jobid;
|
||||
filem_request->proc_name[0].vpid = orte_process_info.gpr_replica->vpid;
|
||||
if(app_item->app_context->preload_binary) {
|
||||
if( ORTE_SUCCESS != (rc = orte_pls_fork_preload_append_binary(app_item->app_context,
|
||||
filem_request) ) ){
|
||||
opal_show_help("help-orte-odls-default.txt",
|
||||
"orte-odls-default:could-not-preload-binary",
|
||||
true, app_item->app_context->app);
|
||||
ORTE_ERROR_LOG(rc);
|
||||
/* Keep accumulating files anyway */
|
||||
}
|
||||
}
|
||||
if( NULL != app_item->app_context->preload_files) {
|
||||
if( ORTE_SUCCESS != (rc = orte_pls_fork_preload_append_files(app_item->app_context,
|
||||
filem_request) ) ){
|
||||
opal_show_help("help-orte-odls-default.txt",
|
||||
"orte-odls-default:could-not-preload-files",
|
||||
true, app_item->app_context->preload_files);
|
||||
ORTE_ERROR_LOG(rc);
|
||||
/* Keep accumulating files anyway */
|
||||
}
|
||||
}
|
||||
/* Actually bring over the files */
|
||||
if( ORTE_SUCCESS != (rc = orte_filem.get(filem_request)) ) {
|
||||
opal_show_help("help-orte-odls-default.txt",
|
||||
"orte-odls-default:could-not-preload",
|
||||
true, opal_argv_join(filem_request->local_targets, ' '));
|
||||
if(app_item->app_context->preload_binary ||
|
||||
NULL != app_item->app_context->preload_files) {
|
||||
if( ORTE_SUCCESS != (rc = orte_odls_base_preload_files_app_context(app_item->app_context)) ) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
OBJ_DESTRUCT(filem_request);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1293,144 +1256,3 @@ orte_odls_base_module_1_3_0_t orte_odls_process_module = {
|
||||
orte_odls_process_kill_local_procs,
|
||||
orte_odls_process_signal_local_proc
|
||||
};
|
||||
/*
|
||||
* The difference between preloading a file, and a binary file is that
|
||||
* we may need to update the app_context to reflect the placement of the binary file
|
||||
* on the local machine.
|
||||
*/
|
||||
static int orte_pls_fork_preload_append_binary(orte_app_context_t* context,
|
||||
orte_filem_base_request_t *filem_request) {
|
||||
char * local_bin = NULL;
|
||||
int tmp_argc = 0;
|
||||
/*
|
||||
* Append the local placement
|
||||
*/
|
||||
asprintf(&local_bin, "%s/%s", orte_process_info.job_session_dir, opal_basename(context->app));
|
||||
if(is_preload_local_dup(local_bin, filem_request) ) {
|
||||
goto cleanup;
|
||||
}
|
||||
opal_argv_append(&filem_request->num_targets, &(filem_request->local_targets), local_bin);
|
||||
|
||||
/*
|
||||
* Append the remote file
|
||||
*/
|
||||
tmp_argc = 0;
|
||||
opal_argv_append(&tmp_argc, &filem_request->remote_targets, context->app);
|
||||
|
||||
/*
|
||||
* Append the flag
|
||||
*/
|
||||
filem_request->target_flags = (int *)realloc(filem_request->target_flags,
|
||||
sizeof(int) * (filem_request->num_targets + 1));
|
||||
filem_request->target_flags[filem_request->num_targets-1] = ORTE_FILEM_TYPE_FILE;
|
||||
|
||||
cleanup:
|
||||
/*
|
||||
* Adjust the process name
|
||||
*/
|
||||
if(NULL != context->app)
|
||||
free(context->app);
|
||||
context->app = local_bin;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int orte_pls_fork_preload_append_files(orte_app_context_t* context,
|
||||
orte_filem_base_request_t *filem_request) {
|
||||
char * local_ref = NULL;
|
||||
int i, tmp_argc = 0, remote_argc = 0;
|
||||
char **remote_targets = NULL;
|
||||
char * temp = NULL;
|
||||
|
||||
remote_targets = opal_argv_split(context->preload_files, ',');
|
||||
remote_argc = opal_argv_count(remote_targets);
|
||||
|
||||
for(i = 0; i < remote_argc; ++i) {
|
||||
if(NULL != context->preload_files_dest_dir) {
|
||||
if(context->preload_files_dest_dir[0] == '.') {
|
||||
asprintf(&local_ref, "%s/%s/%s", context->cwd, context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
|
||||
}
|
||||
else {
|
||||
asprintf(&local_ref, "%s/%s", context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
|
||||
}
|
||||
}
|
||||
else {
|
||||
/*
|
||||
* If the preload_files_dest_dir is not specified
|
||||
* If this is an absolute path, copy it to that path. Otherwise copy it to the cwd.
|
||||
*/
|
||||
if('/' == remote_targets[i][0]) {
|
||||
asprintf(&local_ref, "%s", remote_targets[i]);
|
||||
} else {
|
||||
asprintf(&local_ref, "%s/%s", context->cwd, opal_basename(remote_targets[i]) );
|
||||
}
|
||||
}
|
||||
|
||||
asprintf(&temp, "test -e %s", local_ref);
|
||||
if(0 == system(temp)) {
|
||||
char hostname[MAXHOSTNAMELEN];
|
||||
gethostname(hostname, sizeof(hostname));
|
||||
opal_show_help("help-orte-pls-fork.txt",
|
||||
"orte-pls-fork:preload-file-exists",
|
||||
true, local_ref, hostname);
|
||||
free(temp);
|
||||
temp = NULL;
|
||||
free(local_ref);
|
||||
local_ref = NULL;
|
||||
continue;
|
||||
}
|
||||
free(temp);
|
||||
temp = NULL;
|
||||
|
||||
/*
|
||||
* Is this a duplicate
|
||||
*/
|
||||
if(is_preload_local_dup(local_ref, filem_request) ) {
|
||||
free(local_ref);
|
||||
local_ref = NULL;
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Append the local files we want
|
||||
*/
|
||||
opal_argv_append(&filem_request->num_targets, &filem_request->local_targets, local_ref);
|
||||
|
||||
/*
|
||||
* Append the remote files we want
|
||||
*/
|
||||
tmp_argc = filem_request->num_targets - 1;
|
||||
opal_argv_append(&tmp_argc, &filem_request->remote_targets, remote_targets[i]);
|
||||
|
||||
/*
|
||||
* Set the flags
|
||||
*/
|
||||
filem_request->target_flags = (int *)realloc(filem_request->target_flags, sizeof(int) * 1);
|
||||
filem_request->target_flags[filem_request->num_targets-1] = ORTE_FILEM_TYPE_UNKNOWN;
|
||||
|
||||
free(local_ref);
|
||||
local_ref = NULL;
|
||||
}
|
||||
|
||||
if(NULL != local_ref)
|
||||
free(local_ref);
|
||||
if(NULL != remote_targets)
|
||||
opal_argv_free(remote_targets);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Keeps us from transfering the same file more than once.
|
||||
*/
|
||||
static bool is_preload_local_dup(char *local_ref, orte_filem_base_request_t *filem_request) {
|
||||
int i;
|
||||
|
||||
for(i = 0; i < filem_request->num_targets; ++i) {
|
||||
if(0 == strncmp(local_ref, filem_request->local_targets[i], strlen(local_ref)+1) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -86,6 +86,8 @@ BEGIN_C_DECLS
|
||||
|
||||
#define ORTE_RML_TAG_COMM_CID_INTRA 28
|
||||
|
||||
/* For FileM RSH Component */
|
||||
#define ORTE_RML_TAG_FILEM_RSH 29
|
||||
/* For CRCP Coord Component */
|
||||
#define OMPI_CRCP_COORD_BOOKMARK_TAG 4242
|
||||
|
||||
|
@ -854,8 +854,12 @@ static int snapc_full_global_gather_all_files(void) {
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
opal_list_item_t* item = NULL;
|
||||
char * local_dir = NULL;
|
||||
orte_filem_base_request_t *filem_request = OBJ_NEW(orte_filem_base_request_t);
|
||||
int tmp_argc = 0;
|
||||
orte_filem_base_request_t *filem_request = NULL;
|
||||
orte_filem_base_process_set_t *p_set = NULL;
|
||||
orte_filem_base_file_set_t * f_set = NULL;
|
||||
opal_list_t all_filem_requests;
|
||||
|
||||
OBJ_CONSTRUCT(&all_filem_requests, opal_list_t);
|
||||
|
||||
/*
|
||||
* If it is stored in place, then we do not need to transfer anything
|
||||
@ -893,23 +897,21 @@ static int snapc_full_global_gather_all_files(void) {
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
* If we just want to pretend to do the filem
|
||||
*/
|
||||
else if(orte_snapc_full_skip_filem) {
|
||||
exit_status = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
}
|
||||
/*
|
||||
* If *not* stored in place then use FileM to transfer the files and cleanup
|
||||
*/
|
||||
else {
|
||||
|
||||
/*
|
||||
* Allocate the FileM request
|
||||
*/
|
||||
filem_request->num_procs = 1;
|
||||
filem_request->proc_name = (orte_process_name_t*)malloc(sizeof(orte_process_name_t) * filem_request->num_procs);
|
||||
|
||||
filem_request->num_targets = 1;
|
||||
|
||||
filem_request->target_flags = (int*)malloc(sizeof(int) * filem_request->num_targets);
|
||||
filem_request->target_flags[0] = ORTE_FILEM_TYPE_DIR;
|
||||
|
||||
/*
|
||||
* Gather each file
|
||||
* Construct a request for each file/directory to transfer
|
||||
* - start the non-blocking transfer
|
||||
*/
|
||||
for(item = opal_list_get_first(&global_snapshot.snapshots);
|
||||
item != opal_list_get_end(&global_snapshot.snapshots);
|
||||
@ -931,67 +933,100 @@ static int snapc_full_global_gather_all_files(void) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Construct the process information
|
||||
*/
|
||||
filem_request->proc_name[0].jobid = vpid_snapshot->process_name.jobid;
|
||||
filem_request->proc_name[0].vpid = vpid_snapshot->process_name.vpid;
|
||||
filem_request = OBJ_NEW(orte_filem_base_request_t);
|
||||
|
||||
/*
|
||||
* Construct the remote file name
|
||||
* Construct the process set
|
||||
*/
|
||||
tmp_argc = 0;
|
||||
opal_argv_append(&tmp_argc, &filem_request->remote_targets, vpid_snapshot->crs_snapshot_super.remote_location);
|
||||
p_set = OBJ_NEW(orte_filem_base_process_set_t);
|
||||
p_set->source.jobid = vpid_snapshot->process_name.jobid;
|
||||
p_set->source.vpid = vpid_snapshot->process_name.vpid;
|
||||
p_set->sink.jobid = orte_process_info.my_name->jobid;
|
||||
p_set->sink.vpid = orte_process_info.my_name->vpid;
|
||||
|
||||
opal_list_append(&(filem_request->process_sets), &(p_set->super) );
|
||||
|
||||
/*
|
||||
* Construct the local file name
|
||||
* Construct the file set
|
||||
*/
|
||||
tmp_argc = 0;
|
||||
f_set = OBJ_NEW(orte_filem_base_file_set_t);
|
||||
|
||||
local_dir = strdup(vpid_snapshot->crs_snapshot_super.local_location);
|
||||
opal_argv_append(&tmp_argc, &filem_request->local_targets, opal_dirname(local_dir));
|
||||
f_set->local_target = opal_dirname(local_dir);
|
||||
f_set->remote_target = strdup(vpid_snapshot->crs_snapshot_super.remote_location);
|
||||
f_set->target_flag = ORTE_FILEM_TYPE_DIR;
|
||||
|
||||
if( !orte_snapc_full_skip_filem ) {
|
||||
/*
|
||||
* Do the transfer
|
||||
*/
|
||||
if(ORTE_SUCCESS != (ret = orte_filem.get(filem_request) ) ) {
|
||||
exit_status = ret;
|
||||
/* Keep getting all the other files, eventually return an error */
|
||||
goto skip;
|
||||
}
|
||||
else {
|
||||
/*
|
||||
* Update the metadata file
|
||||
*/
|
||||
if(ORTE_SUCCESS != (ret = orte_snapc_base_add_vpid_metadata(&vpid_snapshot->process_name,
|
||||
global_snapshot.reference_name,
|
||||
vpid_snapshot->crs_snapshot_super.reference_name,
|
||||
vpid_snapshot->crs_snapshot_super.local_location))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
opal_list_append(&(filem_request->file_sets), &(f_set->super) );
|
||||
|
||||
/*
|
||||
* Once we have brought it locally, then remove the remote copy
|
||||
*/
|
||||
if(ORTE_SUCCESS != (ret = orte_filem.rm(filem_request)) ) {
|
||||
exit_status = ret;
|
||||
/* Keep getting all the other files, eventually return an error */
|
||||
}
|
||||
/*
|
||||
* Start the transfer
|
||||
*/
|
||||
opal_list_append(&all_filem_requests, &(filem_request->super));
|
||||
if(ORTE_SUCCESS != (ret = orte_filem.get_nb(filem_request) ) ) {
|
||||
opal_list_remove_item(&all_filem_requests, &(filem_request->super));
|
||||
OBJ_RELEASE(filem_request);
|
||||
filem_request = NULL;
|
||||
|
||||
exit_status = ret;
|
||||
/* Keep getting all the other files, eventually return an error */
|
||||
continue;
|
||||
}
|
||||
|
||||
tmp_argc = 0;
|
||||
opal_argv_delete(&tmp_argc, &filem_request->remote_targets, 0, 1);
|
||||
tmp_argc = 0;
|
||||
opal_argv_delete(&tmp_argc, &filem_request->local_targets, 0, 1);
|
||||
}
|
||||
skip:
|
||||
/* Do a bit of cleanup */
|
||||
opal_argv_free(filem_request->remote_targets);
|
||||
filem_request->remote_targets = NULL;
|
||||
opal_argv_free(filem_request->local_targets);
|
||||
filem_request->local_targets = NULL;
|
||||
|
||||
/*
|
||||
* Wait for all the transfers to complete
|
||||
*/
|
||||
opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
|
||||
"global) Getting remote directory: Waiting...\n");
|
||||
if(ORTE_SUCCESS != (ret = orte_filem.wait_all(&all_filem_requests) ) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Update all of the metadata
|
||||
*/
|
||||
opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
|
||||
"global) Getting remote directory: Updating Metadata...\n");
|
||||
for(item = opal_list_get_first(&global_snapshot.snapshots);
|
||||
item != opal_list_get_end(&global_snapshot.snapshots);
|
||||
item = opal_list_get_next(item) ) {
|
||||
orte_snapc_base_snapshot_t *vpid_snapshot;
|
||||
vpid_snapshot = (orte_snapc_base_snapshot_t*)item;
|
||||
|
||||
if(ORTE_SUCCESS != (ret = orte_snapc_base_add_vpid_metadata(&vpid_snapshot->process_name,
|
||||
global_snapshot.reference_name,
|
||||
vpid_snapshot->crs_snapshot_super.reference_name,
|
||||
vpid_snapshot->crs_snapshot_super.local_location))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Now that the files have been brought local, remove the remote copy
|
||||
*/
|
||||
for(item = opal_list_get_first( &all_filem_requests);
|
||||
item != opal_list_get_end( &all_filem_requests);
|
||||
item = opal_list_get_next( item) ) {
|
||||
filem_request = (orte_filem_base_request_t *) item;
|
||||
if(ORTE_SUCCESS != (ret = orte_filem.rm_nb(filem_request)) ) {
|
||||
exit_status = ret;
|
||||
/* Keep removing, eventually return an error */
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for all the removes to complete
|
||||
*/
|
||||
opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
|
||||
"global) Waiting for removes to complete...\n");
|
||||
if(ORTE_SUCCESS != (ret = orte_filem.wait_all(&all_filem_requests) ) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1003,5 +1038,10 @@ static int snapc_full_global_gather_all_files(void) {
|
||||
if(NULL != local_dir)
|
||||
free(local_dir);
|
||||
|
||||
while (NULL != (item = opal_list_remove_first(&all_filem_requests) ) ) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&all_filem_requests);
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
@ -859,7 +859,7 @@ static int snapc_full_local_start_checkpoint(orte_snapc_base_snapshot_t *vpid_sn
|
||||
local_dir = strdup(vpid_snapshot->crs_snapshot_super.local_location);
|
||||
local_dir = opal_dirname(local_dir);
|
||||
|
||||
asprintf(&command, "opal-checkpoint --where %s --name %s %s %d ",
|
||||
asprintf(&command, "opal-checkpoint -q --where %s --name %s %s %d ",
|
||||
local_dir,
|
||||
vpid_snapshot->crs_snapshot_super.reference_name,
|
||||
term_str, /* If we are to checkpoint then terminate */
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user