1
1

Bring over the jjh-filem branch which contains a non-blocking FileM interface

and implementation. This has shown drastic performance benefit when
transferring Many files at roughly the same time.

I tested this for many different filem operations and everything was working
fine. Let me know if you have any problems with this functionality.

Some Notes:
 - opal-checkpoint now has a 'quiet' flag to keep it from being too verbose.

 - FileM RSH component is fully non-blocking.

 - FileM RSH component has incomming connection throttling since by default
   ssh only allows 10 concurrent scp connections to any single host. This
   default can be adjusted via an MCA parameter.
    {{{-mca filem_rsh_max_incomming 10}}}

 - There is an MCA parameter for max outgoing connections, but it is currently
   not implemented. If someone needs it then it should not be hard to implement.
    {{{-mca filem_rsh_max_outgoing 10}}}

 - Changed the FileM request structure so that it is a bit more explicit and
   flexible.

 - Moved the 'preload-binary' and 'preload-files' functionality into odls/base
   allowing for code reuse in the 'process' and 'default' ODLS components.

 - Fixed a bug in the process name resolution which broke the 'preload-*'
   functionality due to GPR table structure changes.

 - The FileM RSH component might be able to see even more speedup from using a
   thread pool to operate on the work_pool structures, but that is for future
   work.

 - Added a 'opal-show-help' file to ODLS Base

This commit was SVN r16252.
Этот коммит содержится в:
Josh Hursey 2007-09-27 13:13:29 +00:00
родитель 670956e172
Коммит e10f476c87
18 изменённых файлов: 1806 добавлений и 647 удалений

Просмотреть файл

@ -92,6 +92,7 @@ typedef struct {
int pid;
bool term;
bool verbose;
bool quiet;
char *snapshot_name;
char *snapshot_loc;
int output;
@ -112,6 +113,12 @@ opal_cmd_line_init_t cmd_line_opts[] = {
&opal_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Be Verbose" },
{ NULL, NULL, NULL,
'q', NULL, "quiet",
0,
&opal_checkpoint_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
"Be Super Quiet" },
{ NULL, NULL, NULL,
'\0', NULL, "term",
0,
@ -175,10 +182,12 @@ main(int argc, char *argv[])
exit_status = ret;
goto cleanup;
}
opal_output(opal_checkpoint_globals.output,
"Local Snapshot Reference = %s\n",
fname);
if( !opal_checkpoint_globals.quiet ) {
opal_output(opal_checkpoint_globals.output,
"Local Snapshot Reference = %s\n",
fname);
}
cleanup:
/***************
@ -215,6 +224,7 @@ static int initialize(int argc, char *argv[]) {
* Setup OPAL Output handle from the verbose argument
*/
if( opal_checkpoint_globals.verbose ) {
opal_checkpoint_globals.quiet = false; /* Automaticly turn off quiet if it is set */
opal_checkpoint_globals.output = opal_output_open(NULL);
opal_output_set_verbosity(opal_checkpoint_globals.output, 10);
} else {
@ -364,15 +374,21 @@ notify_process_for_checkpoint(pid_t pid, char **fname, int term, opal_crs_state_
*/
if( 0 > (ret = access(prog_named_pipe_r, F_OK) )) {
/* File doesn't exist yet, keep waiting */
opal_output(0, "opal-checkpoint: File does not exit yet: <%s> rtn = %d (waited %d/%d sec)\n",
prog_named_pipe_r, ret, s, max_wait_time);
if( !opal_checkpoint_globals.quiet &&
s >= max_wait_time - 5 ) {
opal_output(0, "opal-checkpoint: File does not exist yet: <%s> rtn = %d (waited %d/%d sec)\n",
prog_named_pipe_r, ret, s, max_wait_time);
}
sleep(1);
continue;
}
else if( 0 > (ret = access(prog_named_pipe_w, F_OK) )) {
/* File doesn't exist yet, keep waiting */
opal_output(0, "opal-checkpoint: File does not exit yet: <%s> rtn = %d (waited %d/%d sec)\n",
prog_named_pipe_w, ret, s, max_wait_time);
if( !opal_checkpoint_globals.quiet &&
s >= max_wait_time - 5 ) {
opal_output(0, "opal-checkpoint: File does not exist yet: <%s> rtn = %d (waited %d/%d sec)\n",
prog_named_pipe_w, ret, s, max_wait_time);
}
sleep(1);
continue;
}

Просмотреть файл

@ -36,6 +36,12 @@ extern "C" {
/**
* FileM request object maintenance functions
*/
ORTE_DECLSPEC void orte_filem_base_process_set_construct(orte_filem_base_process_set_t *obj);
ORTE_DECLSPEC void orte_filem_base_process_set_destruct( orte_filem_base_process_set_t *obj);
ORTE_DECLSPEC void orte_filem_base_file_set_construct(orte_filem_base_file_set_t *obj);
ORTE_DECLSPEC void orte_filem_base_file_set_destruct( orte_filem_base_file_set_t *obj);
ORTE_DECLSPEC void orte_filem_base_construct(orte_filem_base_request_t *obj);
ORTE_DECLSPEC void orte_filem_base_destruct( orte_filem_base_request_t *obj);
@ -89,8 +95,13 @@ extern "C" {
int orte_filem_base_module_finalize(void);
int orte_filem_base_none_put(orte_filem_base_request_t *request);
int orte_filem_base_none_put_nb(orte_filem_base_request_t *request);
int orte_filem_base_none_get(orte_filem_base_request_t *request);
int orte_filem_base_none_get_nb(orte_filem_base_request_t *request);
int orte_filem_base_none_rm( orte_filem_base_request_t *request);
int orte_filem_base_none_rm_nb( orte_filem_base_request_t *request);
int orte_filem_base_none_wait( orte_filem_base_request_t *request);
int orte_filem_base_none_wait_all( opal_list_t *request_list);
/**
* Some utility functions
@ -109,6 +120,11 @@ extern "C" {
orte_rml_tag_t tag,
void* cbdata);
/**
* Setup request structure
*/
ORTE_DECLSPEC int orte_filem_base_prepare_request(orte_filem_base_request_t *request, int move_type);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -48,37 +48,97 @@
/******************
* Object Stuff
******************/
ORTE_DECLSPEC OBJ_CLASS_INSTANCE(orte_filem_base_process_set_t,
opal_list_item_t,
orte_filem_base_process_set_construct,
orte_filem_base_process_set_destruct);
ORTE_DECLSPEC void orte_filem_base_process_set_construct(orte_filem_base_process_set_t *req) {
req->source = *ORTE_NAME_INVALID;
req->sink = *ORTE_NAME_INVALID;
}
ORTE_DECLSPEC void orte_filem_base_process_set_destruct( orte_filem_base_process_set_t *req) {
req->source = *ORTE_NAME_INVALID;
req->sink = *ORTE_NAME_INVALID;
}
ORTE_DECLSPEC OBJ_CLASS_INSTANCE(orte_filem_base_file_set_t,
opal_list_item_t,
orte_filem_base_file_set_construct,
orte_filem_base_file_set_destruct);
ORTE_DECLSPEC void orte_filem_base_file_set_construct(orte_filem_base_file_set_t *req) {
req->local_target = NULL;
req->remote_target = NULL;
req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
}
ORTE_DECLSPEC void orte_filem_base_file_set_destruct( orte_filem_base_file_set_t *req) {
if( NULL != req->local_target ) {
free(req->local_target);
req->local_target = NULL;
}
if( NULL != req->remote_target ) {
free(req->remote_target);
req->remote_target = NULL;
}
req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
}
ORTE_DECLSPEC OBJ_CLASS_INSTANCE(orte_filem_base_request_t,
opal_list_item_t,
orte_filem_base_construct,
orte_filem_base_destruct);
ORTE_DECLSPEC void orte_filem_base_construct(orte_filem_base_request_t *req) {
req->num_targets = 0;
OBJ_CONSTRUCT(&req->process_sets, opal_list_t);
OBJ_CONSTRUCT(&req->file_sets, opal_list_t);
req->local_targets = NULL;
req->remote_targets = NULL;
req->target_flags = NULL;
req->num_mv = 0;
req->proc_name = NULL;
req->num_procs = 0;
req->is_done = NULL;
req->is_active = NULL;
req->exit_status = NULL;
req->movement_type = ORTE_FILEM_MOVE_TYPE_UNKNOWN;
}
ORTE_DECLSPEC void orte_filem_base_destruct( orte_filem_base_request_t *req) {
int i;
opal_list_item_t* item = NULL;
for(i = 0; i < req->num_targets; ++i) {
free(req->local_targets[i]);
free(req->remote_targets[i]);
while( NULL != (item = opal_list_remove_first(&req->process_sets)) ) {
OBJ_RELEASE(item);
}
free(req->target_flags);
req->num_targets = 0;
free(req->proc_name);
req->num_procs = 0;
OBJ_DESTRUCT(&req->process_sets);
while( NULL != (item = opal_list_remove_first(&req->file_sets)) ) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&req->file_sets);
req->num_mv = 0;
if( NULL != req->is_done ) {
free(req->is_done);
req->is_done = NULL;
}
if( NULL != req->is_active ) {
free(req->is_active);
req->is_active = NULL;
}
if( NULL != req->exit_status ) {
free(req->exit_status);
req->exit_status = NULL;
}
req->movement_type = ORTE_FILEM_MOVE_TYPE_UNKNOWN;
}
/***********************
@ -109,16 +169,41 @@ int orte_filem_base_none_put(orte_filem_base_request_t *request )
return ORTE_SUCCESS;
}
int orte_filem_base_none_put_nb(orte_filem_base_request_t *request )
{
return ORTE_SUCCESS;
}
int orte_filem_base_none_get(orte_filem_base_request_t *request)
{
return ORTE_SUCCESS;
}
int orte_filem_base_none_get_nb(orte_filem_base_request_t *request)
{
return ORTE_SUCCESS;
}
int orte_filem_base_none_rm(orte_filem_base_request_t *request)
{
return ORTE_SUCCESS;
}
int orte_filem_base_none_rm_nb(orte_filem_base_request_t *request)
{
return ORTE_SUCCESS;
}
int orte_filem_base_none_wait(orte_filem_base_request_t *request)
{
return ORTE_SUCCESS;
}
int orte_filem_base_none_wait_all(opal_list_t *request_list)
{
return ORTE_SUCCESS;
}
/********************
* Utility functions
********************/
@ -156,15 +241,7 @@ int orte_filem_base_get_proc_node_name(orte_process_name_t *proc, char **machine
/*
* Contact GPR and get the 'orte-node-name' for this process
*/
/* if it is the root then we need a different key :/ */
if(proc->jobid == 0 &&
proc->vpid == 0) {
keys[0] = ORTE_PROC_RML_CONTACT_KEY;
}
else {
keys[0] = ORTE_NODE_NAME_KEY;
}
keys[0] = ORTE_NODE_NAME_KEY;
keys[1] = NULL;
/*
@ -223,33 +300,6 @@ int orte_filem_base_get_proc_node_name(orte_process_name_t *proc, char **machine
}
}
if (proc->jobid == 0 && proc->vpid == 0) {
/* we have contact info -- need an IP address. This assumes
that we're using the OOB RML, but that's probably a safe
enough assumption in here. */
char *contact_info = *machine_name;
orte_process_name_t peer;
char **uris;
char *ip, *port;
*machine_name = NULL;
ret = orte_rml_base_parse_uris(contact_info, &peer, &uris);
free(contact_info);
if (ORTE_SUCCESS == ret) {
exit_status = ret;
goto cleanup;
}
ip = strrchr(uris[0], '/') + 1;
port = strrchr(uris[0], ':');
port[0] = '\0';
*machine_name = strdup(ip);
opal_argv_free(uris);
}
if (NULL == *machine_name ){
exit_status = ORTE_ERROR;
goto cleanup;
@ -447,3 +497,49 @@ void orte_filem_base_query_callback(int status,
return;
}
int orte_filem_base_prepare_request(orte_filem_base_request_t *request, int move_type)
{
int num_reqs = 0, i = 0;
if( ORTE_FILEM_MOVE_TYPE_RM == move_type ) {
num_reqs = opal_list_get_size(&request->process_sets);
}
else {
num_reqs = opal_list_get_size(&request->process_sets) * opal_list_get_size(&request->file_sets);
}
if( 0 >= num_reqs ) {
return ORTE_ERROR;
}
else {
if( NULL != request->is_done ) {
free(request->is_done);
request->is_done = NULL;
}
if( NULL != request->is_active ) {
free(request->is_active);
request->is_active = NULL;
}
if( NULL != request->exit_status ) {
free(request->exit_status);
request->exit_status = NULL;
}
request->num_mv = num_reqs;
request->is_done = (bool*) malloc(sizeof(bool) * num_reqs);
request->is_active = (bool*) malloc(sizeof(bool) * num_reqs);
request->exit_status = (int32_t*) malloc(sizeof(int32_t) * num_reqs);
for( i = 0; i < num_reqs; ++i) {
request->is_done[i] = false;
request->is_active[i] = false;
request->exit_status[i] = 0;
}
}
request->movement_type = move_type;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -61,7 +61,7 @@ int orte_filem_base_open(void)
if(0 != value) {
orte_filem_base_output = opal_output_open(NULL);
} else {
orte_filem_base_output = -1;
orte_filem_base_output = 0;
}
/*

Просмотреть файл

@ -47,6 +47,63 @@ extern "C" {
#define ORTE_FILEM_TYPE_DIR 1
#define ORTE_FILEM_TYPE_UNKNOWN 2
/**
* Type of moment
*/
#define ORTE_FILEM_MOVE_TYPE_PUT 0
#define ORTE_FILEM_MOVE_TYPE_GET 1
#define ORTE_FILEM_MOVE_TYPE_RM 2
#define ORTE_FILEM_MOVE_TYPE_UNKNOWN 3
/**
* Define a Process Set
*
* Source: A single source of the operation.
* Sink: Desitination of the operation.
*/
struct orte_filem_base_process_set_1_0_0_t {
/** This is an object, so must have a super */
opal_list_item_t super;
/** Source Process */
orte_process_name_t source;
/** Sink Process */
orte_process_name_t sink;
};
typedef struct orte_filem_base_process_set_1_0_0_t orte_filem_base_process_set_1_0_0_t;
typedef struct orte_filem_base_process_set_1_0_0_t orte_filem_base_process_set_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_filem_base_process_set_t);
/**
* Define a File Pair
*
* Local: Local file reference
* Remove: Remote file reference
*
* Note: If multiple process sinks are used it is assumed that the
* file reference is the same for each of the sinks. If this is not
* true then more than one filem request needs to be created.
*/
struct orte_filem_base_file_set_1_0_0_t {
/** This is an object, so must have a super */
opal_list_item_t super;
/* Local file reference */
char * local_target;
/* Remove file reference */
char * remote_target;
/* Type of file to move */
int target_flag;
};
typedef struct orte_filem_base_file_set_1_0_0_t orte_filem_base_file_set_1_0_0_t;
typedef struct orte_filem_base_file_set_1_0_0_t orte_filem_base_file_set_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_filem_base_file_set_t);
/**
* Definition of a file movement request
* This will allow:
@ -54,33 +111,52 @@ extern "C" {
* - to/from one or more processes
* in a single call of the API function. Allowing the implementation
* to optimize the sending/receiving of data.
* Used for the following:
*
*/
struct orte_filem_base_request_1_0_0_t {
/** This is an object, so must have a super */
opal_list_item_t super;
/** Number of targets in the:
* - local_targets
* - remote_targets
* - target_flags
* arrays. One variable since they
* are required to be all the same length
/*
* A list of process sets
*/
int num_targets;
opal_list_t process_sets;
/** Local mapping of targets */
char **local_targets;
/** Remote mapping of targets */
char **remote_targets;
/*
* A list of file pairings
*/
opal_list_t file_sets;
/** For each target, a flag regarding its type */
int *target_flags;
/*
* Internal use:
* Number of movements
*/
int num_mv;
/** List of processes to send/receive from */
orte_process_name_t *proc_name;
/** Number of processes in that array */
int num_procs;
/*
* Internal use:
* Boolean to indianate if transfer is complete
*/
bool *is_done;
/*
* Internal use:
* Boolean to indianate if transfer is active
*/
bool *is_active;
/*
* Internal use:
* Exit status of the copy command
*/
int32_t *exit_status;
/*
* Internal use:
* Movement type
*/
int movement_type;
};
typedef struct orte_filem_base_request_1_0_0_t orte_filem_base_request_1_0_0_t;
typedef struct orte_filem_base_request_1_0_0_t orte_filem_base_request_t;
@ -125,6 +201,22 @@ typedef int (*orte_filem_base_module_finalize_fn_t)
typedef int (*orte_filem_base_put_fn_t)
(orte_filem_base_request_t *request);
/**
* Put a file or directory on the remote machine (Async)
*
* Note: By using a relative path for the remote file/directory, the filem
* component will negotiate the correct absolute path for that file/directory
* for the remote machine.
*
* @param request FileM request describing the files/directories to send,
* the remote files/directories to use, and the processes to see the change.
*
* @return ORTE_SUCCESS on successful file transer
* @return ORTE_ERROR on failed file transfer
*/
typedef int (*orte_filem_base_put_nb_fn_t)
(orte_filem_base_request_t *request);
/**
* Get a file from the remote machine
*
@ -141,6 +233,22 @@ typedef int (*orte_filem_base_put_fn_t)
typedef int (*orte_filem_base_get_fn_t)
(orte_filem_base_request_t *request);
/**
* Get a file from the remote machine (Async)
*
* Note: By using a relative path for the remote file/directory, the filem
* component will negotiate the correct absolute path for that file/directory
* for the remote machine.
*
* @param request FileM request describing the files/directories to receive,
* the remote files/directories to use, and the processes to see the change.
*
* @return ORTE_SUCCESS on successful file transer
* @return ORTE_ERROR on failed file transfer
*/
typedef int (*orte_filem_base_get_nb_fn_t)
(orte_filem_base_request_t *request);
/**
* Remove a file from the remote machine
*
@ -157,6 +265,50 @@ typedef int (*orte_filem_base_get_fn_t)
typedef int (*orte_filem_base_rm_fn_t)
(orte_filem_base_request_t *request);
/**
* Remove a file from the remote machine (Async)
*
* Note: By using a relative path for the remote file/directory, the filem
* component will negotiate the correct absolute path for that file/directory
* for the remote machine.
*
* @param request FileM request describing the remote files/directories to remove,
* the processes to see the change.
*
* @return ORTE_SUCCESS on success
* @return ORTE_ERROR on fail
*/
typedef int (*orte_filem_base_rm_nb_fn_t)
(orte_filem_base_request_t *request);
/**
* Wait for a single file movement request to finish
*
* @param request FileM request describing the remote files/directories.
*
* The request must have been passed through one of the non-blocking functions
* before calling wait or wait_all otherwise ORTE_ERROR will be returned.
*
* @return ORTE_SUCCESS on success
* @return ORTE_ERROR on fail
*/
typedef int (*orte_filem_base_wait_fn_t)
(orte_filem_base_request_t *request);
/**
* Wait for a multiple file movement requests to finish
*
* @param request_list opal_list_t of FileM requests describing the remote files/directories.
*
* The request must have been passed through one of the non-blocking functions
* before calling wait or wait_all otherwise ORTE_ERROR will be returned.
*
* @return ORTE_SUCCESS on success
* @return ORTE_ERROR on fail
*/
typedef int (*orte_filem_base_wait_all_fn_t)
(opal_list_t *request_list);
/**
* Structure for FILEM v1.0.0 components.
*/
@ -190,11 +342,19 @@ struct orte_filem_base_module_1_0_0_t {
/** Put a file on the remote machine */
orte_filem_base_put_fn_t put;
orte_filem_base_put_nb_fn_t put_nb;
/** Get a file from the remote machine */
orte_filem_base_get_fn_t get;
orte_filem_base_get_nb_fn_t get_nb;
/** Remove a file on the remote machine */
orte_filem_base_rm_fn_t rm;
orte_filem_base_rm_nb_fn_t rm_nb;
/** Test functions for the non-blocking versions */
orte_filem_base_wait_fn_t wait;
orte_filem_base_wait_all_fn_t wait_all;
};
typedef struct orte_filem_base_module_1_0_0_t orte_filem_base_module_1_0_0_t;
typedef struct orte_filem_base_module_1_0_0_t orte_filem_base_module_t;

Просмотреть файл

@ -33,6 +33,10 @@
extern "C" {
#endif
#define ORTE_FILEM_RSH_ASK 0
#define ORTE_FILEM_RSH_ALLOW 1
#define ORTE_FILEM_RSH_DONE 2
/*
* Local Component structures
*/
@ -49,6 +53,9 @@ extern "C" {
typedef struct orte_filem_rsh_component_t orte_filem_rsh_component_t;
ORTE_MODULE_DECLSPEC extern orte_filem_rsh_component_t mca_filem_rsh_component;
extern int orte_filem_rsh_max_incomming;
extern int orte_filem_rsh_max_outgoing;
/*
* Module functions
*/
@ -57,9 +64,17 @@ extern "C" {
int orte_filem_rsh_module_init(void);
int orte_filem_rsh_module_finalize(void);
int orte_filem_base_rsh_put(orte_filem_base_request_t *request);
int orte_filem_base_rsh_get(orte_filem_base_request_t *request);
int orte_filem_base_rsh_rm( orte_filem_base_request_t *request);
int orte_filem_rsh_put(orte_filem_base_request_t *request);
int orte_filem_rsh_put_nb(orte_filem_base_request_t *request);
int orte_filem_rsh_get(orte_filem_base_request_t *request);
int orte_filem_rsh_get_nb(orte_filem_base_request_t *request);
int orte_filem_rsh_rm( orte_filem_base_request_t *request);
int orte_filem_rsh_rm_nb( orte_filem_base_request_t *request);
int orte_filem_rsh_wait( orte_filem_base_request_t *request);
int orte_filem_rsh_wait_all( opal_list_t *request_list);
#if defined(c_plusplus) || defined(__cplusplus)
}

Просмотреть файл

@ -34,6 +34,9 @@ const char *orte_filem_rsh_component_version_string =
static int filem_rsh_open(void);
static int filem_rsh_close(void);
int orte_filem_rsh_max_incomming = 10;
int orte_filem_rsh_max_outgoing = 10;
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
@ -119,6 +122,28 @@ static int filem_rsh_open(void)
"rsh",
&mca_filem_rsh_component.remote_sh_command);
mca_base_param_reg_int(&mca_filem_rsh_component.super.filem_version,
"max_incomming",
"Maximum number of incomming connections",
false, false,
orte_filem_rsh_max_incomming,
&orte_filem_rsh_max_incomming);
if( orte_filem_rsh_max_incomming <= 0 ) {
orte_filem_rsh_max_incomming = 1;
}
mca_base_param_reg_int(&mca_filem_rsh_component.super.filem_version,
"max_outgoing",
"Maximum number of out going connections (Currently not used)",
false, false,
orte_filem_rsh_max_outgoing,
&orte_filem_rsh_max_outgoing);
if( orte_filem_rsh_max_outgoing <= 0 ) {
orte_filem_rsh_max_outgoing = 1;
}
/*
* Debug Output
*/

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,5 +1,5 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
@ -16,6 +16,8 @@
# $HEADER$
#
dist_pkgdata_DATA += base/help-orte-odls-base.txt
headers += \
base/odls_private.h \
base/base.h

41
orte/mca/odls/base/help-orte-odls-base.txt Обычный файл
Просмотреть файл

@ -0,0 +1,41 @@
# -*- text -*-
#
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open RTE's ODLS Framework
#
[orte-odls-base:could-not-preload-binary]
WARNING: Could not preload the binary file.
Binary: %s
Will continue attempting to launch the process.
[orte-odls-base:could-not-preload-files]
WARNING: Could not preload the files specified.
Fileset: %s
Will continue attempting to launch the process.
[orte-odls-base:could-not-preload]
WARNING: Could not preload the requested files and directories.
Fileset: %s
Fileset: %s
Will continue attempting to launch the process.
[orte-odls-base:preload-file-exists]
WARNING: Could not preload specified file: File already exists.
Fileset: %s
Host: %s
Will continue attempting to launch the process.

Просмотреть файл

@ -20,6 +20,10 @@
#include "orte_config.h"
#include "orte/orte_constants.h"
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
@ -30,6 +34,11 @@
#include "orte/mca/smr/smr.h"
#include "orte/dss/dss.h"
#include "opal/util/show_help.h"
#include "opal/util/basename.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/filem/base/base.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/base/odls_private.h"
@ -121,3 +130,239 @@ int orte_odls_base_report_spawn(opal_list_t *children)
/* All done */
return ORTE_SUCCESS;
}
/*
* Preload all files for a single app context
*/
static int orte_odls_base_preload_append_binary(orte_app_context_t* context,
orte_filem_base_request_t *filem_request);
static int orte_odls_base_preload_append_files(orte_app_context_t* context,
orte_filem_base_request_t *filem_request);
static bool orte_odls_base_is_preload_local_dup(char *local_ref,
orte_filem_base_request_t *filem_request);
int orte_odls_base_preload_files_app_context(orte_app_context_t* app_context)
{
int ret, exit_status = ORTE_SUCCESS;
orte_filem_base_request_t *filem_request;
orte_filem_base_process_set_t *p_set = NULL;
/* Sanity Check - Make sure there are files to preload */
if(!app_context->preload_binary &&
NULL == app_context->preload_files) {
return exit_status;
}
filem_request = OBJ_NEW(orte_filem_base_request_t);
/* Define the process set */
p_set = OBJ_NEW(orte_filem_base_process_set_t);
p_set->source.jobid = orte_process_info.gpr_replica->jobid;
p_set->source.vpid = orte_process_info.gpr_replica->vpid;
p_set->sink.jobid = orte_process_info.my_name->jobid;
p_set->sink.vpid = orte_process_info.my_name->vpid;
opal_list_append(&(filem_request->process_sets), &(p_set->super) );
if(app_context->preload_binary) {
OPAL_OUTPUT_VERBOSE((1, orte_odls_globals.output,
"%s) Preload Binary...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if( ORTE_SUCCESS != (ret = orte_odls_base_preload_append_binary(app_context,
filem_request) ) ){
opal_show_help("help-orte-odls-base.txt",
"orte-odls-base:could-not-preload-binary",
true, app_context->app);
ORTE_ERROR_LOG(ret);
exit_status = ret;
/* Keep accumulating files anyway */
}
}
if( NULL != app_context->preload_files) {
OPAL_OUTPUT_VERBOSE((1, orte_odls_globals.output,
"%s) Preload Files... [%s]",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
app_context->preload_files));
if( ORTE_SUCCESS != (ret = orte_odls_base_preload_append_files(app_context,
filem_request) ) ){
opal_show_help("help-orte-odls-base.txt",
"orte-odls-base:could-not-preload-files",
true, app_context->preload_files);
ORTE_ERROR_LOG(ret);
exit_status = ret;
/* Keep accumulating files anyway */
}
}
/* Actually bring over the files - One app context at a time
* JJH: This could be improved for multiple app contexts by making
* this a non-blocking filem get and then waiting on all of
* the requests for all app contexts.
*/
if( ORTE_SUCCESS != (ret = orte_filem.get(filem_request)) ) {
opal_show_help("help-orte-odls-base.txt",
"orte-odls-base:could-not-preload",
true,
(app_context->preload_binary ? app_context->app : ""),
(NULL != app_context->preload_files ? app_context->preload_files : ""));
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
cleanup:
if( NULL != filem_request ) {
OBJ_RELEASE(filem_request);
filem_request = NULL;
}
return exit_status;
}
/*
* The difference between preloading a file, and a binary file is that
* we may need to update the app_context to reflect the placement of the binary file
* on the local machine.
*/
static int orte_odls_base_preload_append_binary(orte_app_context_t* context,
orte_filem_base_request_t *filem_request) {
char * local_bin = NULL;
orte_filem_base_file_set_t * f_set = NULL;
f_set = OBJ_NEW(orte_filem_base_file_set_t);
/* Local Placement */
asprintf(&local_bin, "%s/%s", orte_process_info.job_session_dir, opal_basename(context->app));
if(orte_odls_base_is_preload_local_dup(local_bin, filem_request) ) {
goto cleanup;
}
f_set->local_target = strdup(local_bin);
/* Remote reference */
f_set->remote_target = strdup(context->app);
/* Flag as a single file */
f_set->target_flag = ORTE_FILEM_TYPE_FILE;
/* Add to the request list */
opal_list_append(&(filem_request->file_sets), &(f_set->super) );
cleanup:
/*
* Adjust the process name to point to the new local version
*/
if( NULL != local_bin ) {
if(NULL != context->app) {
free(context->app);
context->app = NULL;
}
context->app = strdup(local_bin);
free(local_bin);
}
return ORTE_SUCCESS;
}
static int orte_odls_base_preload_append_files(orte_app_context_t* context,
orte_filem_base_request_t *filem_request) {
char * local_ref = NULL;
int i, remote_argc = 0;
char **remote_targets = NULL;
char * temp = NULL;
orte_filem_base_file_set_t * f_set = NULL;
remote_targets = opal_argv_split(context->preload_files, ',');
remote_argc = opal_argv_count(remote_targets);
for(i = 0; i < remote_argc; ++i) {
if(NULL != context->preload_files_dest_dir) {
if(context->preload_files_dest_dir[0] == '.') {
asprintf(&local_ref, "%s/%s/%s", context->cwd, context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
}
else {
asprintf(&local_ref, "%s/%s", context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
}
}
else {
/*
* If the preload_files_dest_dir is not specified
* If this is an absolute path, copy it to that path. Otherwise copy it to the cwd.
*/
if('/' == remote_targets[i][0]) {
asprintf(&local_ref, "%s", remote_targets[i]);
} else {
asprintf(&local_ref, "%s/%s", context->cwd, opal_basename(remote_targets[i]) );
}
}
asprintf(&temp, "test -e %s", local_ref);
if(0 == system(temp)) {
char hostname[MAXHOSTNAMELEN];
gethostname(hostname, sizeof(hostname));
opal_show_help("help-orte-odls-base.txt",
"orte-odls-base:preload-file-exists",
true, local_ref, hostname);
free(temp);
temp = NULL;
free(local_ref);
local_ref = NULL;
continue;
}
free(temp);
temp = NULL;
/*
* Is this a duplicate
*/
if(orte_odls_base_is_preload_local_dup(local_ref, filem_request) ) {
free(local_ref);
local_ref = NULL;
continue;
}
f_set = OBJ_NEW(orte_filem_base_file_set_t);
/* Local Placement */
f_set->local_target = strdup(local_ref);
/* Remote reference */
f_set->remote_target = strdup(remote_targets[i]);
/* Flag as unknown, let FileM figure it out */
f_set->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
/* Add to the request list */
opal_list_append(&(filem_request->file_sets), &(f_set->super) );
free(local_ref);
local_ref = NULL;
}
if(NULL != local_ref)
free(local_ref);
if(NULL != remote_targets)
opal_argv_free(remote_targets);
return ORTE_SUCCESS;
}
/*
* Keeps us from transfering the same file more than once.
*/
static bool orte_odls_base_is_preload_local_dup(char *local_ref,
orte_filem_base_request_t *filem_request) {
opal_list_item_t *item = NULL;
for (item = opal_list_get_first( &filem_request->file_sets);
item != opal_list_get_end( &filem_request->file_sets);
item = opal_list_get_next( item) ) {
orte_filem_base_file_set_t * f_set = (orte_filem_base_file_set_t*)item;
if(0 == strncmp(local_ref, f_set->local_target, strlen(local_ref)+1) ) {
return true;
}
}
return false;
}

Просмотреть файл

@ -96,6 +96,11 @@ int orte_odls_size_daemon_cmd(size_t *size, orte_daemon_cmd_flag_t *src, orte_da
int orte_odls_unpack_daemon_cmd(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
/*
* Preload binary/files functions
*/
ORTE_DECLSPEC int orte_odls_base_preload_files_app_context(orte_app_context_t* context);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -38,33 +38,6 @@ Errno: %d
This process may still be running and/or consuming resources.
[orte-pls-fork:could-not-preload-binary]
WARNING: Could not preload the binary file.
Binary: %s
Will continue attempting to launch the process.
[orte-pls-fork:could-not-preload-files]
WARNING: Could not preload the files specified.
Fileset: %s
Will continue attempting to launch the process.
[orte-pls-fork:could-not-preload]
WARNING: Could not preload the requested files and directories.
Fileset: %s
Will continue attempting to launch the process.
[orte-pls-fork:preload-file-exists]
WARNING: Could not preload specified file: File already exists.
Fileset: %s
Host: %s
Will continue attempting to launch the process.
[orte-odls-default:execv-error]
Could not execute the executable "%s": %s

Просмотреть файл

@ -90,8 +90,6 @@
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/filem/base/base.h"
#if OPAL_ENABLE_FT == 1
#include "orte/mca/snapc/snapc.h"
#endif
@ -99,12 +97,6 @@
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/odls/default/odls_default.h"
static int orte_pls_fork_preload_append_binary(orte_app_context_t* context,
orte_filem_base_request_t *filem_request);
static int orte_pls_fork_preload_append_files(orte_app_context_t* context,
orte_filem_base_request_t *filem_request);
static bool is_preload_local_dup(char *local_ref, orte_filem_base_request_t *filem_request);
/*
* External Interface
*/
@ -1160,7 +1152,6 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data)
opal_list_item_t *item, *item2;
bool quit_flag;
bool node_included;
orte_filem_base_request_t *filem_request;
char *job_str, *uri_file, *my_uri, *session_dir=NULL, *slot_str;
FILE *fp;
@ -1414,40 +1405,11 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data)
item != opal_list_get_end(&app_context_list);
item = opal_list_get_next(item)) {
app_item = (odls_default_app_context_t*)item;
if(app_item->app_context->preload_binary || NULL != app_item->app_context->preload_files) {
filem_request = OBJ_NEW(orte_filem_base_request_t);
filem_request->num_procs = 1;
filem_request->proc_name = (orte_process_name_t*)malloc(sizeof(orte_process_name_t) * filem_request->num_procs);
filem_request->proc_name[0].jobid = orte_process_info.gpr_replica->jobid;
filem_request->proc_name[0].vpid = orte_process_info.gpr_replica->vpid;
if(app_item->app_context->preload_binary) {
if( ORTE_SUCCESS != (rc = orte_pls_fork_preload_append_binary(app_item->app_context,
filem_request) ) ){
opal_show_help("help-orte-odls-default.txt",
"orte-odls-default:could-not-preload-binary",
true, app_item->app_context->app);
ORTE_ERROR_LOG(rc);
/* Keep accumulating files anyway */
}
}
if( NULL != app_item->app_context->preload_files) {
if( ORTE_SUCCESS != (rc = orte_pls_fork_preload_append_files(app_item->app_context,
filem_request) ) ){
opal_show_help("help-orte-odls-default.txt",
"orte-odls-default:could-not-preload-files",
true, app_item->app_context->preload_files);
ORTE_ERROR_LOG(rc);
/* Keep accumulating files anyway */
}
}
/* Actually bring over the files */
if( ORTE_SUCCESS != (rc = orte_filem.get(filem_request)) ) {
opal_show_help("help-orte-odls-default.txt",
"orte-odls-default:could-not-preload",
true, opal_argv_join(filem_request->local_targets, ' '));
if(app_item->app_context->preload_binary ||
NULL != app_item->app_context->preload_files) {
if( ORTE_SUCCESS != (rc = orte_odls_base_preload_files_app_context(app_item->app_context)) ) {
ORTE_ERROR_LOG(rc);
}
OBJ_DESTRUCT(filem_request);
}
}
@ -1745,146 +1707,3 @@ static void set_handler_default(int sig)
sigaction(sig, &act, (struct sigaction *)0);
}
/*
* The difference between preloading a file, and a binary file is that
* we may need to update the app_context to reflect the placement of the binary file
* on the local machine.
*/
static int orte_pls_fork_preload_append_binary(orte_app_context_t* context,
orte_filem_base_request_t *filem_request) {
char * local_bin = NULL;
int tmp_argc = 0;
/*
* Append the local placement
*/
asprintf(&local_bin, "%s/%s", orte_process_info.job_session_dir, opal_basename(context->app));
if(is_preload_local_dup(local_bin, filem_request) ) {
goto cleanup;
}
opal_argv_append(&filem_request->num_targets, &(filem_request->local_targets), local_bin);
/*
* Append the remote file
*/
tmp_argc = 0;
opal_argv_append(&tmp_argc, &filem_request->remote_targets, context->app);
/*
* Append the flag
*/
filem_request->target_flags = (int *)realloc(filem_request->target_flags,
sizeof(int) * (filem_request->num_targets + 1));
filem_request->target_flags[filem_request->num_targets-1] = ORTE_FILEM_TYPE_FILE;
cleanup:
/*
* Adjust the process name
*/
if(NULL != context->app)
free(context->app);
context->app = local_bin;
return ORTE_SUCCESS;
}
static int orte_pls_fork_preload_append_files(orte_app_context_t* context,
orte_filem_base_request_t *filem_request) {
char * local_ref = NULL;
int i, tmp_argc = 0, remote_argc = 0;
char **remote_targets = NULL;
char * temp = NULL;
remote_targets = opal_argv_split(context->preload_files, ',');
remote_argc = opal_argv_count(remote_targets);
for(i = 0; i < remote_argc; ++i) {
if(NULL != context->preload_files_dest_dir) {
if(context->preload_files_dest_dir[0] == '.') {
asprintf(&local_ref, "%s/%s/%s", context->cwd, context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
}
else {
asprintf(&local_ref, "%s/%s", context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
}
}
else {
/*
* If the preload_files_dest_dir is not specified
* If this is an absolute path, copy it to that path. Otherwise copy it to the cwd.
*/
if('/' == remote_targets[i][0]) {
asprintf(&local_ref, "%s", remote_targets[i]);
} else {
asprintf(&local_ref, "%s/%s", context->cwd, opal_basename(remote_targets[i]) );
}
}
asprintf(&temp, "test -e %s", local_ref);
if(0 == system(temp)) {
char hostname[MAXHOSTNAMELEN];
gethostname(hostname, sizeof(hostname));
opal_show_help("help-orte-pls-fork.txt",
"orte-pls-fork:preload-file-exists",
true, local_ref, hostname);
free(temp);
temp = NULL;
free(local_ref);
local_ref = NULL;
continue;
}
free(temp);
temp = NULL;
/*
* Is this a duplicate
*/
if(is_preload_local_dup(local_ref, filem_request) ) {
free(local_ref);
local_ref = NULL;
continue;
}
/*
* Append the local files we want
*/
opal_argv_append(&filem_request->num_targets, &filem_request->local_targets, local_ref);
/*
* Append the remote files we want
*/
tmp_argc = filem_request->num_targets - 1;
opal_argv_append(&tmp_argc, &filem_request->remote_targets, remote_targets[i]);
/*
* Set the flags
*/
filem_request->target_flags = (int *)realloc(filem_request->target_flags, sizeof(int) * 1);
filem_request->target_flags[filem_request->num_targets-1] = ORTE_FILEM_TYPE_UNKNOWN;
free(local_ref);
local_ref = NULL;
}
if(NULL != local_ref)
free(local_ref);
if(NULL != remote_targets)
opal_argv_free(remote_targets);
return ORTE_SUCCESS;
}
/*
* Keeps us from transfering the same file more than once.
*/
static bool is_preload_local_dup(char *local_ref, orte_filem_base_request_t *filem_request) {
int i;
for(i = 0; i < filem_request->num_targets; ++i) {
if(0 == strncmp(local_ref, filem_request->local_targets[i], strlen(local_ref)+1) ) {
return true;
}
}
return false;
}

Просмотреть файл

@ -58,18 +58,11 @@
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/filem/base/base.h"
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/odls/process/odls_process.h"
static void set_handler_default(int sig);
static bool is_preload_local_dup(char *local_ref, orte_filem_base_request_t *filem_request);
static int orte_pls_fork_preload_append_files(orte_app_context_t* context,
orte_filem_base_request_t *filem_request);
static int orte_pls_fork_preload_append_binary(orte_app_context_t* context,
orte_filem_base_request_t *filem_request);
static int orte_odls_process_get_add_procs_data(orte_gpr_notify_data_t **data,
orte_job_map_t *map)
@ -775,7 +768,6 @@ static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data)
opal_list_item_t *item, *item2;
bool quit_flag;
bool node_included;
orte_filem_base_request_t *filem_request;
char *job_str, *uri_file, *my_uri, *session_dir=NULL;
FILE *fp;
@ -1003,40 +995,11 @@ static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data)
item != opal_list_get_end(&app_context_list);
item = opal_list_get_next(item)) {
app_item = (odls_process_app_context_t*)item;
if(app_item->app_context->preload_binary || NULL != app_item->app_context->preload_files) {
filem_request = OBJ_NEW(orte_filem_base_request_t);
filem_request->num_procs = 1;
filem_request->proc_name = (orte_process_name_t*)malloc(sizeof(orte_process_name_t) * filem_request->num_procs);
filem_request->proc_name[0].jobid = orte_process_info.gpr_replica->jobid;
filem_request->proc_name[0].vpid = orte_process_info.gpr_replica->vpid;
if(app_item->app_context->preload_binary) {
if( ORTE_SUCCESS != (rc = orte_pls_fork_preload_append_binary(app_item->app_context,
filem_request) ) ){
opal_show_help("help-orte-odls-default.txt",
"orte-odls-default:could-not-preload-binary",
true, app_item->app_context->app);
ORTE_ERROR_LOG(rc);
/* Keep accumulating files anyway */
}
}
if( NULL != app_item->app_context->preload_files) {
if( ORTE_SUCCESS != (rc = orte_pls_fork_preload_append_files(app_item->app_context,
filem_request) ) ){
opal_show_help("help-orte-odls-default.txt",
"orte-odls-default:could-not-preload-files",
true, app_item->app_context->preload_files);
ORTE_ERROR_LOG(rc);
/* Keep accumulating files anyway */
}
}
/* Actually bring over the files */
if( ORTE_SUCCESS != (rc = orte_filem.get(filem_request)) ) {
opal_show_help("help-orte-odls-default.txt",
"orte-odls-default:could-not-preload",
true, opal_argv_join(filem_request->local_targets, ' '));
if(app_item->app_context->preload_binary ||
NULL != app_item->app_context->preload_files) {
if( ORTE_SUCCESS != (rc = orte_odls_base_preload_files_app_context(app_item->app_context)) ) {
ORTE_ERROR_LOG(rc);
}
OBJ_DESTRUCT(filem_request);
}
}
@ -1293,144 +1256,3 @@ orte_odls_base_module_1_3_0_t orte_odls_process_module = {
orte_odls_process_kill_local_procs,
orte_odls_process_signal_local_proc
};
/*
* The difference between preloading a file, and a binary file is that
* we may need to update the app_context to reflect the placement of the binary file
* on the local machine.
*/
static int orte_pls_fork_preload_append_binary(orte_app_context_t* context,
orte_filem_base_request_t *filem_request) {
char * local_bin = NULL;
int tmp_argc = 0;
/*
* Append the local placement
*/
asprintf(&local_bin, "%s/%s", orte_process_info.job_session_dir, opal_basename(context->app));
if(is_preload_local_dup(local_bin, filem_request) ) {
goto cleanup;
}
opal_argv_append(&filem_request->num_targets, &(filem_request->local_targets), local_bin);
/*
* Append the remote file
*/
tmp_argc = 0;
opal_argv_append(&tmp_argc, &filem_request->remote_targets, context->app);
/*
* Append the flag
*/
filem_request->target_flags = (int *)realloc(filem_request->target_flags,
sizeof(int) * (filem_request->num_targets + 1));
filem_request->target_flags[filem_request->num_targets-1] = ORTE_FILEM_TYPE_FILE;
cleanup:
/*
* Adjust the process name
*/
if(NULL != context->app)
free(context->app);
context->app = local_bin;
return ORTE_SUCCESS;
}
static int orte_pls_fork_preload_append_files(orte_app_context_t* context,
orte_filem_base_request_t *filem_request) {
char * local_ref = NULL;
int i, tmp_argc = 0, remote_argc = 0;
char **remote_targets = NULL;
char * temp = NULL;
remote_targets = opal_argv_split(context->preload_files, ',');
remote_argc = opal_argv_count(remote_targets);
for(i = 0; i < remote_argc; ++i) {
if(NULL != context->preload_files_dest_dir) {
if(context->preload_files_dest_dir[0] == '.') {
asprintf(&local_ref, "%s/%s/%s", context->cwd, context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
}
else {
asprintf(&local_ref, "%s/%s", context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
}
}
else {
/*
* If the preload_files_dest_dir is not specified
* If this is an absolute path, copy it to that path. Otherwise copy it to the cwd.
*/
if('/' == remote_targets[i][0]) {
asprintf(&local_ref, "%s", remote_targets[i]);
} else {
asprintf(&local_ref, "%s/%s", context->cwd, opal_basename(remote_targets[i]) );
}
}
asprintf(&temp, "test -e %s", local_ref);
if(0 == system(temp)) {
char hostname[MAXHOSTNAMELEN];
gethostname(hostname, sizeof(hostname));
opal_show_help("help-orte-pls-fork.txt",
"orte-pls-fork:preload-file-exists",
true, local_ref, hostname);
free(temp);
temp = NULL;
free(local_ref);
local_ref = NULL;
continue;
}
free(temp);
temp = NULL;
/*
* Is this a duplicate
*/
if(is_preload_local_dup(local_ref, filem_request) ) {
free(local_ref);
local_ref = NULL;
continue;
}
/*
* Append the local files we want
*/
opal_argv_append(&filem_request->num_targets, &filem_request->local_targets, local_ref);
/*
* Append the remote files we want
*/
tmp_argc = filem_request->num_targets - 1;
opal_argv_append(&tmp_argc, &filem_request->remote_targets, remote_targets[i]);
/*
* Set the flags
*/
filem_request->target_flags = (int *)realloc(filem_request->target_flags, sizeof(int) * 1);
filem_request->target_flags[filem_request->num_targets-1] = ORTE_FILEM_TYPE_UNKNOWN;
free(local_ref);
local_ref = NULL;
}
if(NULL != local_ref)
free(local_ref);
if(NULL != remote_targets)
opal_argv_free(remote_targets);
return ORTE_SUCCESS;
}
/*
* Keeps us from transfering the same file more than once.
*/
static bool is_preload_local_dup(char *local_ref, orte_filem_base_request_t *filem_request) {
int i;
for(i = 0; i < filem_request->num_targets; ++i) {
if(0 == strncmp(local_ref, filem_request->local_targets[i], strlen(local_ref)+1) ) {
return true;
}
}
return false;
}

Просмотреть файл

@ -86,6 +86,8 @@ BEGIN_C_DECLS
#define ORTE_RML_TAG_COMM_CID_INTRA 28
/* For FileM RSH Component */
#define ORTE_RML_TAG_FILEM_RSH 29
/* For CRCP Coord Component */
#define OMPI_CRCP_COORD_BOOKMARK_TAG 4242

Просмотреть файл

@ -854,8 +854,12 @@ static int snapc_full_global_gather_all_files(void) {
int ret, exit_status = ORTE_SUCCESS;
opal_list_item_t* item = NULL;
char * local_dir = NULL;
orte_filem_base_request_t *filem_request = OBJ_NEW(orte_filem_base_request_t);
int tmp_argc = 0;
orte_filem_base_request_t *filem_request = NULL;
orte_filem_base_process_set_t *p_set = NULL;
orte_filem_base_file_set_t * f_set = NULL;
opal_list_t all_filem_requests;
OBJ_CONSTRUCT(&all_filem_requests, opal_list_t);
/*
* If it is stored in place, then we do not need to transfer anything
@ -893,23 +897,21 @@ static int snapc_full_global_gather_all_files(void) {
}
}
}
/*
* If we just want to pretend to do the filem
*/
else if(orte_snapc_full_skip_filem) {
exit_status = ORTE_SUCCESS;
goto cleanup;
}
/*
* If *not* stored in place then use FileM to transfer the files and cleanup
*/
else {
/*
* Allocate the FileM request
*/
filem_request->num_procs = 1;
filem_request->proc_name = (orte_process_name_t*)malloc(sizeof(orte_process_name_t) * filem_request->num_procs);
filem_request->num_targets = 1;
filem_request->target_flags = (int*)malloc(sizeof(int) * filem_request->num_targets);
filem_request->target_flags[0] = ORTE_FILEM_TYPE_DIR;
/*
* Gather each file
* Construct a request for each file/directory to transfer
* - start the non-blocking transfer
*/
for(item = opal_list_get_first(&global_snapshot.snapshots);
item != opal_list_get_end(&global_snapshot.snapshots);
@ -931,67 +933,100 @@ static int snapc_full_global_gather_all_files(void) {
goto cleanup;
}
/*
* Construct the process information
*/
filem_request->proc_name[0].jobid = vpid_snapshot->process_name.jobid;
filem_request->proc_name[0].vpid = vpid_snapshot->process_name.vpid;
filem_request = OBJ_NEW(orte_filem_base_request_t);
/*
* Construct the remote file name
* Construct the process set
*/
tmp_argc = 0;
opal_argv_append(&tmp_argc, &filem_request->remote_targets, vpid_snapshot->crs_snapshot_super.remote_location);
p_set = OBJ_NEW(orte_filem_base_process_set_t);
p_set->source.jobid = vpid_snapshot->process_name.jobid;
p_set->source.vpid = vpid_snapshot->process_name.vpid;
p_set->sink.jobid = orte_process_info.my_name->jobid;
p_set->sink.vpid = orte_process_info.my_name->vpid;
opal_list_append(&(filem_request->process_sets), &(p_set->super) );
/*
* Construct the local file name
* Construct the file set
*/
tmp_argc = 0;
f_set = OBJ_NEW(orte_filem_base_file_set_t);
local_dir = strdup(vpid_snapshot->crs_snapshot_super.local_location);
opal_argv_append(&tmp_argc, &filem_request->local_targets, opal_dirname(local_dir));
f_set->local_target = opal_dirname(local_dir);
f_set->remote_target = strdup(vpid_snapshot->crs_snapshot_super.remote_location);
f_set->target_flag = ORTE_FILEM_TYPE_DIR;
if( !orte_snapc_full_skip_filem ) {
/*
* Do the transfer
*/
if(ORTE_SUCCESS != (ret = orte_filem.get(filem_request) ) ) {
exit_status = ret;
/* Keep getting all the other files, eventually return an error */
goto skip;
}
else {
/*
* Update the metadata file
*/
if(ORTE_SUCCESS != (ret = orte_snapc_base_add_vpid_metadata(&vpid_snapshot->process_name,
global_snapshot.reference_name,
vpid_snapshot->crs_snapshot_super.reference_name,
vpid_snapshot->crs_snapshot_super.local_location))) {
exit_status = ret;
goto cleanup;
}
}
opal_list_append(&(filem_request->file_sets), &(f_set->super) );
/*
* Once we have brought it locally, then remove the remote copy
*/
if(ORTE_SUCCESS != (ret = orte_filem.rm(filem_request)) ) {
exit_status = ret;
/* Keep getting all the other files, eventually return an error */
}
/*
* Start the transfer
*/
opal_list_append(&all_filem_requests, &(filem_request->super));
if(ORTE_SUCCESS != (ret = orte_filem.get_nb(filem_request) ) ) {
opal_list_remove_item(&all_filem_requests, &(filem_request->super));
OBJ_RELEASE(filem_request);
filem_request = NULL;
exit_status = ret;
/* Keep getting all the other files, eventually return an error */
continue;
}
tmp_argc = 0;
opal_argv_delete(&tmp_argc, &filem_request->remote_targets, 0, 1);
tmp_argc = 0;
opal_argv_delete(&tmp_argc, &filem_request->local_targets, 0, 1);
}
skip:
/* Do a bit of cleanup */
opal_argv_free(filem_request->remote_targets);
filem_request->remote_targets = NULL;
opal_argv_free(filem_request->local_targets);
filem_request->local_targets = NULL;
/*
* Wait for all the transfers to complete
*/
opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
"global) Getting remote directory: Waiting...\n");
if(ORTE_SUCCESS != (ret = orte_filem.wait_all(&all_filem_requests) ) ) {
exit_status = ret;
goto cleanup;
}
/*
* Update all of the metadata
*/
opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
"global) Getting remote directory: Updating Metadata...\n");
for(item = opal_list_get_first(&global_snapshot.snapshots);
item != opal_list_get_end(&global_snapshot.snapshots);
item = opal_list_get_next(item) ) {
orte_snapc_base_snapshot_t *vpid_snapshot;
vpid_snapshot = (orte_snapc_base_snapshot_t*)item;
if(ORTE_SUCCESS != (ret = orte_snapc_base_add_vpid_metadata(&vpid_snapshot->process_name,
global_snapshot.reference_name,
vpid_snapshot->crs_snapshot_super.reference_name,
vpid_snapshot->crs_snapshot_super.local_location))) {
exit_status = ret;
goto cleanup;
}
}
/*
* Now that the files have been brought local, remove the remote copy
*/
for(item = opal_list_get_first( &all_filem_requests);
item != opal_list_get_end( &all_filem_requests);
item = opal_list_get_next( item) ) {
filem_request = (orte_filem_base_request_t *) item;
if(ORTE_SUCCESS != (ret = orte_filem.rm_nb(filem_request)) ) {
exit_status = ret;
/* Keep removing, eventually return an error */
continue;
}
}
/*
* Wait for all the removes to complete
*/
opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
"global) Waiting for removes to complete...\n");
if(ORTE_SUCCESS != (ret = orte_filem.wait_all(&all_filem_requests) ) ) {
exit_status = ret;
goto cleanup;
}
}
/*
@ -1003,5 +1038,10 @@ static int snapc_full_global_gather_all_files(void) {
if(NULL != local_dir)
free(local_dir);
while (NULL != (item = opal_list_remove_first(&all_filem_requests) ) ) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&all_filem_requests);
return exit_status;
}

Просмотреть файл

@ -859,7 +859,7 @@ static int snapc_full_local_start_checkpoint(orte_snapc_base_snapshot_t *vpid_sn
local_dir = strdup(vpid_snapshot->crs_snapshot_super.local_location);
local_dir = opal_dirname(local_dir);
asprintf(&command, "opal-checkpoint --where %s --name %s %s %d ",
asprintf(&command, "opal-checkpoint -q --where %s --name %s %s %d ",
local_dir,
vpid_snapshot->crs_snapshot_super.reference_name,
term_str, /* If we are to checkpoint then terminate */