1
1

Fixes for --preload-files and --preload-binary.

* Improved the error propagation from a backend orted
* Fixed a hang in orterun caused by failed file transfers
* Fix the movement of files with relative path names
* Improved error messages when a file cannot be moved
* Move file checks to FileM instead of embedding them in the ODLS

This commit Refs trac:1770

This commit was SVN r20331.

The following Trac tickets were found above:
  Ticket 1770 --> https://svn.open-mpi.org/trac/ompi/ticket/1770
Этот коммит содержится в:
Josh Hursey 2009-01-23 15:32:24 +00:00
родитель f9c5adb86f
Коммит 04c69b8a82
5 изменённых файлов: 185 добавлений и 37 удалений

Просмотреть файл

@ -48,6 +48,7 @@
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/filem/filem.h" #include "orte/mca/filem/filem.h"
#include "orte/mca/filem/base/base.h" #include "orte/mca/filem/base/base.h"
@ -84,7 +85,11 @@ static void orte_filem_rsh_permission_callback(int status,
orte_rml_tag_t tag, orte_rml_tag_t tag,
void* cbdata); void* cbdata);
static int orte_filem_rsh_permission_ask(orte_process_name_t* sender, int num_sends); static int orte_filem_rsh_permission_ask(orte_process_name_t* sender, int num_sends);
static int permission_send_done(orte_process_name_t* sender, int num_avail); static int permission_send_done(orte_process_name_t* sender, int num_avail,
int32_t exit_status,
char * local_target,
char * remote_target,
char * command);
static int permission_send_num_allowed(orte_process_name_t* sender, int num_allowed); static int permission_send_num_allowed(orte_process_name_t* sender, int num_allowed);
@ -260,9 +265,11 @@ int orte_filem_rsh_module_finalize(void)
/* /*
* Make sure all active requests are completed * Make sure all active requests are completed
*/ */
#if 0
while(0 < opal_list_get_size(&work_pool_active) ) { while(0 < opal_list_get_size(&work_pool_active) ) {
; /* JJH TODO... */ ; /* JJH TODO... */
} }
#endif
/* /*
* Stop the listeners * Stop the listeners
@ -307,7 +314,7 @@ int orte_filem_rsh_put(orte_filem_base_request_t *request)
if( ORTE_SUCCESS != (ret = orte_filem_base_prepare_request(request, ORTE_FILEM_MOVE_TYPE_PUT) ) ) { if( ORTE_SUCCESS != (ret = orte_filem_base_prepare_request(request, ORTE_FILEM_MOVE_TYPE_PUT) ) ) {
opal_output(mca_filem_rsh_component.super.output_handle, opal_output(mca_filem_rsh_component.super.output_handle,
"filem:rsh: put(): Failed to preare the request structure (%d)", ret); "filem:rsh: put(): Failed to prepare the request structure (%d)", ret);
return ret; return ret;
} }
@ -332,7 +339,7 @@ int orte_filem_rsh_put_nb(orte_filem_base_request_t *request)
if( ORTE_SUCCESS != (ret = orte_filem_base_prepare_request(request, ORTE_FILEM_MOVE_TYPE_PUT) ) ) { if( ORTE_SUCCESS != (ret = orte_filem_base_prepare_request(request, ORTE_FILEM_MOVE_TYPE_PUT) ) ) {
opal_output(mca_filem_rsh_component.super.output_handle, opal_output(mca_filem_rsh_component.super.output_handle,
"filem:rsh: put(): Failed to preare the request structure (%d)", ret); "filem:rsh: put(): Failed to prepare the request structure (%d)", ret);
return ret; return ret;
} }
@ -351,7 +358,7 @@ int orte_filem_rsh_get(orte_filem_base_request_t *request)
if( ORTE_SUCCESS != (ret = orte_filem_base_prepare_request(request, ORTE_FILEM_MOVE_TYPE_GET) ) ) { if( ORTE_SUCCESS != (ret = orte_filem_base_prepare_request(request, ORTE_FILEM_MOVE_TYPE_GET) ) ) {
opal_output(mca_filem_rsh_component.super.output_handle, opal_output(mca_filem_rsh_component.super.output_handle,
"filem:rsh: get(): Failed to preare the request structure (%d)", ret); "filem:rsh: get(): Failed to prepare the request structure (%d)", ret);
return ret; return ret;
} }
@ -376,7 +383,7 @@ int orte_filem_rsh_get_nb(orte_filem_base_request_t *request)
if( ORTE_SUCCESS != (ret = orte_filem_base_prepare_request(request, ORTE_FILEM_MOVE_TYPE_GET) ) ) { if( ORTE_SUCCESS != (ret = orte_filem_base_prepare_request(request, ORTE_FILEM_MOVE_TYPE_GET) ) ) {
opal_output(mca_filem_rsh_component.super.output_handle, opal_output(mca_filem_rsh_component.super.output_handle,
"filem:rsh: get(): Failed to preare the request structure (%d)", ret); "filem:rsh: get(): Failed to prepare the request structure (%d)", ret);
return ret; return ret;
} }
@ -504,7 +511,10 @@ int orte_filem_rsh_wait(orte_filem_base_request_t *request)
request->is_active[i] = true; request->is_active[i] = true;
/* Tell peer we are finished with a send */ /* Tell peer we are finished with a send */
permission_send_done(&(wp_item->proc_set.source), 1); permission_send_done(&(wp_item->proc_set.source), 1, request->exit_status[i],
wp_item->file_set.local_target,
wp_item->file_set.remote_target,
wp_item->command);
OBJ_RELEASE(wp_item); OBJ_RELEASE(wp_item);
wp_item = NULL; wp_item = NULL;
@ -583,6 +593,64 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
p_item = opal_list_get_next( p_item) ) { p_item = opal_list_get_next( p_item) ) {
orte_filem_base_process_set_t * p_set = (orte_filem_base_process_set_t*)p_item; orte_filem_base_process_set_t * p_set = (orte_filem_base_process_set_t*)p_item;
/*
* If the source and sink are the same, then this is a local operation
* Further if the files are the same, then nothing to do
*/
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &p_set->source, &p_set->sink) ) {
if( 0 == strncmp(f_set->local_target, f_set->remote_target, strlen(f_set->remote_target) ) ) {
request->is_done[cur_index] = true;
request->is_active[cur_index] = true;
request->exit_status[cur_index] = 0;
goto continue_set;
}
}
if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) {
/*
* The file should exist if we are going to put it somewhere else
*/
if( 0 != access(f_set->local_target, R_OK) ) {
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
"filem:rsh: copy(): %s -> %s: Error: Cannot move file %s to %s. Does not exist at source\n",
ORTE_NAME_PRINT(&p_set->source),
ORTE_NAME_PRINT(&p_set->sink),
f_set->local_target,
f_set->remote_target));
orte_show_help("help-orte-filem-rsh.txt",
"orte-filem-rsh:get-file-not-exist",
true, f_set->local_target, orte_process_info.nodename);
request->is_done[cur_index] = true;
request->is_active[cur_index] = true;
request->exit_status[cur_index] = -1;
goto continue_set;
}
}
/* Do not check a local get() operation, to help suppress the warnings from the HNP */
else if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &p_set->source, &p_set->sink) ) {
/*
* The file should not exist if we are getting a file with the
* same name since we do not want to overwrite the filename
* without the users consent.
*/
if( 0 == access(f_set->local_target, R_OK) ) {
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
"filem:rsh: copy(): %s -> %s: Error: Cannot move file %s to %s. Already exists at destination\n",
ORTE_NAME_PRINT(&p_set->source),
ORTE_NAME_PRINT(&p_set->sink),
f_set->remote_target,
f_set->local_target));
orte_show_help("help-orte-filem-rsh.txt",
"orte-filem-rsh:get-file-exists",
true, f_set->local_target, orte_process_info.nodename);
request->is_done[cur_index] = true;
request->is_active[cur_index] = true;
request->exit_status[cur_index] = -1;
goto continue_set;
}
}
if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) { if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) {
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle, OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
"filem:rsh: copy(): %s -> %s: Moving file %s to %s\n", "filem:rsh: copy(): %s -> %s: Moving file %s to %s\n",
@ -650,7 +718,11 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
} }
else if(ORTE_FILEM_TYPE_UNKNOWN == f_set->target_flag) { else if(ORTE_FILEM_TYPE_UNKNOWN == f_set->target_flag) {
opal_output(mca_filem_rsh_component.super.output_handle, opal_output(mca_filem_rsh_component.super.output_handle,
"filem:rsh: copy(): Error: File type unknown"); "filem:rsh: copy(): Error: File type unknown (%s)",
f_set->remote_target);
request->is_done[cur_index] = true;
request->is_active[cur_index] = true;
request->exit_status[cur_index] = -1;
goto continue_set; goto continue_set;
} }
else { else {
@ -1086,6 +1158,7 @@ static void orte_filem_rsh_permission_callback(int status,
orte_std_cntr_t n; orte_std_cntr_t n;
int num_req, num_allowed = 0; int num_req, num_allowed = 0;
int perm_flag, i; int perm_flag, i;
int32_t peer_status = 0;
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle, OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
"filem:rsh: permission_callback(? ?): Peer %s ...", "filem:rsh: permission_callback(? ?): Peer %s ...",
@ -1218,6 +1291,48 @@ static void orte_filem_rsh_permission_callback(int status,
cur_num_incomming -= num_req; cur_num_incomming -= num_req;
/*
* Receive the exit status
*/
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &peer_status, &n, OPAL_INT32))) {
goto cleanup;
}
if( peer_status != 0 ) {
char * local_target = NULL;
char * remote_target = NULL;
char * remote_cmd = NULL;
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &local_target, &n, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
goto cleanup;
}
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &remote_target, &n, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
goto cleanup;
}
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &remote_cmd, &n, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
goto cleanup;
}
orte_show_help("help-orte-filem-rsh.txt",
"orte-filem-rsh:remote-get-failed",
true, ORTE_NAME_PRINT(sender), peer_status,
local_target,
remote_target,
remote_cmd);
free(local_target);
free(remote_target);
free(remote_cmd);
}
/* /*
* For each open slot, notify a waiting peer that it may send * For each open slot, notify a waiting peer that it may send
*/ */
@ -1271,7 +1386,11 @@ static int orte_filem_rsh_permission_ask(orte_process_name_t* source,
return exit_status; return exit_status;
} }
static int permission_send_done(orte_process_name_t* peer, int num_avail) { static int permission_send_done(orte_process_name_t* peer, int num_avail,
int32_t status,
char * local_target,
char * remote_target,
char * command) {
int ret, exit_status = ORTE_SUCCESS; int ret, exit_status = ORTE_SUCCESS;
opal_buffer_t loc_buffer; opal_buffer_t loc_buffer;
int perm_flag = ORTE_FILEM_RSH_DONE; int perm_flag = ORTE_FILEM_RSH_DONE;
@ -1288,6 +1407,28 @@ static int permission_send_done(orte_process_name_t* peer, int num_avail) {
goto cleanup; goto cleanup;
} }
if (ORTE_SUCCESS != (ret = opal_dss.pack(&loc_buffer, &status, 1, OPAL_INT32))) {
exit_status = ret;
goto cleanup;
}
if( status != 0 ) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(&loc_buffer, &local_target, 1, OPAL_STRING))) {
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&loc_buffer, &remote_target, 1, OPAL_STRING))) {
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&loc_buffer, &command, 1, OPAL_STRING))) {
exit_status = ret;
goto cleanup;
}
}
if (0 > (ret = orte_rml.send_buffer(peer, &loc_buffer, ORTE_RML_TAG_FILEM_RSH, 0))) { if (0 > (ret = orte_rml.send_buffer(peer, &loc_buffer, ORTE_RML_TAG_FILEM_RSH, 0))) {
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;

Просмотреть файл

@ -18,3 +18,29 @@
# #
# This is the US/English general help file for ORTE FileM framework. # This is the US/English general help file for ORTE FileM framework.
# #
[orte-filem-rsh:get-file-exists]
WARNING: Could not preload specified file: File already exists.
Fileset: %s
Host: %s
Will continue attempting to launch the process.
[orte-filem-rsh:put-file-not-exist]
WARNING: Could not preload specified file: File does not exist.
Fileset: %s
Host: %s
Will continue attempting to launch the process.
[orte-filem-rsh:remote-get-failed]
WARNING: Remote peer (%s) failed to preload a file.
Exit Status: %d
Local File: %s
Remote File: %s
Command:
%s
Will continue attempting to launch the process(es).

Просмотреть файл

@ -26,16 +26,8 @@ Will continue attempting to launch the process.
[orte-odls-base:could-not-preload] [orte-odls-base:could-not-preload]
WARNING: Could not preload the requested files and directories. WARNING: Could not preload the requested files and directories.
Fileset: %s Binary : %s
Fileset: %s Fileset: %s
Will continue attempting to launch the process. Will continue attempting to launch the process.
[orte-odls-base:preload-file-exists]
WARNING: Could not preload specified file: File already exists.
Fileset: %s
Host: %s
Will continue attempting to launch the process.

Просмотреть файл

@ -1027,7 +1027,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
NULL != apps[i]->preload_files) { NULL != apps[i]->preload_files) {
if( ORTE_SUCCESS != (rc = orte_odls_base_preload_files_app_context(apps[i])) ) { if( ORTE_SUCCESS != (rc = orte_odls_base_preload_files_app_context(apps[i])) ) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto CLEANUP; /* JJH: Do not fail here, instead try to execute without the preloaded options*/
} }
} }
} }

Просмотреть файл

@ -89,7 +89,7 @@ int orte_odls_base_preload_files_app_context(orte_app_context_t* app_context)
"%s) Preload Binary...", "%s) Preload Binary...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if( ORTE_SUCCESS != (ret = orte_odls_base_preload_append_binary(app_context, if( ORTE_SUCCESS != (ret = orte_odls_base_preload_append_binary(app_context,
filem_request) ) ){ filem_request) ) ){
orte_show_help("help-orte-odls-base.txt", orte_show_help("help-orte-odls-base.txt",
"orte-odls-base:could-not-preload-binary", "orte-odls-base:could-not-preload-binary",
true, app_context->app); true, app_context->app);
@ -195,7 +195,6 @@ static int orte_odls_base_preload_append_files(orte_app_context_t* context,
char * local_ref = NULL; char * local_ref = NULL;
int i, remote_argc = 0; int i, remote_argc = 0;
char **remote_targets = NULL; char **remote_targets = NULL;
char * temp = NULL;
orte_filem_base_file_set_t * f_set = NULL; orte_filem_base_file_set_t * f_set = NULL;
remote_targets = opal_argv_split(context->preload_files, ','); remote_targets = opal_argv_split(context->preload_files, ',');
@ -218,26 +217,16 @@ static int orte_odls_base_preload_append_files(orte_app_context_t* context,
if('/' == remote_targets[i][0]) { if('/' == remote_targets[i][0]) {
asprintf(&local_ref, "%s", remote_targets[i]); asprintf(&local_ref, "%s", remote_targets[i]);
} else { } else {
asprintf(&local_ref, "%s/%s", context->cwd, opal_basename(remote_targets[i]) ); asprintf(&local_ref, "%s/%s", context->cwd, remote_targets[i] );
}
/* If this is the HNP, then source = sink, so use the same path for each local and remote */
if( orte_process_info.hnp ) {
free(remote_targets[i]);
remote_targets[i] = strdup(local_ref);
} }
} }
asprintf(&temp, "test -e %s", local_ref);
if(0 == system(temp)) {
char hostname[MAXHOSTNAMELEN];
gethostname(hostname, sizeof(hostname));
orte_show_help("help-orte-odls-base.txt",
"orte-odls-base:preload-file-exists",
true, local_ref, hostname);
free(temp);
temp = NULL;
free(local_ref);
local_ref = NULL;
continue;
}
free(temp);
temp = NULL;
/* /*
* Is this a duplicate * Is this a duplicate
*/ */