From 04c69b8a820daa8ba7e965c3d2069ddfb9f96b01 Mon Sep 17 00:00:00 2001 From: Josh Hursey Date: Fri, 23 Jan 2009 15:32:24 +0000 Subject: [PATCH] Fixes for --preload-files and --preload-binary. * Improved the error propagation from a backend orted * Fixed a hang in orterun due to failed file transfers * Fix the movement of files with relative path names * Improved error messages when a file cannot be moved * Move file checks to FileM instead of embedding them in the ODLS This commit Refs trac:1770 This commit was SVN r20331. The following Trac tickets were found above: Ticket 1770 --> https://svn.open-mpi.org/trac/ompi/ticket/1770 --- orte/mca/filem/rsh/filem_rsh_module.c | 157 +++++++++++++++++++-- orte/mca/filem/rsh/help-orte-filem-rsh.txt | 26 ++++ orte/mca/odls/base/help-orte-odls-base.txt | 10 +- orte/mca/odls/base/odls_base_default_fns.c | 2 +- orte/mca/odls/base/odls_base_state.c | 27 ++-- 5 files changed, 185 insertions(+), 37 deletions(-) diff --git a/orte/mca/filem/rsh/filem_rsh_module.c b/orte/mca/filem/rsh/filem_rsh_module.c index f0631de697..861257fca2 100644 --- a/orte/mca/filem/rsh/filem_rsh_module.c +++ b/orte/mca/filem/rsh/filem_rsh_module.c @@ -48,6 +48,7 @@ #include "orte/util/proc_info.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_wait.h" +#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/filem/filem.h" #include "orte/mca/filem/base/base.h" @@ -84,7 +85,11 @@ static void orte_filem_rsh_permission_callback(int status, orte_rml_tag_t tag, void* cbdata); static int orte_filem_rsh_permission_ask(orte_process_name_t* sender, int num_sends); -static int permission_send_done(orte_process_name_t* sender, int num_avail); +static int permission_send_done(orte_process_name_t* sender, int num_avail, + int32_t exit_status, + char * local_target, + char * remote_target, + char * command); static int permission_send_num_allowed(orte_process_name_t* sender, int num_allowed); @@ -260,9 +265,11 @@ int 
orte_filem_rsh_module_finalize(void) /* * Make sure all active requests are completed */ +#if 0 while(0 < opal_list_get_size(&work_pool_active) ) { ; /* JJH TODO... */ } +#endif /* * Stop the listeners @@ -307,7 +314,7 @@ int orte_filem_rsh_put(orte_filem_base_request_t *request) if( ORTE_SUCCESS != (ret = orte_filem_base_prepare_request(request, ORTE_FILEM_MOVE_TYPE_PUT) ) ) { opal_output(mca_filem_rsh_component.super.output_handle, - "filem:rsh: put(): Failed to preare the request structure (%d)", ret); + "filem:rsh: put(): Failed to prepare the request structure (%d)", ret); return ret; } @@ -332,7 +339,7 @@ int orte_filem_rsh_put_nb(orte_filem_base_request_t *request) if( ORTE_SUCCESS != (ret = orte_filem_base_prepare_request(request, ORTE_FILEM_MOVE_TYPE_PUT) ) ) { opal_output(mca_filem_rsh_component.super.output_handle, - "filem:rsh: put(): Failed to preare the request structure (%d)", ret); + "filem:rsh: put(): Failed to prepare the request structure (%d)", ret); return ret; } @@ -351,7 +358,7 @@ int orte_filem_rsh_get(orte_filem_base_request_t *request) if( ORTE_SUCCESS != (ret = orte_filem_base_prepare_request(request, ORTE_FILEM_MOVE_TYPE_GET) ) ) { opal_output(mca_filem_rsh_component.super.output_handle, - "filem:rsh: get(): Failed to preare the request structure (%d)", ret); + "filem:rsh: get(): Failed to prepare the request structure (%d)", ret); return ret; } @@ -376,7 +383,7 @@ int orte_filem_rsh_get_nb(orte_filem_base_request_t *request) if( ORTE_SUCCESS != (ret = orte_filem_base_prepare_request(request, ORTE_FILEM_MOVE_TYPE_GET) ) ) { opal_output(mca_filem_rsh_component.super.output_handle, - "filem:rsh: get(): Failed to preare the request structure (%d)", ret); + "filem:rsh: get(): Failed to prepare the request structure (%d)", ret); return ret; } @@ -504,7 +511,10 @@ int orte_filem_rsh_wait(orte_filem_base_request_t *request) request->is_active[i] = true; /* Tell peer we are finished with a send */ - 
permission_send_done(&(wp_item->proc_set.source), 1); + permission_send_done(&(wp_item->proc_set.source), 1, request->exit_status[i], + wp_item->file_set.local_target, + wp_item->file_set.remote_target, + wp_item->command); OBJ_RELEASE(wp_item); wp_item = NULL; @@ -583,6 +593,64 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) { p_item = opal_list_get_next( p_item) ) { orte_filem_base_process_set_t * p_set = (orte_filem_base_process_set_t*)p_item; + + /* + * If the source and sink are the same, then this is a local operation + * Further if the files are the same, then nothing to do + */ + if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &p_set->source, &p_set->sink) ) { + if( 0 == strncmp(f_set->local_target, f_set->remote_target, strlen(f_set->remote_target) ) ) { + request->is_done[cur_index] = true; + request->is_active[cur_index] = true; + request->exit_status[cur_index] = 0; + goto continue_set; + } + } + + if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) { + /* + * The file should exist if we are going to put it somewhere else + */ + if( 0 != access(f_set->local_target, R_OK) ) { + OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle, + "filem:rsh: copy(): %s -> %s: Error: Cannot move file %s to %s. 
Does not exist at source\n", + ORTE_NAME_PRINT(&p_set->source), + ORTE_NAME_PRINT(&p_set->sink), + f_set->local_target, + f_set->remote_target)); + orte_show_help("help-orte-filem-rsh.txt", + "orte-filem-rsh:get-file-not-exist", + true, f_set->local_target, orte_process_info.nodename); + request->is_done[cur_index] = true; + request->is_active[cur_index] = true; + request->exit_status[cur_index] = -1; + goto continue_set; + } + } + /* Do not check a local get() operation, to help suppress the warnings from the HNP */ + else if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &p_set->source, &p_set->sink) ) { + /* + * The file should not exist if we are getting a file with the + * same name since we do not want to overwrite the filename + * without the user's consent. + */ + if( 0 == access(f_set->local_target, R_OK) ) { + OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle, + "filem:rsh: copy(): %s -> %s: Error: Cannot move file %s to %s. Already exists at destination\n", + ORTE_NAME_PRINT(&p_set->source), + ORTE_NAME_PRINT(&p_set->sink), + f_set->remote_target, + f_set->local_target)); + orte_show_help("help-orte-filem-rsh.txt", + "orte-filem-rsh:get-file-exists", + true, f_set->local_target, orte_process_info.nodename); + request->is_done[cur_index] = true; + request->is_active[cur_index] = true; + request->exit_status[cur_index] = -1; + goto continue_set; + } + } + if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) { OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle, "filem:rsh: copy(): %s -> %s: Moving file %s to %s\n", @@ -650,7 +718,11 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) { } else if(ORTE_FILEM_TYPE_UNKNOWN == f_set->target_flag) { opal_output(mca_filem_rsh_component.super.output_handle, - "filem:rsh: copy(): Error: File type unknown"); + "filem:rsh: copy(): Error: File type unknown (%s)", + f_set->remote_target); + request->is_done[cur_index] = true; + 
request->is_active[cur_index] = true; + request->exit_status[cur_index] = -1; goto continue_set; } else { @@ -1086,6 +1158,7 @@ static void orte_filem_rsh_permission_callback(int status, orte_std_cntr_t n; int num_req, num_allowed = 0; int perm_flag, i; + int32_t peer_status = 0; OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle, "filem:rsh: permission_callback(? ?): Peer %s ...", @@ -1218,6 +1291,48 @@ static void orte_filem_rsh_permission_callback(int status, cur_num_incomming -= num_req; + /* + * Receive the exit status + */ + n = 1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &peer_status, &n, OPAL_INT32))) { + goto cleanup; + } + + if( peer_status != 0 ) { + char * local_target = NULL; + char * remote_target = NULL; + char * remote_cmd = NULL; + + n = 1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &local_target, &n, OPAL_STRING))) { + ORTE_ERROR_LOG(ret); + goto cleanup; + } + + n = 1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &remote_target, &n, OPAL_STRING))) { + ORTE_ERROR_LOG(ret); + goto cleanup; + } + + n = 1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &remote_cmd, &n, OPAL_STRING))) { + ORTE_ERROR_LOG(ret); + goto cleanup; + } + + orte_show_help("help-orte-filem-rsh.txt", + "orte-filem-rsh:remote-get-failed", + true, ORTE_NAME_PRINT(sender), peer_status, + local_target, + remote_target, + remote_cmd); + free(local_target); + free(remote_target); + free(remote_cmd); + } + /* * For each open slot, notify a waiting peer that it may send */ @@ -1271,7 +1386,11 @@ static int orte_filem_rsh_permission_ask(orte_process_name_t* source, return exit_status; } -static int permission_send_done(orte_process_name_t* peer, int num_avail) { +static int permission_send_done(orte_process_name_t* peer, int num_avail, + int32_t status, + char * local_target, + char * remote_target, + char * command) { int ret, exit_status = ORTE_SUCCESS; opal_buffer_t loc_buffer; int perm_flag = ORTE_FILEM_RSH_DONE; @@ -1288,6 
+1407,28 @@ static int permission_send_done(orte_process_name_t* peer, int num_avail) { goto cleanup; } + if (ORTE_SUCCESS != (ret = opal_dss.pack(&loc_buffer, &status, 1, OPAL_INT32))) { + exit_status = ret; + goto cleanup; + } + + if( status != 0 ) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(&loc_buffer, &local_target, 1, OPAL_STRING))) { + exit_status = ret; + goto cleanup; + } + + if (ORTE_SUCCESS != (ret = opal_dss.pack(&loc_buffer, &remote_target, 1, OPAL_STRING))) { + exit_status = ret; + goto cleanup; + } + + if (ORTE_SUCCESS != (ret = opal_dss.pack(&loc_buffer, &command, 1, OPAL_STRING))) { + exit_status = ret; + goto cleanup; + } + } + if (0 > (ret = orte_rml.send_buffer(peer, &loc_buffer, ORTE_RML_TAG_FILEM_RSH, 0))) { exit_status = ret; goto cleanup; diff --git a/orte/mca/filem/rsh/help-orte-filem-rsh.txt b/orte/mca/filem/rsh/help-orte-filem-rsh.txt index 034bf21852..905e1166d9 100644 --- a/orte/mca/filem/rsh/help-orte-filem-rsh.txt +++ b/orte/mca/filem/rsh/help-orte-filem-rsh.txt @@ -18,3 +18,29 @@ # # This is the US/English general help file for ORTE FileM framework. # +[orte-filem-rsh:get-file-exists] +WARNING: Could not preload specified file: File already exists. + +Fileset: %s +Host: %s + +Will continue attempting to launch the process. + +[orte-filem-rsh:get-file-not-exist] +WARNING: Could not preload specified file: File does not exist. + +Fileset: %s +Host: %s + +Will continue attempting to launch the process. + +[orte-filem-rsh:remote-get-failed] +WARNING: Remote peer (%s) failed to preload a file. + +Exit Status: %d +Local File: %s +Remote File: %s +Command: + %s + +Will continue attempting to launch the process(es). diff --git a/orte/mca/odls/base/help-orte-odls-base.txt b/orte/mca/odls/base/help-orte-odls-base.txt index d062eb30a1..582e5ab859 100644 --- a/orte/mca/odls/base/help-orte-odls-base.txt +++ b/orte/mca/odls/base/help-orte-odls-base.txt @@ -26,16 +26,8 @@ Will continue attempting to launch the process. 
[orte-odls-base:could-not-preload] WARNING: Could not preload the requested files and directories. -Fileset: %s +Binary : %s Fileset: %s Will continue attempting to launch the process. -[orte-odls-base:preload-file-exists] -WARNING: Could not preload specified file: File already exists. - -Fileset: %s -Host: %s - -Will continue attempting to launch the process. - diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index ff8dca92b2..c6ca1c70eb 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -1027,7 +1027,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, NULL != apps[i]->preload_files) { if( ORTE_SUCCESS != (rc = orte_odls_base_preload_files_app_context(apps[i])) ) { ORTE_ERROR_LOG(rc); - goto CLEANUP; + /* JJH: Do not fail here, instead try to execute without the preloaded options*/ } } } diff --git a/orte/mca/odls/base/odls_base_state.c b/orte/mca/odls/base/odls_base_state.c index 45e3f0fffd..ac0a2c63e7 100644 --- a/orte/mca/odls/base/odls_base_state.c +++ b/orte/mca/odls/base/odls_base_state.c @@ -89,7 +89,7 @@ int orte_odls_base_preload_files_app_context(orte_app_context_t* app_context) "%s) Preload Binary...", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if( ORTE_SUCCESS != (ret = orte_odls_base_preload_append_binary(app_context, - filem_request) ) ){ + filem_request) ) ){ orte_show_help("help-orte-odls-base.txt", "orte-odls-base:could-not-preload-binary", true, app_context->app); @@ -195,7 +195,6 @@ static int orte_odls_base_preload_append_files(orte_app_context_t* context, char * local_ref = NULL; int i, remote_argc = 0; char **remote_targets = NULL; - char * temp = NULL; orte_filem_base_file_set_t * f_set = NULL; remote_targets = opal_argv_split(context->preload_files, ','); @@ -218,26 +217,16 @@ static int orte_odls_base_preload_append_files(orte_app_context_t* context, if('/' == remote_targets[i][0]) { asprintf(&local_ref, "%s", 
remote_targets[i]); } else { - asprintf(&local_ref, "%s/%s", context->cwd, opal_basename(remote_targets[i]) ); + asprintf(&local_ref, "%s/%s", context->cwd, remote_targets[i] ); + } + + /* If this is the HNP, then source = sink, so use the same path for each local and remote */ + if( orte_process_info.hnp ) { + free(remote_targets[i]); + remote_targets[i] = strdup(local_ref); } } - asprintf(&temp, "test -e %s", local_ref); - if(0 == system(temp)) { - char hostname[MAXHOSTNAMELEN]; - gethostname(hostname, sizeof(hostname)); - orte_show_help("help-orte-odls-base.txt", - "orte-odls-base:preload-file-exists", - true, local_ref, hostname); - free(temp); - temp = NULL; - free(local_ref); - local_ref = NULL; - continue; - } - free(temp); - temp = NULL; - /* * Is this a duplicate */