665a1e280b
(Someday I'll learn to do this before committing) This commit was SVN r16260. The following SVN revision numbers were found above: r16252 --> open-mpi/ompi@e10f476c87
369 строки
13 KiB
C
369 строки
13 KiB
C
/*
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/orte_constants.h"
|
|
|
|
#ifdef HAVE_SYS_PARAM_H
|
|
#include <sys/param.h>
|
|
#endif
|
|
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/trace.h"
|
|
|
|
#include "orte/util/sys_info.h"
|
|
#include "orte/mca/gpr/gpr.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/mca/smr/smr.h"
|
|
#include "orte/dss/dss.h"
|
|
|
|
#include "opal/util/show_help.h"
|
|
#include "opal/util/basename.h"
|
|
#include "orte/mca/filem/filem.h"
|
|
#include "orte/mca/filem/base/base.h"
|
|
|
|
#include "orte/mca/odls/base/base.h"
|
|
#include "orte/mca/odls/base/odls_private.h"
|
|
|
|
|
|
/*
|
|
* Function for reporting the state and other process-related info
|
|
* for newly spawned child processes
|
|
*/
|
|
int orte_odls_base_report_spawn(opal_list_t *children)
|
|
{
|
|
opal_list_item_t *item;
|
|
orte_odls_child_t *child;
|
|
char **tokens, *segment;
|
|
orte_std_cntr_t num_tokens;
|
|
orte_gpr_addr_mode_t mode = ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND | ORTE_GPR_KEYS_OR;
|
|
orte_data_value_t dval = ORTE_DATA_VALUE_EMPTY;
|
|
orte_buffer_t *buffer;
|
|
int rc;
|
|
|
|
buffer = OBJ_NEW(orte_buffer_t);
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.begin_compound_cmd(buffer))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
OBJ_RELEASE(buffer);
|
|
return rc;
|
|
}
|
|
|
|
for (item = opal_list_get_first(children);
|
|
item != opal_list_get_end(children);
|
|
item = opal_list_get_next(item)) {
|
|
child = (orte_odls_child_t*)item;
|
|
|
|
if (ORTE_PROC_STATE_LAUNCHED == child->state) {
|
|
/* when we launch the child, we need to store the pid
|
|
* in addition to setting the state. Be sure to store
|
|
* the pid first, though, as setting the state can
|
|
* cause triggers to fire
|
|
*/
|
|
if (ORTE_SUCCESS != (rc = orte_schema.get_proc_tokens(&tokens, &num_tokens, child->name))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
OBJ_RELEASE(buffer);
|
|
return rc;
|
|
}
|
|
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, child->name->jobid))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
opal_argv_free(tokens);
|
|
OBJ_RELEASE(buffer);
|
|
return rc;
|
|
}
|
|
if (ORTE_SUCCESS != (rc = orte_dss.set(&dval, (void*)&(child->pid), ORTE_PID))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
opal_argv_free(tokens);
|
|
free(segment);
|
|
OBJ_RELEASE(buffer);
|
|
return rc;
|
|
}
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.put_1(mode, segment, tokens, ORTE_PROC_LOCAL_PID_KEY, &dval))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
opal_argv_free(tokens);
|
|
free(segment);
|
|
OBJ_RELEASE(buffer);
|
|
return rc;
|
|
}
|
|
dval.data = NULL;
|
|
opal_argv_free(tokens);
|
|
free(segment);
|
|
|
|
/* now set the process state to LAUNCHED */
|
|
if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_LAUNCHED, 0))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
OBJ_RELEASE(buffer);
|
|
return rc;
|
|
}
|
|
} else if (ORTE_PROC_STATE_FAILED_TO_START == child->state) {
|
|
if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_FAILED_TO_START, child->exit_code))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
OBJ_RELEASE(buffer);
|
|
return rc;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.exec_compound_cmd(buffer))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
OBJ_RELEASE(buffer);
|
|
return rc;
|
|
}
|
|
OBJ_RELEASE(buffer);
|
|
|
|
/* All done */
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* Preload all files for a single app context
|
|
*/
|
|
static int orte_odls_base_preload_append_binary(orte_app_context_t* context,
|
|
orte_filem_base_request_t *filem_request);
|
|
static int orte_odls_base_preload_append_files(orte_app_context_t* context,
|
|
orte_filem_base_request_t *filem_request);
|
|
static bool orte_odls_base_is_preload_local_dup(char *local_ref,
|
|
orte_filem_base_request_t *filem_request);
|
|
|
|
int orte_odls_base_preload_files_app_context(orte_app_context_t* app_context)
|
|
{
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
orte_filem_base_request_t *filem_request;
|
|
orte_filem_base_process_set_t *p_set = NULL;
|
|
|
|
/* Sanity Check - Make sure there are files to preload */
|
|
if(!app_context->preload_binary &&
|
|
NULL == app_context->preload_files) {
|
|
return exit_status;
|
|
}
|
|
|
|
filem_request = OBJ_NEW(orte_filem_base_request_t);
|
|
|
|
/* Define the process set */
|
|
p_set = OBJ_NEW(orte_filem_base_process_set_t);
|
|
p_set->source.jobid = orte_process_info.gpr_replica->jobid;
|
|
p_set->source.vpid = orte_process_info.gpr_replica->vpid;
|
|
p_set->sink.jobid = orte_process_info.my_name->jobid;
|
|
p_set->sink.vpid = orte_process_info.my_name->vpid;
|
|
|
|
opal_list_append(&(filem_request->process_sets), &(p_set->super) );
|
|
|
|
if(app_context->preload_binary) {
|
|
OPAL_OUTPUT_VERBOSE((1, orte_odls_globals.output,
|
|
"%s) Preload Binary...",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
if( ORTE_SUCCESS != (ret = orte_odls_base_preload_append_binary(app_context,
|
|
filem_request) ) ){
|
|
opal_show_help("help-orte-odls-base.txt",
|
|
"orte-odls-base:could-not-preload-binary",
|
|
true, app_context->app);
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
/* Keep accumulating files anyway */
|
|
}
|
|
}
|
|
if( NULL != app_context->preload_files) {
|
|
OPAL_OUTPUT_VERBOSE((1, orte_odls_globals.output,
|
|
"%s) Preload Files... [%s]",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
app_context->preload_files));
|
|
if( ORTE_SUCCESS != (ret = orte_odls_base_preload_append_files(app_context,
|
|
filem_request) ) ){
|
|
opal_show_help("help-orte-odls-base.txt",
|
|
"orte-odls-base:could-not-preload-files",
|
|
true, app_context->preload_files);
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
/* Keep accumulating files anyway */
|
|
}
|
|
}
|
|
/* Actually bring over the files - One app context at a time
|
|
* JJH: This could be improved for multiple app contexts by making
|
|
* this a non-blocking filem get and then waiting on all of
|
|
* the requests for all app contexts.
|
|
*/
|
|
if( ORTE_SUCCESS != (ret = orte_filem.get(filem_request)) ) {
|
|
opal_show_help("help-orte-odls-base.txt",
|
|
"orte-odls-base:could-not-preload",
|
|
true,
|
|
(app_context->preload_binary ? app_context->app : ""),
|
|
(NULL != app_context->preload_files ? app_context->preload_files : ""));
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
cleanup:
|
|
if( NULL != filem_request ) {
|
|
OBJ_RELEASE(filem_request);
|
|
filem_request = NULL;
|
|
}
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
/*
|
|
* The difference between preloading a file, and a binary file is that
|
|
* we may need to update the app_context to reflect the placement of the binary file
|
|
* on the local machine.
|
|
*/
|
|
static int orte_odls_base_preload_append_binary(orte_app_context_t* context,
|
|
orte_filem_base_request_t *filem_request) {
|
|
char * local_bin = NULL;
|
|
orte_filem_base_file_set_t * f_set = NULL;
|
|
|
|
f_set = OBJ_NEW(orte_filem_base_file_set_t);
|
|
|
|
/* Local Placement */
|
|
asprintf(&local_bin, "%s/%s", orte_process_info.job_session_dir, opal_basename(context->app));
|
|
if(orte_odls_base_is_preload_local_dup(local_bin, filem_request) ) {
|
|
goto cleanup;
|
|
}
|
|
f_set->local_target = strdup(local_bin);
|
|
|
|
/* Remote reference */
|
|
f_set->remote_target = strdup(context->app);
|
|
|
|
/* Flag as a single file */
|
|
f_set->target_flag = ORTE_FILEM_TYPE_FILE;
|
|
|
|
/* Add to the request list */
|
|
opal_list_append(&(filem_request->file_sets), &(f_set->super) );
|
|
|
|
cleanup:
|
|
/*
|
|
* Adjust the process name to point to the new local version
|
|
*/
|
|
if( NULL != local_bin ) {
|
|
if(NULL != context->app) {
|
|
free(context->app);
|
|
context->app = NULL;
|
|
}
|
|
|
|
context->app = strdup(local_bin);
|
|
free(local_bin);
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
|
|
static int orte_odls_base_preload_append_files(orte_app_context_t* context,
|
|
orte_filem_base_request_t *filem_request) {
|
|
char * local_ref = NULL;
|
|
int i, remote_argc = 0;
|
|
char **remote_targets = NULL;
|
|
char * temp = NULL;
|
|
orte_filem_base_file_set_t * f_set = NULL;
|
|
|
|
remote_targets = opal_argv_split(context->preload_files, ',');
|
|
remote_argc = opal_argv_count(remote_targets);
|
|
|
|
for(i = 0; i < remote_argc; ++i) {
|
|
if(NULL != context->preload_files_dest_dir) {
|
|
if(context->preload_files_dest_dir[0] == '.') {
|
|
asprintf(&local_ref, "%s/%s/%s", context->cwd, context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
|
|
}
|
|
else {
|
|
asprintf(&local_ref, "%s/%s", context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
|
|
}
|
|
}
|
|
else {
|
|
/*
|
|
* If the preload_files_dest_dir is not specified
|
|
* If this is an absolute path, copy it to that path. Otherwise copy it to the cwd.
|
|
*/
|
|
if('/' == remote_targets[i][0]) {
|
|
asprintf(&local_ref, "%s", remote_targets[i]);
|
|
} else {
|
|
asprintf(&local_ref, "%s/%s", context->cwd, opal_basename(remote_targets[i]) );
|
|
}
|
|
}
|
|
|
|
asprintf(&temp, "test -e %s", local_ref);
|
|
if(0 == system(temp)) {
|
|
char hostname[MAXHOSTNAMELEN];
|
|
gethostname(hostname, sizeof(hostname));
|
|
opal_show_help("help-orte-odls-base.txt",
|
|
"orte-odls-base:preload-file-exists",
|
|
true, local_ref, hostname);
|
|
free(temp);
|
|
temp = NULL;
|
|
free(local_ref);
|
|
local_ref = NULL;
|
|
continue;
|
|
}
|
|
free(temp);
|
|
temp = NULL;
|
|
|
|
/*
|
|
* Is this a duplicate
|
|
*/
|
|
if(orte_odls_base_is_preload_local_dup(local_ref, filem_request) ) {
|
|
free(local_ref);
|
|
local_ref = NULL;
|
|
continue;
|
|
}
|
|
|
|
f_set = OBJ_NEW(orte_filem_base_file_set_t);
|
|
|
|
/* Local Placement */
|
|
f_set->local_target = strdup(local_ref);
|
|
|
|
/* Remote reference */
|
|
f_set->remote_target = strdup(remote_targets[i]);
|
|
|
|
/* Flag as unknown, let FileM figure it out */
|
|
f_set->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
|
|
|
|
/* Add to the request list */
|
|
opal_list_append(&(filem_request->file_sets), &(f_set->super) );
|
|
|
|
free(local_ref);
|
|
local_ref = NULL;
|
|
}
|
|
|
|
if(NULL != local_ref)
|
|
free(local_ref);
|
|
if(NULL != remote_targets)
|
|
opal_argv_free(remote_targets);
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* Keeps us from transfering the same file more than once.
|
|
*/
|
|
static bool orte_odls_base_is_preload_local_dup(char *local_ref,
|
|
orte_filem_base_request_t *filem_request) {
|
|
opal_list_item_t *item = NULL;
|
|
|
|
for (item = opal_list_get_first( &filem_request->file_sets);
|
|
item != opal_list_get_end( &filem_request->file_sets);
|
|
item = opal_list_get_next( item) ) {
|
|
orte_filem_base_file_set_t * f_set = (orte_filem_base_file_set_t*)item;
|
|
|
|
if(0 == strncmp(local_ref, f_set->local_target, strlen(local_ref)+1) ) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|