Fix slave spawn, which was hanging because the local daemon never saw the slave job report - a slave job doesn't report in the normal way, and so the slave launch system itself has to "fake" the report.
Also complete the implementation that prints out app_context objects so that we see all the fields. This commit was SVN r21408.
Этот коммит содержится в:
родитель
f24cefe3d2
Коммит
70b8c89b44
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -43,6 +43,9 @@
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/if.h"
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
@ -251,12 +254,22 @@ int orte_plm_base_local_slave_launch(orte_job_t *jdata)
|
||||
exec_path, strerror(errno), errno);
|
||||
exit(-1);
|
||||
} else {
|
||||
/* parent waits to hear that slave is running */
|
||||
ack_recvd = false;
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
|
||||
ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
|
||||
/* if it is an orte-job, then parent waits to hear that slave is running. if
|
||||
* it isn't an orte-job, then we can't wait because we would never hear
|
||||
* anything!
|
||||
*/
|
||||
if (!(jdata->controls & ORTE_JOB_CONTROL_NON_ORTE_JOB)) {
|
||||
ack_recvd = false;
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
|
||||
ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
|
||||
|
||||
ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1);
|
||||
/* to release this job from the wait in plm_base_receive, we have to
|
||||
* flag it as having reported
|
||||
*/
|
||||
jdata->num_reported = jdata->num_procs;
|
||||
}
|
||||
|
||||
ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1);
|
||||
/* cleanup */
|
||||
free(exec_path);
|
||||
opal_argv_free(argv);
|
||||
@ -380,6 +393,9 @@ void orte_plm_base_local_slave_finalize(void)
|
||||
cmd = opal_argv_join(argv, ' ');
|
||||
opal_argv_free(argv);
|
||||
argv = NULL;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:local:slave:finalize - removing files with cmd:\n\t%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cmd));
|
||||
system(cmd);
|
||||
free(cmd);
|
||||
/* now remove the bootproxy itself, if needed */
|
||||
@ -401,6 +417,9 @@ void orte_plm_base_local_slave_finalize(void)
|
||||
argv = NULL;
|
||||
}
|
||||
/* execute it */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:local:slave:finalize - removing bootproxy with cmd:\n\t%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cmd));
|
||||
system(cmd);
|
||||
free(cmd);
|
||||
}
|
||||
@ -733,7 +752,12 @@ int orte_plm_base_setup_rsh_launch(char *nodename, orte_app_context_t *app,
|
||||
/* save the bootproxy cmd */
|
||||
slave_node->bootproxy = strdup(rcmd);
|
||||
/* is this a local operation? */
|
||||
if (0 == strcmp(orte_process_info.nodename, nodename)) {
|
||||
if (0 == strcmp(orte_process_info.nodename, nodename) ||
|
||||
0 == strcmp(nodename, "localhost") ||
|
||||
opal_ifislocal(nodename)) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:local:slave: node %s is local",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename));
|
||||
slave_node->local = true;
|
||||
/* use the prefix, if given */
|
||||
if (NULL != app->prefix_dir) {
|
||||
@ -744,7 +768,13 @@ int orte_plm_base_setup_rsh_launch(char *nodename, orte_app_context_t *app,
|
||||
}
|
||||
/* no need to preposition the remote cmd, and no need to remove it */
|
||||
slave_node->positioned = false;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:local:slave: setting prefix to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), slave_node->prefix));
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:local:slave: node %s is remote",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename));
|
||||
/* setup the correct shell info */
|
||||
if (ORTE_SUCCESS != (rc = setup_shell(&rshell, &lshell,
|
||||
nodename, &tmpargv))) {
|
||||
@ -879,6 +909,9 @@ int orte_plm_base_setup_rsh_launch(char *nodename, orte_app_context_t *app,
|
||||
/* put everything in /tmp */
|
||||
dest_dir = "/tmp";
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:local:slave: destination dir set to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), dest_dir));
|
||||
|
||||
/* setup the exec_path to the bootproxy */
|
||||
if (slave_node->local) {
|
||||
@ -904,6 +937,9 @@ int orte_plm_base_setup_rsh_launch(char *nodename, orte_app_context_t *app,
|
||||
opal_argv_append_nosize(argv, tmp);
|
||||
free(tmp);
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:local:slave: exec_path set to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), *exec_path));
|
||||
|
||||
/* do we need to preload the binary? */
|
||||
if (app->preload_binary) {
|
||||
@ -1109,6 +1145,13 @@ int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
|
||||
char *param, *path, *tmp, *cmd, *basename, *dest_dir;
|
||||
int i;
|
||||
|
||||
/* if a prefix is set, pass it to the bootproxy in a special way */
|
||||
if (NULL != app->prefix_dir) {
|
||||
asprintf(&param, "OMPI_PREFIX=%s", app->prefix_dir);
|
||||
opal_argv_append_nosize(argv, param);
|
||||
free(param);
|
||||
}
|
||||
|
||||
/* if there is a working directory specified, add it in a special
|
||||
* way so the bootproxy can deal with it
|
||||
*/
|
||||
@ -1262,8 +1305,20 @@ int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
|
||||
opal_argv_append_nosize(argv, path);
|
||||
free(path);
|
||||
} else {
|
||||
/* it must already have been put there, so use the given path */
|
||||
opal_argv_append_nosize(argv, app->app);
|
||||
/* it must already have been put there - if the given
|
||||
* path was absolute, just use it
|
||||
*/
|
||||
if (opal_path_is_absolute(app->app)) {
|
||||
opal_argv_append_nosize(argv, app->app);
|
||||
} else if (NULL != app->cwd) {
|
||||
/* prepend the cwd, if provided */
|
||||
param = opal_os_path(false, app->cwd, app->app, NULL);
|
||||
opal_argv_append_nosize(argv, param);
|
||||
free(param);
|
||||
} else {
|
||||
/* just do your best, i guess */
|
||||
opal_argv_append_nosize(argv, app->app);
|
||||
}
|
||||
}
|
||||
|
||||
/* add any provided argv */
|
||||
@ -1271,6 +1326,14 @@ int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
|
||||
opal_argv_append_nosize(argv, app->argv[i]);
|
||||
}
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
param = opal_argv_join(*argv, ' ');
|
||||
opal_output(0, "%s plm:base:append_bootproxy_args: final argv:\n\t%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == param) ? "NULL" : param);
|
||||
if (NULL != param) free(param);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -539,7 +539,9 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s
|
||||
tmp = tmp2;
|
||||
}
|
||||
|
||||
asprintf(&tmp2, "%s\n%s\tWorking dir: %s (user: %d)\n%s\tHostfile: %s\tAdd-Hostfile: %s", tmp, pfx2, src->cwd, (int) src->user_specified_cwd,
|
||||
asprintf(&tmp2, "%s\n%s\tWorking dir: %s (user: %d)\n%s\tPrefix: %s\n%s\tHostfile: %s\tAdd-Hostfile: %s", tmp,
|
||||
pfx2, (NULL == src->cwd) ? "NULL" : src->cwd, (int) src->user_specified_cwd,
|
||||
pfx2, (NULL == src->prefix_dir) ? "NULL" : src->prefix_dir,
|
||||
pfx2, (NULL == src->hostfile) ? "NULL" : src->hostfile,
|
||||
(NULL == src->add_hostfile) ? "NULL" : src->add_hostfile);
|
||||
free(tmp);
|
||||
@ -552,6 +554,13 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s
|
||||
tmp = tmp2;
|
||||
}
|
||||
|
||||
asprintf(&tmp2, "%s\n%s\tPreload binary: %s\tUsed on node: %s\n%s\tPreload files dest: %s\n%s\tPreload files src dir: %s", tmp,
|
||||
pfx2, (src->preload_binary) ? "TRUE" : "FALSE", (src->used_on_node) ? "TRUE" : "FALSE",
|
||||
pfx2, (NULL == src->preload_files_dest_dir) ? "NULL" : src->preload_files_dest_dir,
|
||||
pfx2, (NULL == src->preload_files_src_dir) ? "NULL" : src->preload_files_src_dir);
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
|
||||
/* set the return */
|
||||
*output = tmp;
|
||||
|
||||
|
@ -11,18 +11,21 @@ int main(int argc, char* argv[])
|
||||
MPI_Info info;
|
||||
int rank, size;
|
||||
pid_t pid;
|
||||
char *host, *app, *rdir, *prefix;
|
||||
char *host, *app, *rdir=NULL, *prefix;
|
||||
char cwd[256];
|
||||
|
||||
if (argc < 5) {
|
||||
printf("Usage: slave_spawn host prefix-for-host abs-path-to-exe remote-tmp-dir <files-to-move>\n");
|
||||
if (argc < 3) {
|
||||
printf("Usage: slave_spawn host prefix-for-host <remote-tmp-dir> <files-to-move>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
host = argv[1];
|
||||
prefix = argv[2];
|
||||
app = argv[3];
|
||||
rdir = argv[4];
|
||||
app = "slave";
|
||||
|
||||
if (5 == argc) {
|
||||
rdir = argv[4];
|
||||
}
|
||||
|
||||
pid = getpid();
|
||||
printf("Slave_spawn [pid %ld] starting up!\n", (long)pid);
|
||||
@ -34,8 +37,11 @@ int main(int argc, char* argv[])
|
||||
MPI_Info_set(info, "host", host);
|
||||
MPI_Info_set(info, "ompi_prefix", prefix);
|
||||
MPI_Info_set(info, "ompi_local_slave", "true");
|
||||
MPI_Info_set(info, "ompi_preload_binary", "true");
|
||||
MPI_Info_set(info, "ompi_preload_files_dest_dir", rdir);
|
||||
|
||||
if (NULL != rdir) {
|
||||
MPI_Info_set(info, "ompi_preload_binary", "true");
|
||||
MPI_Info_set(info, "ompi_preload_files_dest_dir", rdir);
|
||||
}
|
||||
|
||||
if (argc == 6) {
|
||||
/* files were specified */
|
||||
|
@ -1,6 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (c) 2009 Los Alamos National Security, LLC. All rights reserved
|
||||
# Copyright (c) 2009 Los Alamos National Security, LLC. All rights reserved
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
|
||||
if (( $# < 1 )) ; then
|
||||
@ -86,4 +87,5 @@ app=$1
|
||||
shift 1
|
||||
|
||||
#exec the app with the remaining args
|
||||
#echo "executing" "$app"
|
||||
exec "$app" "$@"
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user