1
1

Fix slave spawn, which was hanging because the local daemon never saw the slave job report: a slave job does not report in the normal way, so the slave launch system itself has to "fake it".

Also complete the implementation that prints out app_context objects so that all of the fields are shown.

This commit was SVN r21408.
Этот коммит содержится в:
Ralph Castain 2009-06-10 19:01:08 +00:00
родитель f24cefe3d2
Коммит 70b8c89b44
4 изменённых файлов: 98 добавлений и 18 удалений

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* $COPYRIGHT$
*
@ -43,6 +43,9 @@
#include "opal/util/argv.h"
#include "opal/util/basename.h"
#include "opal/util/opal_environ.h"
#include "opal/util/if.h"
#include "opal/dss/dss.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
@ -251,12 +254,22 @@ int orte_plm_base_local_slave_launch(orte_job_t *jdata)
exec_path, strerror(errno), errno);
exit(-1);
} else {
/* parent waits to hear that slave is running */
ack_recvd = false;
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
/* if it is an orte-job, then parent waits to hear that slave is running. if
* it isn't an orte-job, then we can't wait because we would never hear
* anything!
*/
if (!(jdata->controls & ORTE_JOB_CONTROL_NON_ORTE_JOB)) {
ack_recvd = false;
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1);
/* to release this job from the wait in plm_base_receive, we have to
* flag it as having reported
*/
jdata->num_reported = jdata->num_procs;
}
ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1);
/* cleanup */
free(exec_path);
opal_argv_free(argv);
@ -380,6 +393,9 @@ void orte_plm_base_local_slave_finalize(void)
cmd = opal_argv_join(argv, ' ');
opal_argv_free(argv);
argv = NULL;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:local:slave:finalize - removing files with cmd:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cmd));
system(cmd);
free(cmd);
/* now remove the bootproxy itself, if needed */
@ -401,6 +417,9 @@ void orte_plm_base_local_slave_finalize(void)
argv = NULL;
}
/* execute it */
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:local:slave:finalize - removing bootproxy with cmd:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cmd));
system(cmd);
free(cmd);
}
@ -733,7 +752,12 @@ int orte_plm_base_setup_rsh_launch(char *nodename, orte_app_context_t *app,
/* save the bootproxy cmd */
slave_node->bootproxy = strdup(rcmd);
/* is this a local operation? */
if (0 == strcmp(orte_process_info.nodename, nodename)) {
if (0 == strcmp(orte_process_info.nodename, nodename) ||
0 == strcmp(nodename, "localhost") ||
opal_ifislocal(nodename)) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:local:slave: node %s is local",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename));
slave_node->local = true;
/* use the prefix, if given */
if (NULL != app->prefix_dir) {
@ -744,7 +768,13 @@ int orte_plm_base_setup_rsh_launch(char *nodename, orte_app_context_t *app,
}
/* no need to preposition the remote cmd, and no need to remove it */
slave_node->positioned = false;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:local:slave: setting prefix to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), slave_node->prefix));
} else {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:local:slave: node %s is remote",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename));
/* setup the correct shell info */
if (ORTE_SUCCESS != (rc = setup_shell(&rshell, &lshell,
nodename, &tmpargv))) {
@ -879,6 +909,9 @@ int orte_plm_base_setup_rsh_launch(char *nodename, orte_app_context_t *app,
/* put everything in /tmp */
dest_dir = "/tmp";
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:local:slave: destination dir set to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), dest_dir));
/* setup the exec_path to the bootproxy */
if (slave_node->local) {
@ -904,6 +937,9 @@ int orte_plm_base_setup_rsh_launch(char *nodename, orte_app_context_t *app,
opal_argv_append_nosize(argv, tmp);
free(tmp);
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:local:slave: exec_path set to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), *exec_path));
/* do we need to preload the binary? */
if (app->preload_binary) {
@ -1109,6 +1145,13 @@ int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
char *param, *path, *tmp, *cmd, *basename, *dest_dir;
int i;
/* if a prefix is set, pass it to the bootproxy in a special way */
if (NULL != app->prefix_dir) {
asprintf(&param, "OMPI_PREFIX=%s", app->prefix_dir);
opal_argv_append_nosize(argv, param);
free(param);
}
/* if there is a working directory specified, add it in a special
* way so the bootproxy can deal with it
*/
@ -1262,8 +1305,20 @@ int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
opal_argv_append_nosize(argv, path);
free(path);
} else {
/* it must already have been put there, so use the given path */
opal_argv_append_nosize(argv, app->app);
/* it must already have been put there - if the given
* path was absolute, just use it
*/
if (opal_path_is_absolute(app->app)) {
opal_argv_append_nosize(argv, app->app);
} else if (NULL != app->cwd) {
/* prepend the cwd, if provided */
param = opal_os_path(false, app->cwd, app->app, NULL);
opal_argv_append_nosize(argv, param);
free(param);
} else {
/* just do your best, i guess */
opal_argv_append_nosize(argv, app->app);
}
}
/* add any provided argv */
@ -1271,6 +1326,14 @@ int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
opal_argv_append_nosize(argv, app->argv[i]);
}
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(*argv, ' ');
opal_output(0, "%s plm:base:append_bootproxy_args: final argv:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == param) ? "NULL" : param);
if (NULL != param) free(param);
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -539,7 +539,9 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s
tmp = tmp2;
}
asprintf(&tmp2, "%s\n%s\tWorking dir: %s (user: %d)\n%s\tHostfile: %s\tAdd-Hostfile: %s", tmp, pfx2, src->cwd, (int) src->user_specified_cwd,
asprintf(&tmp2, "%s\n%s\tWorking dir: %s (user: %d)\n%s\tPrefix: %s\n%s\tHostfile: %s\tAdd-Hostfile: %s", tmp,
pfx2, (NULL == src->cwd) ? "NULL" : src->cwd, (int) src->user_specified_cwd,
pfx2, (NULL == src->prefix_dir) ? "NULL" : src->prefix_dir,
pfx2, (NULL == src->hostfile) ? "NULL" : src->hostfile,
(NULL == src->add_hostfile) ? "NULL" : src->add_hostfile);
free(tmp);
@ -552,6 +554,13 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s
tmp = tmp2;
}
asprintf(&tmp2, "%s\n%s\tPreload binary: %s\tUsed on node: %s\n%s\tPreload files dest: %s\n%s\tPreload files src dir: %s", tmp,
pfx2, (src->preload_binary) ? "TRUE" : "FALSE", (src->used_on_node) ? "TRUE" : "FALSE",
pfx2, (NULL == src->preload_files_dest_dir) ? "NULL" : src->preload_files_dest_dir,
pfx2, (NULL == src->preload_files_src_dir) ? "NULL" : src->preload_files_src_dir);
free(tmp);
tmp = tmp2;
/* set the return */
*output = tmp;

Просмотреть файл

@ -11,18 +11,21 @@ int main(int argc, char* argv[])
MPI_Info info;
int rank, size;
pid_t pid;
char *host, *app, *rdir, *prefix;
char *host, *app, *rdir=NULL, *prefix;
char cwd[256];
if (argc < 5) {
printf("Usage: slave_spawn host prefix-for-host abs-path-to-exe remote-tmp-dir <files-to-move>\n");
if (argc < 3) {
printf("Usage: slave_spawn host prefix-for-host <remote-tmp-dir> <files-to-move>\n");
return 1;
}
host = argv[1];
prefix = argv[2];
app = argv[3];
rdir = argv[4];
app = "slave";
if (5 == argc) {
rdir = argv[4];
}
pid = getpid();
printf("Slave_spawn [pid %ld] starting up!\n", (long)pid);
@ -34,8 +37,11 @@ int main(int argc, char* argv[])
MPI_Info_set(info, "host", host);
MPI_Info_set(info, "ompi_prefix", prefix);
MPI_Info_set(info, "ompi_local_slave", "true");
MPI_Info_set(info, "ompi_preload_binary", "true");
MPI_Info_set(info, "ompi_preload_files_dest_dir", rdir);
if (NULL != rdir) {
MPI_Info_set(info, "ompi_preload_binary", "true");
MPI_Info_set(info, "ompi_preload_files_dest_dir", rdir);
}
if (argc == 6) {
/* files were specified */

Просмотреть файл

@ -1,6 +1,7 @@
#!/bin/bash
#
# Copyright (c) 2009 Los Alamos National Security, LLC. All rights reserved
# Copyright (c) 2009 Los Alamos National Security, LLC. All rights reserved
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
if (( $# < 1 )) ; then
@ -86,4 +87,5 @@ app=$1
shift 1
#exec the app with the remaining args
#echo "executing" "$app"
exec "$app" "$@"