Need to revise the display-map-at-launch option so it is active not only for the initial launch, but applies to any subsequent comm_spawn events too.
Add placeholders for the new orte tools. These don't actually do anything yet - in fact, I have set the .ompi_ignore so that you won't compile them (I have set a .ompi_unignore for me). Please let me know if you encounter any trouble with this - the ompi_ignore's should protect everyone. This commit was SVN r12616.
Этот коммит содержится в:
родитель
5ddcb8a652
Коммит
ca5b4358fa
@ -1125,7 +1125,10 @@ AC_CONFIG_FILES([
|
||||
orte/etc/Makefile
|
||||
|
||||
orte/tools/console/Makefile
|
||||
orte/tools/orteboot/Makefile
|
||||
orte/tools/orted/Makefile
|
||||
orte/tools/ortehalt/Makefile
|
||||
orte/tools/ortekill/Makefile
|
||||
orte/tools/orteprobe/Makefile
|
||||
orte/tools/orterun/Makefile
|
||||
orte/tools/wrappers/Makefile
|
||||
|
@ -94,6 +94,10 @@ int orte_odls_base_open(void)
|
||||
orte_odls_globals.output = -1;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int_name("odls_base", "sigkill_timeout",
|
||||
"Time to wait for a process to die after issuing a kill signal to it",
|
||||
false, false, 1, &orte_odls_globals.timeout_before_sigkill);
|
||||
|
||||
/* register the daemon cmd data type */
|
||||
tmp = ORTE_DAEMON_CMD;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_odls_pack_daemon_cmd,
|
||||
|
@ -52,6 +52,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
|
||||
opal_list_t working_attrs;
|
||||
opal_list_item_t *item;
|
||||
orte_jobid_t *jptr, parent_job=ORTE_JOBID_INVALID;
|
||||
orte_job_map_t *map;
|
||||
int rc;
|
||||
|
||||
/* check the attributes to see if anything in the environment
|
||||
@ -200,6 +201,13 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
|
||||
}
|
||||
}
|
||||
|
||||
/* if we wanted to display the map, now is the time to do it */
|
||||
if (NULL != orte_rmgr.find_attribute(attributes, ORTE_RMAPS_DISPLAY_AFTER_MAP)) {
|
||||
orte_rmaps.get_job_map(&map, job);
|
||||
orte_dss.dump(0, map, ORTE_JOB_MAP);
|
||||
}
|
||||
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -416,10 +416,6 @@ static int orte_rmgr_proxy_spawn_job(
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (NULL != orte_rmgr.find_attribute(attributes, ORTE_RMAPS_DISPLAY_AFTER_MAP)) {
|
||||
orte_rmaps.get_job_map(&map, *jobid);
|
||||
orte_dss.dump(0, map, ORTE_JOB_MAP);
|
||||
}
|
||||
}
|
||||
|
||||
if (flags & ORTE_RMGR_SETUP_TRIGS) {
|
||||
|
@ -372,10 +372,6 @@ static int orte_rmgr_urm_spawn_job(
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (NULL != orte_rmgr.find_attribute(attributes, ORTE_RMAPS_DISPLAY_AFTER_MAP)) {
|
||||
orte_rmaps.get_job_map(&map, *jobid);
|
||||
orte_dss.dump(0, map, ORTE_JOB_MAP);
|
||||
}
|
||||
}
|
||||
|
||||
if (flags & ORTE_RMGR_SETUP_TRIGS) {
|
||||
|
@ -22,7 +22,10 @@
|
||||
|
||||
SUBDIRS += \
|
||||
tools/console \
|
||||
tools/orteboot \
|
||||
tools/orted \
|
||||
tools/ortehalt \
|
||||
tools/ortekill \
|
||||
tools/orteprobe \
|
||||
tools/orterun \
|
||||
tools/wrappers \
|
||||
@ -31,7 +34,10 @@ SUBDIRS += \
|
||||
|
||||
DIST_SUBDIRS += \
|
||||
tools/console \
|
||||
tools/orteboot \
|
||||
tools/orted \
|
||||
tools/ortehalt \
|
||||
tools/ortekill \
|
||||
tools/orteprobe \
|
||||
tools/orterun \
|
||||
tools/wrappers \
|
||||
|
0
orte/tools/orteboot/.ompi_ignore
Обычный файл
0
orte/tools/orteboot/.ompi_ignore
Обычный файл
1
orte/tools/orteboot/.ompi_unignore
Обычный файл
1
orte/tools/orteboot/.ompi_unignore
Обычный файл
@ -0,0 +1 @@
|
||||
rhc
|
39
orte/tools/orteboot/Makefile.am
Обычный файл
39
orte/tools/orteboot/Makefile.am
Обычный файл
@ -0,0 +1,39 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
libs = \
|
||||
$(top_builddir)/orte/liborte.la
|
||||
|
||||
orteboot_SOURCES = \
|
||||
orteboot.c
|
||||
|
||||
orteboot_LDADD = $(libs)
|
||||
orteboot_DEPENDENCIES = $(libs)
|
||||
|
||||
if OMPI_INSTALL_BINARIES
|
||||
|
||||
bin_PROGRAMS = orteboot
|
||||
|
||||
dist_pkgdata_DATA = help-orteboot.txt
|
||||
|
||||
# AM 1.9.6 seems to have a bug in its dependencies for install-man if
|
||||
#dist_ and nodist_ are used, so explicitly add to EXTRA_DIST...
|
||||
man_MANS = orteboot.1
|
||||
EXTRA_DIST = orteboot.1
|
||||
|
||||
endif
|
130
orte/tools/orteboot/help-orteboot.txt
Обычный файл
130
orte/tools/orteboot/help-orteboot.txt
Обычный файл
@ -0,0 +1,130 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for Open RTE's orterun.
|
||||
#
|
||||
[orterun:init-failure]
|
||||
Open RTE was unable to initialize properly. The error occurred while
|
||||
attempting to %s. Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:usage]
|
||||
%s (%s) %s
|
||||
|
||||
Usage: %s [OPTION]... [PROGRAM]...
|
||||
Start the given program using Open RTE
|
||||
|
||||
%s
|
||||
|
||||
Report bugs to %s
|
||||
[orterun:version]
|
||||
%s (%s) %s
|
||||
|
||||
Report bugs to %s
|
||||
[orterun:allocate-resources]
|
||||
%s was unable to allocate enough resources to start your application.
|
||||
This might be a transient error (too many nodes in the cluster were
|
||||
unavailable at the time of the request) or a permanent error (you
|
||||
requested more nodes than exist in your cluster).
|
||||
|
||||
While probably only useful to Open RTE developers, the error returned
|
||||
was %d.
|
||||
[orterun:error-spawning]
|
||||
%s was unable to start the specified application. An attempt has been
|
||||
made to clean up all processes that did start. The error returned was
|
||||
%d.
|
||||
[orterun:appfile-not-found]
|
||||
Unable to open the appfile:
|
||||
|
||||
%s
|
||||
|
||||
Double check that this file exists and is readable.
|
||||
[orterun:executable-not-specified]
|
||||
No executable was specified on the %s command line.
|
||||
|
||||
Aborting.
|
||||
[orterun:multi-apps-and-zero-np]
|
||||
%s found multiple applications specified on the command line, with
|
||||
at least one that failed to specify the number of processes to execute.
|
||||
When specifying multiple applications, you must specify how many processes
|
||||
of each to launch via the -np argument.
|
||||
[orterun:nothing-to-do]
|
||||
%s could not find anything to do.
|
||||
|
||||
It is possible that you forgot to specify how many processes to run
|
||||
via the "-np" argument.
|
||||
[orterun:call-failed]
|
||||
%s encountered a %s call failure. This should not happen, and
|
||||
usually indicates an error within the operating system itself.
|
||||
Specifically, the following error occurred:
|
||||
|
||||
%s
|
||||
|
||||
The only other available information that may be helpful is the errno
|
||||
that was returned: %d.
|
||||
[orterun:environ]
|
||||
%s was unable to set
|
||||
%s = %s
|
||||
in the environment. Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:precondition]
|
||||
%s was unable to precondition transports
|
||||
Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:attr-failed]
|
||||
%s was unable to define an attribute
|
||||
Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:proc-aborted]
|
||||
%s noticed that job rank %lu with PID %lu on node %s exited on signal %d.
|
||||
[orterun:abnormal-exit]
|
||||
WARNING: %s encountered an abnormal exit.
|
||||
|
||||
This means that %s exited before it received notification that all
|
||||
started processes had terminated. You should double check and ensure
|
||||
that there are no runaway processes still executing.
|
||||
[orterun:empty-prefix]
|
||||
A prefix was supplied to %s that only contained slashes.
|
||||
|
||||
This is a fatal error; %s will now abort. No processes were launched.
|
||||
#
|
||||
[debugger-mca-param-not-found]
|
||||
Internal error -- the orte_base_debugger MCA parameter was not able to
|
||||
be found. Please contact the Open RTE developers; this should not
|
||||
happen.
|
||||
#
|
||||
[debugger-orte_base_user_debugger-empty]
|
||||
The MCA parameter "orte_base_user_debugger" was empty, indicating that
|
||||
no user-level debuggers have been defined. Please set this MCA
|
||||
parameter to a value and try again.
|
||||
#
|
||||
[debugger-not-found]
|
||||
A suitable debugger could not be found in your PATH. Check the values
|
||||
specified in the orte_base_user_debugger MCA parameter for the list of
|
||||
debuggers that was searched.
|
||||
#
|
||||
[debugger-exec-failed]
|
||||
%s was unable to launch the specified debugger. This is what was
|
||||
launched:
|
||||
|
||||
%s
|
||||
|
||||
Things to check:
|
||||
|
||||
- Ensure that the debugger is installed properly
|
||||
- Ensure that the "%s" executable is in your path
|
||||
- Ensure that any required licenses are available to run the debugger
|
||||
#
|
||||
[orterun:daemon-die]
|
||||
%s was unable to cleanly terminate the daemons for this job. Returned value %d instead of ORTE_SUCCESS.
|
||||
|
851
orte/tools/orteboot/orteboot.1
Обычный файл
851
orte/tools/orteboot/orteboot.1
Обычный файл
@ -0,0 +1,851 @@
|
||||
.\"
|
||||
.\" Man page for ORTE's orterun command
|
||||
.\"
|
||||
.\" .TH name section center-footer left-footer center-header
|
||||
.TH MPIRUN 1 "March 2006" "Open MPI" "OPEN MPI COMMANDS"
|
||||
.\" **************************
|
||||
.\" Name Section
|
||||
.\" **************************
|
||||
.SH NAME
|
||||
.
|
||||
orterun, mpirun, mpiexec \- Execute serial and parallel jobs in Open MPI.
|
||||
|
||||
.B Note:
|
||||
\fImpirun\fP, \fImpiexec\fP, and \fIorterun\fP are all exact synonyms for each
|
||||
other. Using any of the names will result in exactly identical behavior.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Synopsis Section
|
||||
.\" **************************
|
||||
.SH SYNOPSIS
|
||||
.
|
||||
.PP
|
||||
Single Process Multiple Data (SPMD) Model:
|
||||
|
||||
.B mpirun
|
||||
.R [ options ]
|
||||
.B <program>
|
||||
.R [ <args> ]
|
||||
.
|
||||
|
||||
Multiple Instruction Multiple Data (MIMD) Model:
|
||||
|
||||
.B mpirun
|
||||
.R [ global_options ]
|
||||
[ local_options1 ]
|
||||
.B <program1>
|
||||
.R [ <args1> ] :
|
||||
[ local_options2 ]
|
||||
.B <program2>
|
||||
.R [ <args2> ] :
|
||||
... :
|
||||
[ local_optionsN ]
|
||||
.B <programN>
|
||||
.R [ <argsN> ]
|
||||
.P
|
||||
|
||||
Note that in both models, invoking \fImpirun\fR via an absolute path
|
||||
name is equivalent to specifying the \fI--prefix\fR option with a
|
||||
\fI<dir>\fR value equivalent to the directory where \fImpirun\fR
|
||||
resides, minus its last subdirectory. For example:
|
||||
|
||||
\fBshell$\fP /usr/local/bin/mpirun ...
|
||||
|
||||
is equivalent to
|
||||
|
||||
\fBshell$\fP mpirun --prefix /usr/local
|
||||
|
||||
.
|
||||
.\" **************************
|
||||
.\" Quick Summary Section
|
||||
.\" **************************
|
||||
.SH QUICK SUMMARY
|
||||
.
|
||||
If you are simply looking for how to run an MPI application, you
|
||||
probably want to use a command line of the following form:
|
||||
|
||||
\fBshell$\fP mpirun [ -np X ] [ --hostfile <filename> ] <program>
|
||||
|
||||
This will run X copies of \fI<program>\fR in your current run-time
|
||||
environment (if running under a supported resource manager, Open MPI's
|
||||
\fImpirun\fR will usually automatically use the corresponding resource manager
|
||||
process starter, as opposed to, for example, \fIrsh\fR or \fIssh\fR,
|
||||
which require the use of a hostfile, or will default to running all X
|
||||
copies on the localhost), scheduling (by default) in a round-robin fashion by
|
||||
CPU slot. See the rest of this page for more details.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Options Section
|
||||
.\" **************************
|
||||
.SH OPTIONS
|
||||
.
|
||||
.I mpirun
|
||||
will send the name of the directory where it was invoked on the local
|
||||
node to each of the remote nodes, and attempt to change to that
|
||||
directory. See the "Current Working Directory" section below for further
|
||||
details.
|
||||
.\"
|
||||
.\" Start options listing
|
||||
.\" Indent 10 chacters from start of first column to start of second column
|
||||
.TP 10
|
||||
.B <args>
|
||||
Pass these run-time arguments to every new process. These must always
|
||||
be the last arguments to \fImpirun\fP. If an app context file is used,
|
||||
\fI<args>\fP will be ignored.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B <program>
|
||||
The program executable. This is identified as the first non-recognized argument
|
||||
to mpirun.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -aborted\fR,\fP --aborted \fR<#>\fP
|
||||
Set the maximum number of aborted processes to display.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --app \fR<appfile>\fP
|
||||
Provide an appfile, ignoring all other command line options.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -bynode\fR,\fP --bynode
|
||||
Allocate (map) the processes by node in a round-robin scheme.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -byslot\fR,\fP --byslot
|
||||
Allocate (map) the processes by slot in a round-robin scheme. This is the
|
||||
default.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -c \fR<#>\fP
|
||||
Synonym for \fI-np\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -debug\fR,\fP --debug
|
||||
Invoke the user-level debugger indicated by the \fIorte_base_user_debugger\fP
|
||||
MCA parameter.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -debugger\fR,\fP --debugger
|
||||
Sequence of debuggers to search for when \fI--debug\fP is used (i.e.
|
||||
a synonym for \fIorte_base_user_debugger\fP MCA parameter).
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -gmca\fR,\fP --gmca \fR<key> <value>\fP
|
||||
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
|
||||
the parameter name; \fI<value>\fP is the parameter value.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -h\fR,\fP --help
|
||||
Display help for this command
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -H \fR<host1,host2,...,hostN>\fP
|
||||
Synonym for \fI-host\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -host\fR,\fP --host \fR<host1,host2,...,hostN>\fP
|
||||
List of hosts on which to invoke processes.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -hostfile\fR,\fP --hostfile \fR<hostfile>\fP
|
||||
Provide a hostfile to use.
|
||||
.\" JJH - Should have man page for how to format a hostfile properly.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -machinefile\fR,\fP --machinefile \fR<machinefile>\fP
|
||||
Synonym for \fI-hostfile\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -mca\fR,\fP --mca <key> <value>
|
||||
Send arguments to various MCA modules. See the "MCA" section, below.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -n\fR,\fP --n \fR<#>\fP
|
||||
Synonym for \fI-np\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -nolocal\fR,\fP --nolocal
|
||||
Do not run any copies of the launched application on the same node as
|
||||
orterun is running. This option will override listing the localhost
|
||||
with \fB--host\fR or any other host-specifying mechanism.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -nooversubscribe\fR,\fP --nooversubscribe
|
||||
Do not oversubscribe any nodes; error (without starting any processes)
|
||||
if the requested number of processes would cause oversubscription.
|
||||
This option implicitly sets "max_slots" equal to the "slots" value for
|
||||
each node.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -np \fR<#>\fP
|
||||
Run this many copies of the program on the given nodes. This option
|
||||
indicates that the specified file is an executable program and not an
|
||||
application context. If no value is provided for the number of copies to
|
||||
execute (i.e., neither the "-np" nor its synonyms are provided on the command
|
||||
line), Open MPI will automatically execute a copy of the program on
|
||||
each process slot (see below for description of a "process slot"). This
|
||||
feature, however, can only be used in the SPMD model and will return an
|
||||
error (without beginning execution of the application) otherwise.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -nw\fR,\fP --nw
|
||||
Launch the processes and do not wait for their completion. mpirun will
|
||||
complete as soon as successful launch occurs.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -path\fR,\fP --path \fR<path>\fP
|
||||
<path> that will be used when attempting to locate requested executables.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --prefix \fR<dir>\fP
|
||||
Prefix directory that will be used to set the \fIPATH\fR and
|
||||
\fILD_LIBRARY_PATH\fR on the remote node before invoking Open MPI or
|
||||
the target process. See the "Remote Execution" section, below.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -q\fR,\fP --quiet
|
||||
Suppress informative messages from orterun during application execution.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --tmpdir \fR<dir>\fP
|
||||
Set the root for the session directory tree for mpirun only.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -tv\fR,\fP --tv
|
||||
Launch processes under the TotalView debugger.
|
||||
Deprecated backwards compatibility flag. Synonym for \fI--debug\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --universe \fR<username@hostname:universe_name>\fP
|
||||
For this application, set the universe name as:
|
||||
username@hostname:universe_name
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -v\fR,\fP --verbose
|
||||
Be verbose
|
||||
.TP
|
||||
.B -V\fR,\fP --version
|
||||
Print version number. If no other arguments are given, this will also
|
||||
cause orterun to exit.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -wd \fR<dir>\fP
|
||||
Change to the directory <dir> before the user's program executes.
|
||||
See the "Current Working Directory" section for notes on relative paths.
|
||||
.B Note:
|
||||
If the \fI-wd\fP option appears both on the command line and in an
|
||||
application context, the context will take precedence over the command line.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -x \fR<env>\fP
|
||||
Export the specified environment variables to the remote nodes before
|
||||
executing the program. Existing environment variables can be
|
||||
specified (see the Examples section, below), or new variable names
|
||||
specified with corresponding values. The parser for the \fI-x\fP
|
||||
option is not very sophisticated; it does not even understand quoted
|
||||
values. Users are advised to set variables in the environment, and
|
||||
then use \fI-x\fP to export (not define) them.
|
||||
.
|
||||
.
|
||||
.P
|
||||
The following options are useful for developers; they are not generally
|
||||
useful to most ORTE and/or MPI users:
|
||||
.
|
||||
.TP
|
||||
.B -d\fR,\fP --debug-devel
|
||||
Enable debugging of the OpenRTE (the run-time layer in Open MPI).
|
||||
This is not generally useful for most users.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --debug-daemons
|
||||
Enable debugging of any OpenRTE daemons used by this application.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --debug-daemons-file
|
||||
Enable debugging of any OpenRTE daemons used by this application, storing
|
||||
output in files.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --no-daemonize
|
||||
Do not detach OpenRTE daemons used by this application.
|
||||
.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Description Section
|
||||
.\" **************************
|
||||
.SH DESCRIPTION
|
||||
.
|
||||
One invocation of \fImpirun\fP starts an MPI application running under Open
|
||||
MPI. If the application is single process multiple data (SPMD), the application
|
||||
can be specified on the \fImpirun\fP command line.
|
||||
|
||||
If the application is multiple instruction multiple data (MIMD), comprising of
|
||||
multiple programs, the set of programs and arguments can be specified in one of
|
||||
two ways: Extended Command Line Arguments, and Application Context.
|
||||
.PP
|
||||
An application context describes the MIMD program set including all arguments
|
||||
in a separate file.
|
||||
.\"See appcontext(5) for a description of the application context syntax.
|
||||
This file essentially contains multiple \fImpirun\fP command lines, less the
|
||||
command name itself. The ability to specify different options for different
|
||||
instantiations of a program is another reason to use an application context.
|
||||
.PP
|
||||
Extended command line arguments allow for the description of the application
|
||||
layout on the command line using colons (\fI:\fP) to separate the specification
|
||||
of programs and arguments. Some options are globally set across all specified
|
||||
programs (e.g. --hostfile), while others are specific to a single program
|
||||
(e.g. -np).
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Process Slots
|
||||
.
|
||||
Open MPI uses "slots" to represent a potential location for a process.
|
||||
Hence, a node with 2 slots means that 2 processes can be launched on
|
||||
that node. For performance, the community typically equates a "slot"
|
||||
with a physical CPU, thus ensuring that any process assigned to that
|
||||
slot has a dedicated processor. This is not, however, a requirement for
|
||||
the operation of Open MPI.
|
||||
.PP
|
||||
Slots can be specified in hostfiles after the hostname. For example:
|
||||
.
|
||||
.TP 4
|
||||
host1.example.com slots=4
|
||||
Indicates that there are 4 process slots on host1.
|
||||
.
|
||||
.PP
|
||||
If no slots value is specified, then Open MPI will automatically assign
|
||||
a default value of "slots=1" to that host.
|
||||
.
|
||||
.PP
|
||||
When running under resource managers (e.g., SLURM, Torque, etc.), Open
|
||||
MPI will obtain both the hostnames and the number of slots directly
|
||||
from the resource manager. For example, if running under a SLURM job,
|
||||
Open MPI will automatically receive the hosts that SLURM has allocated
|
||||
to the job as well as how many slots on each node that SLURM says
|
||||
are usable - in most high-performance environments, the slots will
|
||||
equate to the number of processors on the node.
|
||||
.
|
||||
.PP
|
||||
When deciding where to launch processes, Open MPI will first fill up
|
||||
all available slots before oversubscribing (see "Location
|
||||
Nomenclature", below, for more details on the scheduling algorithms
|
||||
available). Unless told otherwise, Open MPI will arbitrarily
|
||||
oversubscribe nodes. For example, if the only node available is the
|
||||
localhost, Open MPI will run as many processes as specified by the
|
||||
-n (or one of its variants) command line option on the
|
||||
localhost (although they may run quite slowly, since they'll all be
|
||||
competing for CPU and other resources).
|
||||
.
|
||||
.PP
|
||||
Limits can be placed on oversubscription with the "max_slots"
|
||||
attribute in the hostfile. For example:
|
||||
.
|
||||
.TP 4
|
||||
host2.example.com slots=4 max_slots=6
|
||||
Indicates that there are 4 process slots on host2. Further, Open MPI
|
||||
is limited to launching a maximum of 6 processes on host2.
|
||||
.
|
||||
.TP
|
||||
host3.example.com slots=2 max_slots=2
|
||||
Indicates that there are 2 process slots on host3 and that no
|
||||
oversubscription is allowed (similar to the \fI--nooversubscribe\fR
|
||||
option).
|
||||
.
|
||||
.TP
|
||||
host4.example.com max_slots=2
|
||||
Shorthand; same as listing "slots=2 max_slots=2".
|
||||
.
|
||||
.
|
||||
.PP
|
||||
Note that Open MPI's support for resource managers does not currently
|
||||
set the "max_slots" values for hosts. If you wish to prevent
|
||||
oversubscription in such scenarios, use the \fI--nooversubscribe\fR
|
||||
option.
|
||||
.
|
||||
.PP
|
||||
In scenarios where the user wishes to launch an application across
|
||||
all available slots by not providing a "-n" option on the mpirun
|
||||
command line, Open MPI will launch a process on each process slot
|
||||
for each host within the provided environment. For example, if a
|
||||
hostfile has been provided, then Open MPI will spawn processes
|
||||
on each identified host up to the "slots=x" limit if oversubscription
|
||||
is not allowed. If oversubscription is allowed (the default), then
|
||||
Open MPI will spawn processes on each host up to the "max_slots=y" limit
|
||||
if that value is provided. In all cases, the "-bynode" and "-byslot"
|
||||
mapping directives will be enforced to ensure proper placement of
|
||||
process ranks.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Location Nomenclature
|
||||
.
|
||||
As described above, \fImpirun\fP can specify arbitrary locations in
|
||||
the current Open MPI universe. Locations can be specified either by
|
||||
CPU or by node.
|
||||
|
||||
.B Note:
|
||||
This nomenclature does not force Open MPI to bind processes to CPUs --
|
||||
specifying a location "by CPU" is really a convenience mechanism for
|
||||
SMPs that ultimately maps down to a specific node.
|
||||
.PP
|
||||
Specifying locations by node will launch one copy of an executable per
|
||||
specified node.
|
||||
Using the \fI--bynode\fP option tells Open MPI to use all available nodes.
|
||||
Using the \fI--byslot\fP option tells Open MPI to use all slots on an available
|
||||
node before allocating resources on the next available node.
|
||||
For example:
|
||||
.
|
||||
.TP 4
|
||||
mpirun --bynode -np 4 a.out
|
||||
Runs one copy of the executable
|
||||
.I a.out
|
||||
on all available nodes in the Open MPI universe. MPI_COMM_WORLD rank 0
|
||||
will be on node0, rank 1 will be on node1, etc. Regardless of how many slots
|
||||
are available on each of the nodes.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun --byslot -np 4 a.out
|
||||
Runs one copy of the executable
|
||||
.I a.out
|
||||
on each slot on a given node before running the executable on other available
|
||||
nodes.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Specifying Hosts
|
||||
.
|
||||
Hosts can be specified in a number of ways. The most common of which is in a
|
||||
'hostfile' or 'machinefile'. If our hostfile contains the following information:
|
||||
.
|
||||
.
|
||||
|
||||
\fBshell$\fP cat my-hostfile
|
||||
node00 slots=2
|
||||
node01 slots=2
|
||||
node02 slots=2
|
||||
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun --hostfile my-hostfile -np 3 a.out
|
||||
This will run one copy of the executable
|
||||
.I a.out
|
||||
on hosts node00,node01, and node02.
|
||||
.
|
||||
.
|
||||
.PP
|
||||
Another method for specifying hosts is directly on the command line. Here you
|
||||
can include and exclude hosts from the set of hosts to run on. For example:
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 3 --host a a.out
|
||||
Runs three copies of the executable
|
||||
.I a.out
|
||||
on host a.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 3 --host a,b,c a.out
|
||||
Runs one copy of the executable
|
||||
.I a.out
|
||||
on hosts a, b, and c.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 3 --hostfile my-hostfile --host node00 a.out
|
||||
Runs three copies of the executable
|
||||
.I a.out
|
||||
on host node00.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 3 --hostfile my-hostfile --host node10 a.out
|
||||
This will prompt an error since node10 is not in my-hostfile; mpirun will
|
||||
abort.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
shell$ mpirun -np 1 --host a hostname : -np 2 --host b,c uptime
|
||||
Runs one copy of the executable
|
||||
.I hostname
|
||||
on host a. And runs one copy of the executable
|
||||
.I uptime
|
||||
on hosts b and c.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS No Local Launch
|
||||
.
|
||||
Using the \fB--nolocal\fR option to orterun tells the system to not
|
||||
launch any of the application processes on the same node that orterun
|
||||
is running. While orterun typically blocks and consumes few system
|
||||
resources, this option can be helpful for launching very large jobs
|
||||
where orterun may actually need to use noticeable amounts of memory
|
||||
and/or processing time. \fB--nolocal\fR allows orterun to run without
|
||||
sharing the local node with the launched applications, and likewise
|
||||
allows the launched applications to run unhindered by orterun's system
|
||||
usage.
|
||||
.PP
|
||||
Note that \fB--nolocal\fR will override any other specification to
|
||||
launch the application on the local node. It will disqualify the
|
||||
localhost from being capable of running any processes in the
|
||||
application.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
shell$ mpirun -np 1 --host localhost --nolocal hostname
|
||||
This example will result in an error because orterun will not find
|
||||
anywhere to launch the application.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS No Oversubscription
|
||||
.
|
||||
Using the \fI--nooversubscribe\fR option causes Open MPI to implicitly
|
||||
set the "max_slots" value to be the same as the "slots" value for each
|
||||
node. This can be especially helpful when running jobs under a
|
||||
resource manager because Open MPI currently only sets the "slots"
|
||||
value for each node that it obtains from the resource manager.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Application Context or Executable Program?
|
||||
.
|
||||
To distinguish the two different forms, \fImpirun\fP
|
||||
looks on the command line for \fI--app\fP option. If
|
||||
it is specified, then the file named on the command line is
|
||||
assumed to be an application context. If it is not
|
||||
specified, then the file is assumed to be an executable program.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Locating Files
|
||||
.
|
||||
If \fIno\fP relative or absolute path is specified for a file, Open MPI
|
||||
will look for files by searching the directories in the user's PATH environment
|
||||
variable as defined on the source node(s).
|
||||
.PP
|
||||
If a relative directory is specified, it must be relative to the initial
|
||||
working directory determined by the specific starter used. For example when
|
||||
using the rsh or ssh starters, the initial directory is $HOME by default. Other
|
||||
starters may set the initial directory to the current working directory from
|
||||
the invocation of \fImpirun\fP.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Current Working Directory
|
||||
.
|
||||
The \fI\-wd\fP mpirun option allows the user to change to an arbitrary
|
||||
directory before their program is invoked. It can also be used in application
|
||||
context files to specify working directories on specific nodes and/or
|
||||
for specific applications.
|
||||
.PP
|
||||
If the \fI\-wd\fP option appears both in a context file and on the command line,
|
||||
the context file directory will override the command line value.
|
||||
.PP
|
||||
If the \fI-wd\fP option is specified, Open MPI will attempt to change to the
|
||||
specified directory on all of the remote nodes. If this fails, \fImpirun\fP
|
||||
will abort.
|
||||
.PP
|
||||
If the \fI-wd\fP option is \fBnot\fP specified, Open MPI will send the
|
||||
directory name where \fImpirun\fP was invoked to each of the remote nodes. The
|
||||
remote nodes will try to change to that directory. If they are unable (e.g., if
|
||||
the directory does not exist on that node), then Open MPI will use the default
|
||||
directory determined by the starter.
|
||||
.PP
|
||||
All directory changing occurs before the user's program is invoked; it
|
||||
does not wait until \fIMPI_INIT\fP is called.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Standard I/O
|
||||
.
|
||||
Open MPI directs UNIX standard input to /dev/null on all processes
|
||||
except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0 process
|
||||
inherits standard input from \fImpirun\fP.
|
||||
.B Note:
|
||||
The node that invoked \fImpirun\fP need not be the same as the node where the
|
||||
MPI_COMM_WORLD rank 0 process resides. Open MPI handles the redirection of
|
||||
\fImpirun\fP's standard input to the rank 0 process.
|
||||
.PP
|
||||
Open MPI directs UNIX standard output and error from remote nodes to the node
|
||||
that invoked \fImpirun\fP and prints it on the standard output/error of
|
||||
\fImpirun\fP.
|
||||
Local processes inherit the standard output/error of \fImpirun\fP and transfer
|
||||
to it directly.
|
||||
.PP
|
||||
Thus it is possible to redirect standard I/O for Open MPI applications by
|
||||
using the typical shell redirection procedure on \fImpirun\fP.
|
||||
|
||||
\fBshell$\fP mpirun -np 2 my_app < my_input > my_output
|
||||
|
||||
Note that in this example \fIonly\fP the MPI_COMM_WORLD rank 0 process will
|
||||
receive the stream from \fImy_input\fP on stdin. The stdin on all the other
|
||||
nodes will be tied to /dev/null. However, the stdout from all nodes will
|
||||
be collected into the \fImy_output\fP file.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Signal Propagation
|
||||
.
|
||||
When orterun receives a SIGTERM or SIGINT, it will attempt to kill
|
||||
the entire job by sending all processes in the job a SIGTERM, waiting
|
||||
a small number of seconds, then sending all processes in the job a
|
||||
SIGKILL.
|
||||
.
|
||||
SIGUSR1 and SIGUSR2 signals received by orterun are propagated to
|
||||
all processes in the job. Other signals are not currently propagated
|
||||
by orterun.
|
||||
.
|
||||
.
|
||||
.SS Process Termination / Signal Handling
|
||||
.
|
||||
During the run of an MPI application, if any rank dies abnormally
|
||||
(either exiting before invoking \fIMPI_FINALIZE\fP, or dying as the result of a
|
||||
signal), \fImpirun\fP will print out an error message and kill the rest of the
|
||||
MPI application.
|
||||
.PP
|
||||
User signal handlers should probably avoid trying to clean up MPI state
|
||||
(Open MPI is, currently, neither thread-safe nor async-signal-safe).
|
||||
For example, if a segmentation fault occurs in \fIMPI_SEND\fP (perhaps because
|
||||
a bad buffer was passed in) and a user signal handler is invoked, if this user
|
||||
handler attempts to invoke \fIMPI_FINALIZE\fP, Bad Things could happen since
|
||||
Open MPI was already "in" MPI when the error occurred. Since \fImpirun\fP
|
||||
will notice that the process died due to a signal, cleaning up MPI state is
|
||||
probably unnecessary; it is safest for the user to only clean up non-MPI state.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Process Environment
|
||||
.
|
||||
Processes in the MPI application inherit their environment from the
|
||||
Open RTE daemon upon the node on which they are running. The
|
||||
environment is typically inherited from the user's shell. On remote
|
||||
nodes, the exact environment is determined by the boot MCA module
|
||||
used. The \fIrsh\fR launch module, for example, uses either
|
||||
\fIrsh\fR/\fIssh\fR to launch the Open RTE daemon on remote nodes, and
|
||||
typically executes one or more of the user's shell-setup files before
|
||||
launching the Open RTE daemon. When running dynamically linked
|
||||
applications which require the \fILD_LIBRARY_PATH\fR environment
|
||||
variable to be set, care must be taken to ensure that it is correctly
|
||||
set when booting Open MPI.
|
||||
.PP
|
||||
See the "Remote Execution" section for more details.
|
||||
.
|
||||
.
|
||||
.SS Remote Execution
|
||||
.
|
||||
Open MPI requires that the \fIPATH\fR environment variable be set to
|
||||
find executables on remote nodes (this is typically only necessary in
|
||||
\fIrsh\fR- or \fIssh\fR-based environments -- batch/scheduled
|
||||
environments typically copy the current environment to the execution
|
||||
of remote jobs, so if the current environment has \fIPATH\fR and/or
|
||||
\fILD_LIBRARY_PATH\fR set properly, the remote nodes will also have it
|
||||
set properly). If Open MPI was compiled with shared library support,
|
||||
it may also be necessary to have the \fILD_LIBRARY_PATH\fR environment
|
||||
variable set on remote nodes as well (especially to find the shared
|
||||
libraries required to run user MPI applications).
|
||||
.PP
|
||||
However, it is not always desirable or possible to edit shell
|
||||
startup files to set \fIPATH\fR and/or \fILD_LIBRARY_PATH\fR. The
|
||||
\fI--prefix\fR option is provided for some simple configurations where
|
||||
this is not possible.
|
||||
.PP
|
||||
The \fI--prefix\fR option takes a single argument: the base directory
|
||||
on the remote node where Open MPI is installed. Open MPI will use
|
||||
this directory to set the remote \fIPATH\fR and \fILD_LIBRARY_PATH\fR
|
||||
before executing any Open MPI or user applications. This allows
|
||||
running Open MPI jobs without having pre-configured the \fIPATH\fR and
|
||||
\fILD_LIBRARY_PATH\fR on the remote nodes.
|
||||
.PP
|
||||
Open MPI adds the basename of the current
|
||||
node's "bindir" (the directory where Open MPI's executables are
|
||||
installed) to the prefix and uses that to set the \fIPATH\fR on the
|
||||
remote node. Similarly, Open MPI adds the basename of the current
|
||||
node's "libdir" (the directory where Open MPI's libraries are
|
||||
installed) to the prefix and uses that to set the
|
||||
\fILD_LIBRARY_PATH\fR on the remote node. For example:
|
||||
.TP 15
|
||||
Local bindir:
|
||||
/local/node/directory/bin
|
||||
.TP
|
||||
Local libdir:
|
||||
/local/node/directory/lib64
|
||||
.PP
|
||||
If the following command line is used:
|
||||
|
||||
\fBshell$\fP mpirun --prefix /remote/node/directory
|
||||
|
||||
Open MPI will add "/remote/node/directory/bin" to the \fIPATH\fR
|
||||
and "/remote/node/directory/lib64" to the \fILD_LIBRARY_PATH\fR on the
|
||||
remote node before attempting to execute anything.
|
||||
.PP
|
||||
Note that \fI--prefix\fR can be set on a per-context basis, allowing
|
||||
for different values for different nodes.
|
||||
.PP
|
||||
The \fI--prefix\fR option is not sufficient if the installation paths
|
||||
on the remote node are different than the local node (e.g., if "/lib"
|
||||
is used on the local node, but "/lib64" is used on the remote node),
|
||||
or if the installation paths are something other than a subdirectory
|
||||
under a common prefix.
|
||||
.PP
|
||||
Note that executing \fImpirun\fR via an absolute pathname is
|
||||
equivalent to specifying \fI--prefix\fR without the last subdirectory
|
||||
in the absolute pathname to \fImpirun\fR. For example:
|
||||
|
||||
\fBshell$\fP /usr/local/bin/mpirun ...
|
||||
|
||||
is equivalent to
|
||||
|
||||
\fBshell$\fP mpirun --prefix /usr/local
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Exported Environment Variables
|
||||
.
|
||||
All environment variables that are named in the form OMPI_* will automatically
|
||||
be exported to new processes on the local and remote nodes.
|
||||
The \fI\-x\fP option to \fImpirun\fP can be used to export specific environment
|
||||
variables to the new processes. While the syntax of the \fI\-x\fP
|
||||
option allows the definition of new variables, note that the parser
|
||||
for this option is currently not very sophisticated - it does not even
|
||||
understand quoted values. Users are advised to set variables in the
|
||||
environment and use \fI\-x\fP to export them; not to define them.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS MCA (Modular Component Architecture)
|
||||
.
|
||||
The \fI-mca\fP switch allows the passing of parameters to various MCA modules.
|
||||
.\" Open MPI's MCA modules are described in detail in ompimca(7).
|
||||
MCA modules have direct impact on MPI programs because they allow tunable
|
||||
parameters to be set at run time (such as which BTL communication device driver
|
||||
to use, what parameters to pass to that BTL, etc.).
|
||||
.PP
|
||||
The \fI-mca\fP switch takes two arguments: \fI<key>\fP and \fI<value>\fP.
|
||||
The \fI<key>\fP argument generally specifies which MCA module will receive the value.
|
||||
For example, the \fI<key>\fP "btl" is used to select which BTL to be used for
|
||||
transporting MPI messages. The \fI<value>\fP argument is the value that is
|
||||
passed.
|
||||
For example:
|
||||
.
|
||||
.TP 4
|
||||
mpirun -mca btl tcp,self -np 1 foo
|
||||
Tells Open MPI to use the "tcp" and "self" BTLs, and to run a single copy of
|
||||
"foo" on an allocated node.
|
||||
.
|
||||
.TP
|
||||
mpirun -mca btl self -np 1 foo
|
||||
Tells Open MPI to use the "self" BTL, and to run a single copy of "foo" on an
|
||||
allocated node.
|
||||
.\" And so on. Open MPI's BTL MCA modules are described in ompimca_btl(7).
|
||||
.PP
|
||||
The \fI-mca\fP switch can be used multiple times to specify different
|
||||
\fI<key>\fP and/or \fI<value>\fP arguments. If the same \fI<key>\fP is
|
||||
specified more than once, the \fI<value>\fPs are concatenated with a comma
|
||||
(",") separating them.
|
||||
.PP
|
||||
.B Note:
|
||||
The \fI-mca\fP switch is simply a shortcut for setting environment variables.
|
||||
The same effect may be accomplished by setting corresponding environment
|
||||
variables before running \fImpirun\fP.
|
||||
The form of the environment variables that Open MPI sets is:
|
||||
|
||||
OMPI_MCA_<key>=<value>
|
||||
.PP
|
||||
Note that the \fI-mca\fP switch overrides any previously set environment
|
||||
variables. Also note that unknown \fI<key>\fP arguments are still set as
|
||||
environment variables -- they are not checked (by \fImpirun\fP) for correctness.
|
||||
Illegal or incorrect \fI<value>\fP arguments may or may not be reported -- it
|
||||
depends on the specific MCA module.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Examples Section
|
||||
.\" **************************
|
||||
.SH EXAMPLES
|
||||
Be sure to also see the examples in the "Location Nomenclature" section, above.
|
||||
.
|
||||
.TP 4
|
||||
mpirun -np 1 prog1
|
||||
Load and execute prog1 on one node. Search the user's $PATH for the
|
||||
executable file on each node.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 8 --byslot prog1
|
||||
Run 8 copies of prog1 wherever Open MPI wants to run them.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 4 -mca btl ib,tcp,self prog1
|
||||
Run 4 copies of prog1 using the "ib", "tcp", and "self" BTL's for the transport
|
||||
of MPI messages.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Diagnostics Section
|
||||
.\" **************************
|
||||
.
|
||||
.\" .SH DIAGNOSTICS
|
||||
.\".TP 4
|
||||
.\"Error Msg:
|
||||
.\"Description
|
||||
.
|
||||
.\" **************************
|
||||
.\" Return Value Section
|
||||
.\" **************************
|
||||
.
|
||||
.SH RETURN VALUE
|
||||
.
|
||||
\fImpirun\fP returns 0 if all ranks started by \fImpirun\fP exit after calling
|
||||
MPI_FINALIZE. A non-zero value is returned if an internal error occurred in
|
||||
mpirun, or one or more ranks exited before calling MPI_FINALIZE. If an
|
||||
internal error occurred in mpirun, the corresponding error code is returned.
|
||||
In the event that one or more ranks exit before calling MPI_FINALIZE, the
|
||||
return value of the rank of the process that \fImpirun\fP first notices died
|
||||
before calling MPI_FINALIZE will be returned. Note that, in general, this will
|
||||
be the first rank that died but is not guaranteed to be so.
|
||||
.PP
|
||||
However, note that if the \fI-nw\fP switch is used, the return value from
|
||||
mpirun does not indicate the exit status of the ranks.
|
||||
.
|
||||
.\" **************************
|
||||
.\" See Also Section
|
||||
.\" **************************
|
||||
.
|
||||
.\" .SH SEE ALSO
|
||||
.\" orted(1)
|
348
orte/tools/orteboot/orteboot.c
Обычный файл
348
orte/tools/orteboot/orteboot.c
Обычный файл
@ -0,0 +1,348 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_PARAM_H
|
||||
#include <sys/param.h>
|
||||
#endif
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif /* HAVE_SYS_TYPES_H */
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif /* HAVE_SYS_WAIT_H */
|
||||
#ifdef HAVE_LIBGEN_H
|
||||
#include <libgen.h>
|
||||
#endif
|
||||
|
||||
#include "opal/event/event.h"
|
||||
#include "opal/install_dirs.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/cmd_line.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/trace.h"
|
||||
#include "opal/version.h"
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "orte/class/orte_pointer_array.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/sys_info.h"
|
||||
#include "orte/util/universe_setup_file_io.h"
|
||||
#include "orte/util/pre_condition_transports.h"
|
||||
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/pls/pls.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
#include "orte/mca/schema/schema.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
/*
|
||||
* Globals
|
||||
*/
|
||||
static orte_jobid_t jobid = ORTE_JOBID_INVALID;
|
||||
static char *orteboot_basename = NULL;
|
||||
|
||||
/*
|
||||
* setup globals for catching orteboot command line options
|
||||
*/
|
||||
struct globals_t {
|
||||
bool help;
|
||||
bool version;
|
||||
bool verbose;
|
||||
bool quiet;
|
||||
bool exit;
|
||||
char *hostfile;
|
||||
char *wdir;
|
||||
opal_mutex_t lock;
|
||||
opal_condition_t cond;
|
||||
} orteboot_globals;
|
||||
|
||||
|
||||
opal_cmd_line_init_t cmd_line_init[] = {
|
||||
/* Various "obvious" options */
|
||||
{ NULL, NULL, NULL, 'h', NULL, "help", 0,
|
||||
&orteboot_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"This help message" },
|
||||
{ NULL, NULL, NULL, 'V', NULL, "version", 0,
|
||||
&orteboot_globals.version, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Print version and exit" },
|
||||
{ NULL, NULL, NULL, 'v', NULL, "verbose", 0,
|
||||
&orteboot_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Be verbose" },
|
||||
{ NULL, NULL, NULL, 'q', NULL, "quiet", 0,
|
||||
&orteboot_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Suppress helpful messages" },
|
||||
|
||||
/* Set a hostfile */
|
||||
{ "rds", "hostfile", "path", '\0', "hostfile", "hostfile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a hostfile" },
|
||||
{ "rds", "hostfile", "path", '\0', "machinefile", "machinefile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a hostfile" },
|
||||
|
||||
/* mpiexec-like arguments */
|
||||
{ NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
|
||||
&orteboot_globals.wdir, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Set the working directory of the started processes" },
|
||||
|
||||
/* These arguments can be specified multiple times */
|
||||
{ NULL, NULL, NULL, 'H', "host", "host", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"List of hosts to invoke processes on" },
|
||||
|
||||
/* OpenRTE arguments */
|
||||
{ "orte", "debug", NULL, 'd', NULL, "debug-devel", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Enable debugging of OpenRTE" },
|
||||
|
||||
{ "orte", "debug", "daemons", '\0', NULL, "debug-daemons", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Enable debugging of any OpenRTE daemons used by this application" },
|
||||
|
||||
{ "orte", "debug", "daemons_file", '\0', NULL, "debug-daemons-file", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Enable debugging of any OpenRTE daemons used by this application, storing output in files" },
|
||||
|
||||
{ "orte", "no_daemonize", NULL, '\0', NULL, "no-daemonize", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Do not detach OpenRTE daemons used by this application" },
|
||||
|
||||
{ "universe", NULL, NULL, '\0', NULL, "universe", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Set the universe name as username@hostname:universe_name for this application" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "tmpdir", 1,
|
||||
&orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Set the root for the session directory tree for orteboot ONLY" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "prefix", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Prefix where Open MPI is installed on remote nodes" },
|
||||
{ NULL, NULL, NULL, '\0', NULL, "noprefix", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Disable automatic --prefix behavior" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||
};
|
||||
|
||||
#if !defined(__WINDOWS__)
|
||||
extern char** environ;
|
||||
#endif /* !defined(__WINDOWS__) */
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
orte_app_context_t *app;
|
||||
int rc, ret;
|
||||
int id, iparam;
|
||||
opal_list_t attributes;
|
||||
opal_cmd_line_t cmd_line;
|
||||
|
||||
OBJ_CONSTRUCT(&orteboot_globals.lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&orteboot_globals.cond, opal_condition_t);
|
||||
orteboot_globals.hostfile = NULL;
|
||||
orteboot_globals.wdir = NULL;
|
||||
orteboot_globals.help = false;
|
||||
orteboot_globals.version = false;
|
||||
orteboot_globals.verbose = false;
|
||||
orteboot_globals.exit = false;
|
||||
|
||||
/* Setup MCA params */
|
||||
mca_base_param_init();
|
||||
|
||||
/* find our basename (the name of the executable) so that we can
|
||||
* use it in pretty-print error messages
|
||||
*/
|
||||
orteboot_basename = opal_basename(argv[0]);
|
||||
|
||||
/* Setup and parse the command line */
|
||||
opal_cmd_line_create(&cmd_line, cmd_line_init);
|
||||
mca_base_cmd_line_setup(&cmd_line);
|
||||
if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(&cmd_line, true,
|
||||
argc, argv))) {
|
||||
char *args = NULL;
|
||||
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
||||
opal_show_help("help-orteboot.txt", "orteboot:usage", false,
|
||||
argv[0], args);
|
||||
free(args);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* print version if requested. Do this before check for help so
|
||||
that --version --help works as one might expect. */
|
||||
if (orteboot_globals.version &&
|
||||
!(1 == argc || orteboot_globals.help)) {
|
||||
char *project_name = NULL;
|
||||
if (0 == strcmp(orteboot_basename, "ompiboot")) {
|
||||
project_name = "Open MPI";
|
||||
} else {
|
||||
project_name = "OpenRTE";
|
||||
}
|
||||
opal_show_help("help-orteboot.txt", "orteboot:version", false,
|
||||
orteboot_basename, project_name, OPAL_VERSION,
|
||||
PACKAGE_BUGREPORT);
|
||||
/* if we were the only argument, exit */
|
||||
if (2 == argc) exit(0);
|
||||
}
|
||||
|
||||
/* Check for help request */
|
||||
if (1 == argc || orteboot_globals.help) {
|
||||
char *args = NULL;
|
||||
char *project_name = NULL;
|
||||
if (0 == strcmp(orteboot_basename, "ompiboot")) {
|
||||
project_name = "Open MPI";
|
||||
} else {
|
||||
project_name = "OpenRTE";
|
||||
}
|
||||
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
||||
opal_show_help("help-orteboot.txt", "orteboot:usage", false,
|
||||
orteboot_basename, project_name, OPAL_VERSION,
|
||||
orteboot_basename, args,
|
||||
PACKAGE_BUGREPORT);
|
||||
free(args);
|
||||
|
||||
/* If someone asks for help, that should be all we do */
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/* check for daemon flags and push them into the environment
|
||||
* since this isn't being automatically done
|
||||
*/
|
||||
id = mca_base_param_reg_int_name("orte_debug", "daemons",
|
||||
"Whether to debug the ORTE daemons or not",
|
||||
false, false, (int)false, &iparam);
|
||||
if (iparam) {
|
||||
char *tmp = mca_base_param_environ_variable("orte", "debug", "daemons");
|
||||
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||
opal_show_help("help-orteboot.txt", "orteboot:environ", false,
|
||||
orteboot_basename, tmp, "1", rc);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
free(tmp);
|
||||
}
|
||||
id = mca_base_param_reg_int_name("orte", "debug",
|
||||
"Top-level ORTE debug switch",
|
||||
false, false, 0, &iparam);
|
||||
if (iparam) {
|
||||
char *tmp = mca_base_param_environ_variable("orte", NULL, "debug");
|
||||
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||
opal_show_help("help-orteboot.txt", "orteboot:environ", false,
|
||||
orteboot_basename, tmp, "1", rc);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
free(tmp);
|
||||
}
|
||||
id = mca_base_param_reg_int_name("orte_debug", "daemons_file",
|
||||
"Whether want stdout/stderr of daemons to go to a file or not",
|
||||
false, false, 0, &iparam);
|
||||
if (iparam) {
|
||||
char *tmp = mca_base_param_environ_variable("orte", "debug",
|
||||
"daemons_file");
|
||||
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||
opal_show_help("help-orteboot.txt", "orteboot:environ", false,
|
||||
orteboot_basename, tmp, "1", rc);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
free(tmp);
|
||||
}
|
||||
id = mca_base_param_reg_int_name("orte", "no_daemonize",
|
||||
"Whether to properly daemonize the ORTE daemons or not",
|
||||
false, false, 0, &iparam);
|
||||
if (iparam) {
|
||||
char *tmp = mca_base_param_environ_variable("orte", "no_daemonize", NULL);
|
||||
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||
opal_show_help("help-orteboot.txt", "orteboot:environ", false,
|
||||
orteboot_basename, tmp, "1", rc);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
/* Intialize our Open RTE environment */
|
||||
/* Set the flag telling orte_init that I am NOT a
|
||||
* singleton, but am "infrastructure" - prevents setting
|
||||
* up incorrect infrastructure that only a singleton would
|
||||
* require
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_init(true))) {
|
||||
opal_show_help("help-orteboot.txt", "orteboot:init-failure", true,
|
||||
"orte_init()", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Prep to start the virtual machine */
|
||||
/* construct the list of attributes */
|
||||
OBJ_CONSTRUCT(&attributes, opal_list_t);
|
||||
|
||||
orte_rmgr.add_attribute(&attributes, ORTE_RMAPS_PERNODE, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_NO_OVERRIDE);
|
||||
|
||||
/* Create the app - in this case, that's just a no_op to get the daemons launched */
|
||||
app = OBJ_NEW(orte_app_context_t);
|
||||
if (NULL == app) {
|
||||
opal_show_help("help-orteboot.txt", "orteboot:call-failed",
|
||||
true, orteboot_basename, "system", "malloc returned NULL", errno);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
/* Spawn the job */
|
||||
|
||||
rc = orte_rmgr.spawn_job(&app, 1, &jobid, 0, NULL, NULL, 0, &attributes);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
/* JMS show_help */
|
||||
opal_output(0, "%s: spawn failed with errno=%d\n", orteboot_basename, rc);
|
||||
}
|
||||
OBJ_DESTRUCT(&attributes);
|
||||
|
||||
|
||||
orte_finalize();
|
||||
free(orteboot_basename);
|
||||
return rc;
|
||||
}
|
||||
|
0
orte/tools/ortehalt/.ompi_ignore
Обычный файл
0
orte/tools/ortehalt/.ompi_ignore
Обычный файл
1
orte/tools/ortehalt/.ompi_unignore
Обычный файл
1
orte/tools/ortehalt/.ompi_unignore
Обычный файл
@ -0,0 +1 @@
|
||||
rhc
|
39
orte/tools/ortehalt/Makefile.am
Обычный файл
39
orte/tools/ortehalt/Makefile.am
Обычный файл
@ -0,0 +1,39 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
libs = \
|
||||
$(top_builddir)/orte/liborte.la
|
||||
|
||||
ortehalt_SOURCES = \
|
||||
ortehalt.c
|
||||
|
||||
ortehalt_LDADD = $(libs)
|
||||
ortehalt_DEPENDENCIES = $(libs)
|
||||
|
||||
if OMPI_INSTALL_BINARIES
|
||||
|
||||
bin_PROGRAMS = ortehalt
|
||||
|
||||
dist_pkgdata_DATA = help-ortehalt.txt
|
||||
|
||||
# AM 1.9.6 seems to have a bug in its dependencies for install-man if
|
||||
#dist_ and nodist_ are used, so explicitly add to EXTRA_DIST...
|
||||
man_MANS = ortehalt.1
|
||||
EXTRA_DIST = ortehalt.1
|
||||
|
||||
endif
|
130
orte/tools/ortehalt/help-ortehalt.txt
Обычный файл
130
orte/tools/ortehalt/help-ortehalt.txt
Обычный файл
@ -0,0 +1,130 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for Open RTE's orterun.
|
||||
#
|
||||
[orterun:init-failure]
|
||||
Open RTE was unable to initialize properly. The error occurred while
|
||||
attempting to %s. Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:usage]
|
||||
%s (%s) %s
|
||||
|
||||
Usage: %s [OPTION]... [PROGRAM]...
|
||||
Start the given program using Open RTE
|
||||
|
||||
%s
|
||||
|
||||
Report bugs to %s
|
||||
[orterun:version]
|
||||
%s (%s) %s
|
||||
|
||||
Report bugs to %s
|
||||
[orterun:allocate-resources]
|
||||
%s was unable to allocate enough resources to start your application.
|
||||
This might be a transient error (too many nodes in the cluster were
|
||||
unavailable at the time of the request) or a permanent error (you
|
||||
requested more nodes than exist in your cluster).
|
||||
|
||||
While probably only useful to Open RTE developers, the error returned
|
||||
was %d.
|
||||
[orterun:error-spawning]
|
||||
%s was unable to start the specified application. An attempt has been
|
||||
made to clean up all processes that did start. The error returned was
|
||||
%d.
|
||||
[orterun:appfile-not-found]
|
||||
Unable to open the appfile:
|
||||
|
||||
%s
|
||||
|
||||
Double check that this file exists and is readable.
|
||||
[orterun:executable-not-specified]
|
||||
No executable was specified on the %s command line.
|
||||
|
||||
Aborting.
|
||||
[orterun:multi-apps-and-zero-np]
|
||||
%s found multiple applications specified on the command line, with
|
||||
at least one that failed to specify the number of processes to execute.
|
||||
When specifying multiple applications, you must specify how many processes
|
||||
of each to launch via the -np argument.
|
||||
[orterun:nothing-to-do]
|
||||
%s could not find anything to do.
|
||||
|
||||
It is possible that you forgot to specify how many processes to run
|
||||
via the "-np" argument.
|
||||
[orterun:call-failed]
|
||||
%s encountered a %s call failure. This should not happen, and
|
||||
usually indicates an error within the operating system itself.
|
||||
Specifically, the following error occurred:
|
||||
|
||||
%s
|
||||
|
||||
The only other available information that may be helpful is the errno
|
||||
that was returned: %d.
|
||||
[orterun:environ]
|
||||
%s was unable to set
|
||||
%s = %s
|
||||
in the environment. Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:precondition]
|
||||
%s was unable to precondition transports
|
||||
Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:attr-failed]
|
||||
%s was unable to define an attribute
|
||||
Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:proc-aborted]
|
||||
%s noticed that job rank %lu with PID %lu on node %s exited on signal %d.
|
||||
[orterun:abnormal-exit]
|
||||
WARNING: %s encountered an abnormal exit.
|
||||
|
||||
This means that %s exited before it received notification that all
|
||||
started processes had terminated. You should double check and ensure
|
||||
that there are no runaway processes still executing.
|
||||
[orterun:empty-prefix]
|
||||
A prefix was supplied to %s that only contained slashes.
|
||||
|
||||
This is a fatal error; %s will now abort. No processes were launched.
|
||||
#
|
||||
[debugger-mca-param-not-found]
|
||||
Internal error -- the orte_base_debugger MCA parameter was not able to
|
||||
be found. Please contact the Open RTE developers; this should not
|
||||
happen.
|
||||
#
|
||||
[debugger-orte_base_user_debugger-empty]
|
||||
The MCA parameter "orte_base_user_debugger" was empty, indicating that
|
||||
no user-level debuggers have been defined. Please set this MCA
|
||||
parameter to a value and try again.
|
||||
#
|
||||
[debugger-not-found]
|
||||
A suitable debugger could not be found in your PATH. Check the values
|
||||
specified in the orte_base_user_debugger MCA parameter for the list of
|
||||
debuggers that was searched.
|
||||
#
|
||||
[debugger-exec-failed]
|
||||
%s was unable to launch the specified debugger. This is what was
|
||||
launched:
|
||||
|
||||
%s
|
||||
|
||||
Things to check:
|
||||
|
||||
- Ensure that the debugger is installed properly
|
||||
- Ensure that the "%s" executable is in your path
|
||||
- Ensure that any required licenses are available to run the debugger
|
||||
#
|
||||
[orterun:daemon-die]
|
||||
%s was unable to cleanly terminate the daemons for this job. Returned value %d instead of ORTE_SUCCESS.
|
||||
|
851
orte/tools/ortehalt/ortehalt.1
Обычный файл
851
orte/tools/ortehalt/ortehalt.1
Обычный файл
@ -0,0 +1,851 @@
|
||||
.\"
|
||||
.\" Man page for ORTE's orterun command
|
||||
.\"
|
||||
.\" .TH name section center-footer left-footer center-header
|
||||
.TH MPIRUN 1 "March 2006" "Open MPI" "OPEN MPI COMMANDS"
|
||||
.\" **************************
|
||||
.\" Name Section
|
||||
.\" **************************
|
||||
.SH NAME
|
||||
.
|
||||
orterun, mpirun, mpiexec \- Execute serial and parallel jobs in Open MPI.
|
||||
|
||||
.B Note:
|
||||
\fImpirun\fP, \fImpiexec\fP, and \fIorterun\fP are all exact synonyms for each
|
||||
other. Using any of the names will result in exactly identical behavior.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Synopsis Section
|
||||
.\" **************************
|
||||
.SH SYNOPSIS
|
||||
.
|
||||
.PP
|
||||
Single Process Multiple Data (SPMD) Model:
|
||||
|
||||
.B mpirun
|
||||
.R [ options ]
|
||||
.B <program>
|
||||
.R [ <args> ]
|
||||
.
|
||||
|
||||
Multiple Instruction Multiple Data (MIMD) Model:
|
||||
|
||||
.B mpirun
|
||||
.R [ global_options ]
|
||||
[ local_options1 ]
|
||||
.B <program1>
|
||||
.R [ <args1> ] :
|
||||
[ local_options2 ]
|
||||
.B <program2>
|
||||
.R [ <args2> ] :
|
||||
... :
|
||||
[ local_optionsN ]
|
||||
.B <programN>
|
||||
.R [ <argsN> ]
|
||||
.P
|
||||
|
||||
Note that in both models, invoking \fImpirun\fR via an absolute path
|
||||
name is equivalent to specifying the \fI--prefix\fR option with a
|
||||
\fI<dir>\fR value equivalent to the directory where \fImpirun\fR
|
||||
resides, minus its last subdirectory. For example:
|
||||
|
||||
\fBshell$\fP /usr/local/bin/mpirun ...
|
||||
|
||||
is equivalent to
|
||||
|
||||
\fBshell$\fP mpirun --prefix /usr/local
|
||||
|
||||
.
|
||||
.\" **************************
|
||||
.\" Quick Summary Section
|
||||
.\" **************************
|
||||
.SH QUICK SUMMARY
|
||||
.
|
||||
If you are simply looking for how to run an MPI application, you
|
||||
probably want to use a command line of the following form:
|
||||
|
||||
\fBshell$\fP mpirun [ -np X ] [ --hostfile <filename> ] <program>
|
||||
|
||||
This will run X copies of \fI<program>\fR in your current run-time
|
||||
environment (if running under a supported resource manager, Open MPI's
|
||||
\fImpirun\fR will usually automatically use the corresponding resource manager
|
||||
process starter, as opposed to, for example, \fIrsh\fR or \fIssh\fR,
|
||||
which require the use of a hostfile, or will default to running all X
|
||||
copies on the localhost), scheduling (by default) in a round-robin fashion by
|
||||
CPU slot. See the rest of this page for more details.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Options Section
|
||||
.\" **************************
|
||||
.SH OPTIONS
|
||||
.
|
||||
.I mpirun
|
||||
will send the name of the directory where it was invoked on the local
|
||||
node to each of the remote nodes, and attempt to change to that
|
||||
directory. See the "Current Working Directory" section below for further
|
||||
details.
|
||||
.\"
|
||||
.\" Start options listing
|
||||
.\" Indent 10 chacters from start of first column to start of second column
|
||||
.TP 10
|
||||
.B <args>
|
||||
Pass these run-time arguments to every new process. These must always
|
||||
be the last arguments to \fImpirun\fP. If an app context file is used,
|
||||
\fI<args>\fP will be ignored.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B <program>
|
||||
The program executable. This is identified as the first non-recognized argument
|
||||
to mpirun.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -aborted\fR,\fP --aborted \fR<#>\fP
|
||||
Set the maximum number of aborted processes to display.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --app \fR<appfile>\fP
|
||||
Provide an appfile, ignoring all other command line options.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -bynode\fR,\fP --bynode
|
||||
Allocate (map) the processes by node in a round-robin scheme.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -byslot\fR,\fP --byslot
|
||||
Allocate (map) the processes by slot in a round-robin scheme. This is the
|
||||
default.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -c \fR<#>\fP
|
||||
Synonym for \fI-np\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -debug\fR,\fP --debug
|
||||
Invoke the user-level debugger indicated by the \fIorte_base_user_debugger\fP
|
||||
MCA parameter.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -debugger\fR,\fP --debugger
|
||||
Sequence of debuggers to search for when \fI--debug\fP is used (i.e.
|
||||
a synonym for \fIorte_base_user_debugger\fP MCA parameter).
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -gmca\fR,\fP --gmca \fR<key> <value>\fP
|
||||
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
|
||||
the parameter name; \fI<value>\fP is the parameter value.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -h\fR,\fP --help
|
||||
Display help for this command
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -H \fR<host1,host2,...,hostN>\fP
|
||||
Synonym for \fI-host\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -host\fR,\fP --host \fR<host1,host2,...,hostN>\fP
|
||||
List of hosts on which to invoke processes.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -hostfile\fR,\fP --hostfile \fR<hostfile>\fP
|
||||
Provide a hostfile to use.
|
||||
.\" JJH - Should have man page for how to format a hostfile properly.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -machinefile\fR,\fP --machinefile \fR<machinefile>\fP
|
||||
Synonym for \fI-hostfile\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -mca\fR,\fP --mca <key> <value>
|
||||
Send arguments to various MCA modules. See the "MCA" section, below.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -n\fR,\fP --n \fR<#>\fP
|
||||
Synonym for \fI-np\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -nolocal\fR,\fP --nolocal
|
||||
Do not run any copies of the launched application on the same node as
|
||||
orterun is running. This option will override listing the localhost
|
||||
with \fB--host\fR or any other host-specifying mechanism.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -nooversubscribe\fR,\fP --nooversubscribe
|
||||
Do not oversubscribe any nodes; error (without starting any processes)
|
||||
if the requested number of processes would cause oversubscription.
|
||||
This option implicitly sets "max_slots" equal to the "slots" value for
|
||||
each node.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -np \fR<#>\fP
|
||||
Run this many copies of the program on the given nodes. This option
|
||||
indicates that the specified file is an executable program and not an
|
||||
application context. If no value is provided for the number of copies to
|
||||
execute (i.e., neither the "-np" nor its synonyms are provided on the command
|
||||
line), Open MPI will automatically execute a copy of the program on
|
||||
each process slot (see below for description of a "process slot"). This
|
||||
feature, however, can only be used in the SPMD model and will return an
|
||||
error (without beginning execution of the application) otherwise.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -nw\fR,\fP --nw
|
||||
Launch the processes and do not wait for their completion. mpirun will
|
||||
complete as soon as successful launch occurs.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -path\fR,\fP --path \fR<path>\fP
|
||||
<path> that will be used when attempting to locate requested executables.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --prefix \fR<dir>\fP
|
||||
Prefix directory that will be used to set the \fIPATH\fR and
|
||||
\fILD_LIBRARY_PATH\fR on the remote node before invoking Open MPI or
|
||||
the target process. See the "Remote Execution" section, below.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -q\fR,\fP --quiet
|
||||
Suppress informative messages from orterun during application execution.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --tmpdir \fR<dir>\fP
|
||||
Set the root for the session directory tree for mpirun only.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -tv\fR,\fP --tv
|
||||
Launch processes under the TotalView debugger.
|
||||
Deprecated backwards compatibility flag. Synonym for \fI--debug\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --universe \fR<username@hostname:universe_name>\fP
|
||||
For this application, set the universe name as:
|
||||
username@hostname:universe_name
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -v\fR,\fP --verbose
|
||||
Be verbose
|
||||
.TP
|
||||
.B -V\fR,\fP --version
|
||||
Print version number. If no other arguments are given, this will also
|
||||
cause orterun to exit.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -wd \fR<dir>\fP
|
||||
Change to the directory <dir> before the user's program executes.
|
||||
See the "Current Working Directory" section for notes on relative paths.
|
||||
.B Note:
|
||||
If the \fI-wd\fP option appears both on the command line and in an
|
||||
application context, the context will take precedence over the command line.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -x \fR<env>\fP
|
||||
Export the specified environment variables to the remote nodes before
|
||||
executing the program. Existing environment variables can be
|
||||
specified (see the Examples section, below), or new variable names
|
||||
specified with corresponding values. The parser for the \fI-x\fP
|
||||
option is not very sophisticated; it does not even understand quoted
|
||||
values. Users are advised to set variables in the environment, and
|
||||
then use \fI-x\fP to export (not define) them.
|
||||
.
|
||||
.
|
||||
.P
|
||||
The following options are useful for developers; they are not generally
|
||||
useful to most ORTE and/or MPI users:
|
||||
.
|
||||
.TP
|
||||
.B -d\fR,\fP --debug-devel
|
||||
Enable debugging of the OpenRTE (the run-time layer in Open MPI).
|
||||
This is not generally useful for most users.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --debug-daemons
|
||||
Enable debugging of any OpenRTE daemons used by this application.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --debug-daemons-file
|
||||
Enable debugging of any OpenRTE daemons used by this application, storing
|
||||
output in files.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --no-daemonize
|
||||
Do not detach OpenRTE daemons used by this application.
|
||||
.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Description Section
|
||||
.\" **************************
|
||||
.SH DESCRIPTION
|
||||
.
|
||||
One invocation of \fImpirun\fP starts an MPI application running under Open
|
||||
MPI. If the application is single process multiple data (SPMD), the application
|
||||
can be specified on the \fImpirun\fP command line.
|
||||
|
||||
If the application is multiple instruction multiple data (MIMD), consisting of
|
||||
multiple programs, the set of programs and arguments can be specified in one of
|
||||
two ways: Extended Command Line Arguments, and Application Context.
|
||||
.PP
|
||||
An application context describes the MIMD program set including all arguments
|
||||
in a separate file.
|
||||
.\"See appcontext(5) for a description of the application context syntax.
|
||||
This file essentially contains multiple \fImpirun\fP command lines, less the
|
||||
command name itself. The ability to specify different options for different
|
||||
instantiations of a program is another reason to use an application context.
|
||||
.PP
|
||||
Extended command line arguments allow for the description of the application
|
||||
layout on the command line using colons (\fI:\fP) to separate the specification
|
||||
of programs and arguments. Some options are globally set across all specified
|
||||
programs (e.g. --hostfile), while others are specific to a single program
|
||||
(e.g. -np).
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Process Slots
|
||||
.
|
||||
Open MPI uses "slots" to represent a potential location for a process.
|
||||
Hence, a node with 2 slots means that 2 processes can be launched on
|
||||
that node. For performance, the community typically equates a "slot"
|
||||
with a physical CPU, thus ensuring that any process assigned to that
|
||||
slot has a dedicated processor. This is not, however, a requirement for
|
||||
the operation of Open MPI.
|
||||
.PP
|
||||
Slots can be specified in hostfiles after the hostname. For example:
|
||||
.
|
||||
.TP 4
|
||||
host1.example.com slots=4
|
||||
Indicates that there are 4 process slots on host1.
|
||||
.
|
||||
.PP
|
||||
If no slots value is specified, then Open MPI will automatically assign
|
||||
a default value of "slots=1" to that host.
|
||||
.
|
||||
.PP
|
||||
When running under resource managers (e.g., SLURM, Torque, etc.), Open
|
||||
MPI will obtain both the hostnames and the number of slots directly
|
||||
from the resource manager. For example, if running under a SLURM job,
|
||||
Open MPI will automatically receive the hosts that SLURM has allocated
|
||||
to the job as well as how many slots on each node that SLURM says
|
||||
are usable - in most high-performance environments, the slots will
|
||||
equate to the number of processors on the node.
|
||||
.
|
||||
.PP
|
||||
When deciding where to launch processes, Open MPI will first fill up
|
||||
all available slots before oversubscribing (see "Location
|
||||
Nomenclature", below, for more details on the scheduling algorithms
|
||||
available). Unless told otherwise, Open MPI will arbitrarily
|
||||
oversubscribe nodes. For example, if the only node available is the
|
||||
localhost, Open MPI will run as many processes as specified by the
|
||||
-n (or one of its variants) command line option on the
|
||||
localhost (although they may run quite slowly, since they'll all be
|
||||
competing for CPU and other resources).
|
||||
.
|
||||
.PP
|
||||
Limits can be placed on oversubscription with the "max_slots"
|
||||
attribute in the hostfile. For example:
|
||||
.
|
||||
.TP 4
|
||||
host2.example.com slots=4 max_slots=6
|
||||
Indicates that there are 4 process slots on host2. Further, Open MPI
|
||||
is limited to launching a maximum of 6 processes on host2.
|
||||
.
|
||||
.TP
|
||||
host3.example.com slots=2 max_slots=2
|
||||
Indicates that there are 2 process slots on host3 and that no
|
||||
oversubscription is allowed (similar to the \fI--nooversubscribe\fR
|
||||
option).
|
||||
.
|
||||
.TP
|
||||
host4.example.com max_slots=2
|
||||
Shorthand; same as listing "slots=2 max_slots=2".
|
||||
.
|
||||
.
|
||||
.PP
|
||||
Note that Open MPI's support for resource managers does not currently
|
||||
set the "max_slots" values for hosts. If you wish to prevent
|
||||
oversubscription in such scenarios, use the \fI--nooversubscribe\fR
|
||||
option.
|
||||
.
|
||||
.PP
|
||||
In scenarios where the user wishes to launch an application across
|
||||
all available slots by not providing a "-n" option on the mpirun
|
||||
command line, Open MPI will launch a process on each process slot
|
||||
for each host within the provided environment. For example, if a
|
||||
hostfile has been provided, then Open MPI will spawn processes
|
||||
on each identified host up to the "slots=x" limit if oversubscription
|
||||
is not allowed. If oversubscription is allowed (the default), then
|
||||
Open MPI will spawn processes on each host up to the "max_slots=y" limit
|
||||
if that value is provided. In all cases, the "-bynode" and "-byslot"
|
||||
mapping directives will be enforced to ensure proper placement of
|
||||
process ranks.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Location Nomenclature
|
||||
.
|
||||
As described above, \fImpirun\fP can specify arbitrary locations in
|
||||
the current Open MPI universe. Locations can be specified either by
|
||||
CPU or by node.
|
||||
|
||||
.B Note:
|
||||
This nomenclature does not force Open MPI to bind processes to CPUs --
|
||||
specifying a location "by CPU" is really a convenience mechanism for
|
||||
SMPs that ultimately maps down to a specific node.
|
||||
.PP
|
||||
Specifying locations by node will launch one copy of an executable per
|
||||
specified node.
|
||||
Using the \fI--bynode\fP option tells Open MPI to use all available nodes.
|
||||
Using the \fI--byslot\fP option tells Open MPI to use all slots on an available
|
||||
node before allocating resources on the next available node.
|
||||
For example:
|
||||
.
|
||||
.TP 4
|
||||
mpirun --bynode -np 4 a.out
|
||||
Runs one copy of the executable
|
||||
.I a.out
|
||||
on all available nodes in the Open MPI universe. MPI_COMM_WORLD rank 0
|
||||
will be on node0, rank 1 will be on node1, etc., regardless of how many slots
|
||||
are available on each of the nodes.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun --byslot -np 4 a.out
|
||||
Runs one copy of the executable
|
||||
.I a.out
|
||||
on each slot on a given node before running the executable on other available
|
||||
nodes.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Specifying Hosts
|
||||
.
|
||||
Hosts can be specified in a number of ways. The most common is in a
|
||||
'hostfile' or 'machinefile'. If our hostfile contains the following information:
|
||||
.
|
||||
.
|
||||
|
||||
\fBshell$\fP cat my-hostfile
|
||||
node00 slots=2
|
||||
node01 slots=2
|
||||
node02 slots=2
|
||||
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun --hostfile my-hostfile -np 3 a.out
|
||||
This will run one copy of the executable
|
||||
.I a.out
|
||||
on hosts node00, node01, and node02.
|
||||
.
|
||||
.
|
||||
.PP
|
||||
Another method for specifying hosts is directly on the command line. Here we
|
||||
can include and exclude hosts from the set of hosts to run on. For example:
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 3 --host a a.out
|
||||
Runs three copies of the executable
|
||||
.I a.out
|
||||
on host a.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 3 --host a,b,c a.out
|
||||
Runs one copy of the executable
|
||||
.I a.out
|
||||
on hosts a, b, and c.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 3 --hostfile my-hostfile --host node00 a.out
|
||||
Runs three copies of the executable
|
||||
.I a.out
|
||||
on host node00.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 3 --hostfile my-hostfile --host node10 a.out
|
||||
This will prompt an error since node10 is not in my-hostfile; mpirun will
|
||||
abort.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
shell$ mpirun -np 1 --host a hostname : -np 2 --host b,c uptime
|
||||
Runs one copy of the executable
|
||||
.I hostname
|
||||
on host a, and runs one copy of the executable
|
||||
.I uptime
|
||||
on hosts b and c.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS No Local Launch
|
||||
.
|
||||
Using the \fB--nolocal\fR option to orterun tells the system to not
|
||||
launch any of the application processes on the same node that orterun
|
||||
is running. While orterun typically blocks and consumes few system
|
||||
resources, this option can be helpful for launching very large jobs
|
||||
where orterun may actually need to use noticeable amounts of memory
|
||||
and/or processing time. \fB--nolocal\fR allows orterun to run without
|
||||
sharing the local node with the launched applications, and likewise
|
||||
allows the launched applications to run unhindered by orterun's system
|
||||
usage.
|
||||
.PP
|
||||
Note that \fB--nolocal\fR will override any other specification to
|
||||
launch the application on the local node. It will disqualify the
|
||||
localhost from being capable of running any processes in the
|
||||
application.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
shell$ mpirun -np 1 --host localhost --nolocal hostname
|
||||
This example will result in an error because orterun will not find
|
||||
anywhere to launch the application.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS No Oversubscription
|
||||
.
|
||||
Using the \fI--nooversubscribe\fR option causes Open MPI to implicitly
|
||||
set the "max_slots" value to be the same as the "slots" value for each
|
||||
node. This can be especially helpful when running jobs under a
|
||||
resource manager because Open MPI currently only sets the "slots"
|
||||
value for each node that it obtains from the resource manager.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Application Context or Executable Program?
|
||||
.
|
||||
To distinguish the two different forms, \fImpirun\fP
|
||||
looks on the command line for the \fI--app\fP option. If
|
||||
it is specified, then the file named on the command line is
|
||||
assumed to be an application context. If it is not
|
||||
specified, then the file is assumed to be an executable program.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Locating Files
|
||||
.
|
||||
If \fIno\fP relative or absolute path is specified for a file, Open MPI
|
||||
will look for files by searching the directories in the user's PATH environment
|
||||
variable as defined on the source node(s).
|
||||
.PP
|
||||
If a relative directory is specified, it must be relative to the initial
|
||||
working directory determined by the specific starter used. For example when
|
||||
using the rsh or ssh starters, the initial directory is $HOME by default. Other
|
||||
starters may set the initial directory to the current working directory from
|
||||
the invocation of \fImpirun\fP.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Current Working Directory
|
||||
.
|
||||
The \fI\-wd\fP mpirun option allows the user to change to an arbitrary
|
||||
directory before their program is invoked. It can also be used in application
|
||||
context files to specify working directories on specific nodes and/or
|
||||
for specific applications.
|
||||
.PP
|
||||
If the \fI\-wd\fP option appears both in a context file and on the command line,
|
||||
the context file directory will override the command line value.
|
||||
.PP
|
||||
If the \fI-wd\fP option is specified, Open MPI will attempt to change to the
|
||||
specified directory on all of the remote nodes. If this fails, \fImpirun\fP
|
||||
will abort.
|
||||
.PP
|
||||
If the \fI-wd\fP option is \fBnot\fP specified, Open MPI will send the
|
||||
directory name where \fImpirun\fP was invoked to each of the remote nodes. The
|
||||
remote nodes will try to change to that directory. If they are unable (e.g., if
|
||||
the directory does not exist on that node), then Open MPI will use the default
|
||||
directory determined by the starter.
|
||||
.PP
|
||||
All directory changing occurs before the user's program is invoked; it
|
||||
does not wait until \fIMPI_INIT\fP is called.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Standard I/O
|
||||
.
|
||||
Open MPI directs UNIX standard input to /dev/null on all processes
|
||||
except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0 process
|
||||
inherits standard input from \fImpirun\fP.
|
||||
.B Note:
|
||||
The node that invoked \fImpirun\fP need not be the same as the node where the
|
||||
MPI_COMM_WORLD rank 0 process resides. Open MPI handles the redirection of
|
||||
\fImpirun\fP's standard input to the rank 0 process.
|
||||
.PP
|
||||
Open MPI directs UNIX standard output and error from remote nodes to the node
|
||||
that invoked \fImpirun\fP and prints it on the standard output/error of
|
||||
\fImpirun\fP.
|
||||
Local processes inherit the standard output/error of \fImpirun\fP and transfer
|
||||
to it directly.
|
||||
.PP
|
||||
Thus it is possible to redirect standard I/O for Open MPI applications by
|
||||
using the typical shell redirection procedure on \fImpirun\fP.
|
||||
|
||||
\fBshell$\fP mpirun -np 2 my_app < my_input > my_output
|
||||
|
||||
Note that in this example \fIonly\fP the MPI_COMM_WORLD rank 0 process will
|
||||
receive the stream from \fImy_input\fP on stdin. The stdin on all the other
|
||||
nodes will be tied to /dev/null. However, the stdout from all nodes will
|
||||
be collected into the \fImy_output\fP file.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Signal Propagation
|
||||
.
|
||||
When orterun receives a SIGTERM or SIGINT, it will attempt to kill
|
||||
the entire job by sending all processes in the job a SIGTERM, waiting
|
||||
a small number of seconds, then sending all processes in the job a
|
||||
SIGKILL.
|
||||
.
|
||||
SIGUSR1 and SIGUSR2 signals received by orterun are propagated to
|
||||
all processes in the job. Other signals are not currently propagated
|
||||
by orterun.
|
||||
.
|
||||
.
|
||||
.SS Process Termination / Signal Handling
|
||||
.
|
||||
During the run of an MPI application, if any rank dies abnormally
|
||||
(either exiting before invoking \fIMPI_FINALIZE\fP, or dying as the result of a
|
||||
signal), \fImpirun\fP will print out an error message and kill the rest of the
|
||||
MPI application.
|
||||
.PP
|
||||
User signal handlers should probably avoid trying to cleanup MPI state
|
||||
(Open MPI is, currently, neither thread-safe nor async-signal-safe).
|
||||
For example, if a segmentation fault occurs in \fIMPI_SEND\fP (perhaps because
|
||||
a bad buffer was passed in) and a user signal handler is invoked, if this user
|
||||
handler attempts to invoke \fIMPI_FINALIZE\fP, Bad Things could happen since
|
||||
Open MPI was already "in" MPI when the error occurred. Since \fImpirun\fP
|
||||
will notice that the process died due to a signal, it is probably not
|
||||
necessary for the user to clean up MPI state; it is safest to only clean up non-MPI state.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Process Environment
|
||||
.
|
||||
Processes in the MPI application inherit their environment from the
|
||||
Open RTE daemon upon the node on which they are running. The
|
||||
environment is typically inherited from the user's shell. On remote
|
||||
nodes, the exact environment is determined by the boot MCA module
|
||||
used. The \fIrsh\fR launch module, for example, uses either
|
||||
\fIrsh\fR/\fIssh\fR to launch the Open RTE daemon on remote nodes, and
|
||||
typically executes one or more of the user's shell-setup files before
|
||||
launching the Open RTE daemon. When running dynamically linked
|
||||
applications which require the \fILD_LIBRARY_PATH\fR environment
|
||||
variable to be set, care must be taken to ensure that it is correctly
|
||||
set when booting Open MPI.
|
||||
.PP
|
||||
See the "Remote Execution" section for more details.
|
||||
.
|
||||
.
|
||||
.SS Remote Execution
|
||||
.
|
||||
Open MPI requires that the \fIPATH\fR environment variable be set to
|
||||
find executables on remote nodes (this is typically only necessary in
|
||||
\fIrsh\fR- or \fIssh\fR-based environments -- batch/scheduled
|
||||
environments typically copy the current environment to the execution
|
||||
of remote jobs, so if the current environment has \fIPATH\fR and/or
|
||||
\fILD_LIBRARY_PATH\fR set properly, the remote nodes will also have it
|
||||
set properly). If Open MPI was compiled with shared library support,
|
||||
it may also be necessary to have the \fILD_LIBRARY_PATH\fR environment
|
||||
variable set on remote nodes as well (especially to find the shared
|
||||
libraries required to run user MPI applications).
|
||||
.PP
|
||||
However, it is not always desirable or possible to edit shell
|
||||
startup files to set \fIPATH\fR and/or \fILD_LIBRARY_PATH\fR. The
|
||||
\fI--prefix\fR option is provided for some simple configurations where
|
||||
this is not possible.
|
||||
.PP
|
||||
The \fI--prefix\fR option takes a single argument: the base directory
|
||||
on the remote node where Open MPI is installed. Open MPI will use
|
||||
this directory to set the remote \fIPATH\fR and \fILD_LIBRARY_PATH\fR
|
||||
before executing any Open MPI or user applications. This allows
|
||||
running Open MPI jobs without having pre-configured the \fIPATH\fR and
|
||||
\fILD_LIBRARY_PATH\fR on the remote nodes.
|
||||
.PP
|
||||
Open MPI adds the basename of the current
|
||||
node's "bindir" (the directory where Open MPI's executables are
|
||||
installed) to the prefix and uses that to set the \fIPATH\fR on the
|
||||
remote node. Similarly, Open MPI adds the basename of the current
|
||||
node's "libdir" (the directory where Open MPI's libraries are
|
||||
installed) to the prefix and uses that to set the
|
||||
\fILD_LIBRARY_PATH\fR on the remote node. For example:
|
||||
.TP 15
|
||||
Local bindir:
|
||||
/local/node/directory/bin
|
||||
.TP
|
||||
Local libdir:
|
||||
/local/node/directory/lib64
|
||||
.PP
|
||||
If the following command line is used:
|
||||
|
||||
\fBshell$\fP mpirun --prefix /remote/node/directory
|
||||
|
||||
Open MPI will add "/remote/node/directory/bin" to the \fIPATH\fR
|
||||
and "/remote/node/directory/lib64" to the \fILD_LIBRARY_PATH\fR on the
|
||||
remote node before attempting to execute anything.
|
||||
.PP
|
||||
Note that \fI--prefix\fR can be set on a per-context basis, allowing
|
||||
for different values for different nodes.
|
||||
.PP
|
||||
The \fI--prefix\fR option is not sufficient if the installation paths
|
||||
on the remote node are different than the local node (e.g., if "/lib"
|
||||
is used on the local node, but "/lib64" is used on the remote node),
|
||||
or if the installation paths are something other than a subdirectory
|
||||
under a common prefix.
|
||||
.PP
|
||||
Note that executing \fImpirun\fR via an absolute pathname is
|
||||
equivalent to specifying \fI--prefix\fR without the last subdirectory
|
||||
in the absolute pathname to \fImpirun\fR. For example:
|
||||
|
||||
\fBshell$\fP /usr/local/bin/mpirun ...
|
||||
|
||||
is equivalent to
|
||||
|
||||
\fBshell$\fP mpirun --prefix /usr/local
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Exported Environment Variables
|
||||
.
|
||||
All environment variables that are named in the form OMPI_* will automatically
|
||||
be exported to new processes on the local and remote nodes.
|
||||
The \fI\-x\fP option to \fImpirun\fP can be used to export specific environment
|
||||
variables to the new processes. While the syntax of the \fI\-x\fP
|
||||
option allows the definition of new variables, note that the parser
|
||||
for this option is currently not very sophisticated - it does not even
|
||||
understand quoted values. Users are advised to set variables in the
|
||||
environment and use \fI\-x\fP to export them; not to define them.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS MCA (Modular Component Architecture)
|
||||
.
|
||||
The \fI-mca\fP switch allows the passing of parameters to various MCA modules.
|
||||
.\" Open MPI's MCA modules are described in detail in ompimca(7).
|
||||
MCA modules have direct impact on MPI programs because they allow tunable
|
||||
parameters to be set at run time (such as which BTL communication device driver
|
||||
to use, what parameters to pass to that BTL, etc.).
|
||||
.PP
|
||||
The \fI-mca\fP switch takes two arguments: \fI<key>\fP and \fI<value>\fP.
|
||||
The \fI<key>\fP argument generally specifies which MCA module will receive the value.
|
||||
For example, the \fI<key>\fP "btl" is used to select which BTL to be used for
|
||||
transporting MPI messages. The \fI<value>\fP argument is the value that is
|
||||
passed.
|
||||
For example:
|
||||
.
|
||||
.TP 4
|
||||
mpirun -mca btl tcp,self -np 1 foo
|
||||
Tells Open MPI to use the "tcp" and "self" BTLs, and to run a single copy of
|
||||
"foo" on an allocated node.
|
||||
.
|
||||
.TP
|
||||
mpirun -mca btl self -np 1 foo
|
||||
Tells Open MPI to use the "self" BTL, and to run a single copy of "foo" on an
|
||||
allocated node.
|
||||
.\" And so on. Open MPI's BTL MCA modules are described in ompimca_btl(7).
|
||||
.PP
|
||||
The \fI-mca\fP switch can be used multiple times to specify different
|
||||
\fI<key>\fP and/or \fI<value>\fP arguments. If the same \fI<key>\fP is
|
||||
specified more than once, the \fI<value>\fPs are concatenated with a comma
|
||||
(",") separating them.
|
||||
.PP
|
||||
.B Note:
|
||||
The \fI-mca\fP switch is simply a shortcut for setting environment variables.
|
||||
The same effect may be accomplished by setting corresponding environment
|
||||
variables before running \fImpirun\fP.
|
||||
The form of the environment variables that Open MPI sets is:
|
||||
|
||||
OMPI_MCA_<key>=<value>
|
||||
.PP
|
||||
Note that the \fI-mca\fP switch overrides any previously set environment
|
||||
variables. Also note that unknown \fI<key>\fP arguments are still set as
|
||||
environment variables -- they are not checked (by \fImpirun\fP) for correctness.
|
||||
Illegal or incorrect \fI<value>\fP arguments may or may not be reported -- it
|
||||
depends on the specific MCA module.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Examples Section
|
||||
.\" **************************
|
||||
.SH EXAMPLES
|
||||
Be sure to also see the examples in the "Location Nomenclature" section, above.
|
||||
.
|
||||
.TP 4
|
||||
mpirun -np 1 prog1
|
||||
Load and execute prog1 on one node. Search the user's $PATH for the
|
||||
executable file on each node.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 8 --byslot prog1
|
||||
Run 8 copies of prog1 wherever Open MPI wants to run them.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 4 -mca btl ib,tcp,self prog1
|
||||
Run 4 copies of prog1 using the "ib", "tcp", and "self" BTL's for the transport
|
||||
of MPI messages.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Diagnostics Section
|
||||
.\" **************************
|
||||
.
|
||||
.\" .SH DIAGNOSTICS
|
||||
.\".TP 4
|
||||
.\"Error Msg:
|
||||
.\"Description
|
||||
.
|
||||
.\" **************************
|
||||
.\" Return Value Section
|
||||
.\" **************************
|
||||
.
|
||||
.SH RETURN VALUE
|
||||
.
|
||||
\fImpirun\fP returns 0 if all ranks started by \fImpirun\fP exit after calling
|
||||
MPI_FINALIZE. A non-zero value is returned if an internal error occurred in
|
||||
mpirun, or one or more ranks exited before calling MPI_FINALIZE. If an
|
||||
internal error occurred in mpirun, the corresponding error code is returned.
|
||||
In the event that one or more ranks exit before calling MPI_FINALIZE, the
|
||||
return value of the rank of the process that \fImpirun\fP first notices died
|
||||
before calling MPI_FINALIZE will be returned. Note that, in general, this will
|
||||
be the first rank that died but is not guaranteed to be so.
|
||||
.PP
|
||||
However, note that if the \fI-nw\fP switch is used, the return value from
|
||||
mpirun does not indicate the exit status of the ranks.
|
||||
.
|
||||
.\" **************************
|
||||
.\" See Also Section
|
||||
.\" **************************
|
||||
.
|
||||
.\" .SH SEE ALSO
|
||||
.\" orted(1)
|
177
orte/tools/ortehalt/ortehalt.c
Обычный файл
177
orte/tools/ortehalt/ortehalt.c
Обычный файл
@ -0,0 +1,177 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_PARAM_H
|
||||
#include <sys/param.h>
|
||||
#endif
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif /* HAVE_SYS_TYPES_H */
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif /* HAVE_SYS_WAIT_H */
|
||||
#ifdef HAVE_LIBGEN_H
|
||||
#include <libgen.h>
|
||||
#endif
|
||||
|
||||
#include "opal/event/event.h"
|
||||
#include "opal/install_dirs.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/cmd_line.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/trace.h"
|
||||
#include "opal/version.h"
|
||||
|
||||
#include "orte/class/orte_pointer_array.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/sys_info.h"
|
||||
#include "orte/util/universe_setup_file_io.h"
|
||||
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/pls/pls.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
#include "orte/mca/schema/schema.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
static char *orte_basename = NULL;
|
||||
|
||||
/*
|
||||
* setup globals for catching ortehalt command line options
|
||||
*/
|
||||
struct globals_t {
|
||||
bool help;
|
||||
bool version;
|
||||
bool verbose;
|
||||
bool quiet;
|
||||
bool exit;
|
||||
int exit_status;
|
||||
char *wdir;
|
||||
char *path;
|
||||
opal_mutex_t lock;
|
||||
opal_condition_t cond;
|
||||
} ortehalt_globals;
|
||||
|
||||
|
||||
opal_cmd_line_init_t cmd_line_init[] = {
|
||||
/* Various "obvious" options */
|
||||
{ NULL, NULL, NULL, 'h', NULL, "help", 0,
|
||||
&ortehalt_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"This help message" },
|
||||
{ NULL, NULL, NULL, 'V', NULL, "version", 0,
|
||||
&ortehalt_globals.version, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Print version and exit" },
|
||||
{ NULL, NULL, NULL, 'v', NULL, "verbose", 0,
|
||||
&ortehalt_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Be verbose" },
|
||||
{ NULL, NULL, NULL, 'q', NULL, "quiet", 0,
|
||||
&ortehalt_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Suppress helpful messages" },
|
||||
|
||||
/* OpenRTE arguments */
|
||||
{ "orte", "debug", NULL, 'd', NULL, "debug-devel", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Enable debugging of OpenRTE" },
|
||||
|
||||
{ "universe", NULL, NULL, '\0', NULL, "universe", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Set the universe name as username@hostname:universe_name for this application" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "tmpdir", 1,
|
||||
&orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Set the root for the session directory tree for orterun ONLY" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||
};
|
||||
|
||||
#if !defined(__WINDOWS__)
|
||||
extern char** environ;
|
||||
#endif /* !defined(__WINDOWS__) */
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int rc;
|
||||
int id, iparam;
|
||||
|
||||
/* Setup MCA params */
|
||||
|
||||
mca_base_param_init();
|
||||
orte_register_params(false);
|
||||
|
||||
/* find our basename (the name of the executable) so that we can
|
||||
use it in pretty-print error messages */
|
||||
orte_basename = opal_basename(argv[0]);
|
||||
|
||||
/* check for daemon flags and push them into the environment
|
||||
* since this isn't being automatically done
|
||||
*/
|
||||
id = mca_base_param_reg_int_name("orte", "debug",
|
||||
"Top-level ORTE debug switch",
|
||||
false, false, 0, &iparam);
|
||||
if (iparam) {
|
||||
char *tmp = mca_base_param_environ_variable("orte", NULL, "debug");
|
||||
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||
opal_show_help("help-ortehalt.txt", "ortehalt:environ", false,
|
||||
orte_basename, tmp, "1", rc);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
/* Intialize our Open RTE environment */
|
||||
/* Set the flag telling orte_init that I am NOT a
|
||||
* singleton, but am "infrastructure" - prevents setting
|
||||
* up incorrect infrastructure that only a singleton would
|
||||
* require
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_init(true))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:init-failure", true,
|
||||
"orte_init()", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
orte_finalize();
|
||||
free(orte_basename);
|
||||
return rc;
|
||||
}
|
0
orte/tools/ortekill/.ompi_ignore
Обычный файл
0
orte/tools/ortekill/.ompi_ignore
Обычный файл
1
orte/tools/ortekill/.ompi_unignore
Обычный файл
1
orte/tools/ortekill/.ompi_unignore
Обычный файл
@ -0,0 +1 @@
|
||||
rhc
|
39
orte/tools/ortekill/Makefile.am
Обычный файл
39
orte/tools/ortekill/Makefile.am
Обычный файл
@ -0,0 +1,39 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
libs = \
|
||||
$(top_builddir)/orte/liborte.la
|
||||
|
||||
ortekill_SOURCES = \
|
||||
ortekill.c
|
||||
|
||||
ortekill_LDADD = $(libs)
|
||||
ortekill_DEPENDENCIES = $(libs)
|
||||
|
||||
if OMPI_INSTALL_BINARIES
|
||||
|
||||
bin_PROGRAMS = ortekill
|
||||
|
||||
dist_pkgdata_DATA = help-ortekill.txt
|
||||
|
||||
# AM 1.9.6 seems to have a bug in its dependencies for install-man if
|
||||
#dist_ and nodist_ are used, so explicitly add to EXTRA_DIST...
|
||||
man_MANS = ortekill.1
|
||||
EXTRA_DIST = ortekill.1
|
||||
|
||||
endif
|
130
orte/tools/ortekill/help-ortekill.txt
Обычный файл
130
orte/tools/ortekill/help-ortekill.txt
Обычный файл
@ -0,0 +1,130 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for Open RTE's orterun.
|
||||
#
|
||||
[orterun:init-failure]
|
||||
Open RTE was unable to initialize properly. The error occurred while
|
||||
attempting to %s. Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:usage]
|
||||
%s (%s) %s
|
||||
|
||||
Usage: %s [OPTION]... [PROGRAM]...
|
||||
Start the given program using Open RTE
|
||||
|
||||
%s
|
||||
|
||||
Report bugs to %s
|
||||
[orterun:version]
|
||||
%s (%s) %s
|
||||
|
||||
Report bugs to %s
|
||||
[orterun:allocate-resources]
|
||||
%s was unable to allocate enough resources to start your application.
|
||||
This might be a transient error (too many nodes in the cluster were
|
||||
unavailable at the time of the request) or a permanent error (you
|
||||
requested more nodes than exist in your cluster).
|
||||
|
||||
While probably only useful to Open RTE developers, the error returned
|
||||
was %d.
|
||||
[orterun:error-spawning]
|
||||
%s was unable to start the specified application. An attempt has been
|
||||
made to clean up all processes that did start. The error returned was
|
||||
%d.
|
||||
[orterun:appfile-not-found]
|
||||
Unable to open the appfile:
|
||||
|
||||
%s
|
||||
|
||||
Double check that this file exists and is readable.
|
||||
[orterun:executable-not-specified]
|
||||
No executable was specified on the %s command line.
|
||||
|
||||
Aborting.
|
||||
[orterun:multi-apps-and-zero-np]
|
||||
%s found multiple applications specified on the command line, with
|
||||
at least one that failed to specify the number of processes to execute.
|
||||
When specifying multiple applications, you must specify how many processes
|
||||
of each to launch via the -np argument.
|
||||
[orterun:nothing-to-do]
|
||||
%s could not find anything to do.
|
||||
|
||||
It is possible that you forgot to specify how many processes to run
|
||||
via the "-np" argument.
|
||||
[orterun:call-failed]
|
||||
%s encountered a %s call failure. This should not happen, and
|
||||
usually indicates an error within the operating system itself.
|
||||
Specifically, the following error occurred:
|
||||
|
||||
%s
|
||||
|
||||
The only other available information that may be helpful is the errno
|
||||
that was returned: %d.
|
||||
[orterun:environ]
|
||||
%s was unable to set
|
||||
%s = %s
|
||||
in the environment. Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:precondition]
|
||||
%s was unable to precondition transports
|
||||
Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:attr-failed]
|
||||
%s was unable to define an attribute
|
||||
Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:proc-aborted]
|
||||
%s noticed that job rank %lu with PID %lu on node %s exited on signal %d.
|
||||
[orterun:abnormal-exit]
|
||||
WARNING: %s encountered an abnormal exit.
|
||||
|
||||
This means that %s exited before it received notification that all
|
||||
started processes had terminated. You should double check and ensure
|
||||
that there are no runaway processes still executing.
|
||||
[orterun:empty-prefix]
|
||||
A prefix was supplied to %s that only contained slashes.
|
||||
|
||||
This is a fatal error; %s will now abort. No processes were launched.
|
||||
#
|
||||
[debugger-mca-param-not-found]
|
||||
Internal error -- the orte_base_debugger MCA parameter was not able to
|
||||
be found. Please contact the Open RTE developers; this should not
|
||||
happen.
|
||||
#
|
||||
[debugger-orte_base_user_debugger-empty]
|
||||
The MCA parameter "orte_base_user_debugger" was empty, indicating that
|
||||
no user-level debuggers have been defined. Please set this MCA
|
||||
parameter to a value and try again.
|
||||
#
|
||||
[debugger-not-found]
|
||||
A suitable debugger could not be found in your PATH. Check the values
|
||||
specified in the orte_base_user_debugger MCA parameter for the list of
|
||||
debuggers that was searched.
|
||||
#
|
||||
[debugger-exec-failed]
|
||||
%s was unable to launch the specified debugger. This is what was
|
||||
launched:
|
||||
|
||||
%s
|
||||
|
||||
Things to check:
|
||||
|
||||
- Ensure that the debugger is installed properly
|
||||
- Ensure that the "%s" executable is in your path
|
||||
- Ensure that any required licenses are available to run the debugger
|
||||
#
|
||||
[orterun:daemon-die]
|
||||
%s was unable to cleanly terminate the daemons for this job. Returned value %d instead of ORTE_SUCCESS.
|
||||
|
851
orte/tools/ortekill/ortekill.1
Обычный файл
851
orte/tools/ortekill/ortekill.1
Обычный файл
@ -0,0 +1,851 @@
|
||||
.\"
|
||||
.\" Man page for ORTE's orterun command
|
||||
.\"
|
||||
.\" .TH name section center-footer left-footer center-header
|
||||
.TH MPIRUN 1 "March 2006" "Open MPI" "OPEN MPI COMMANDS"
|
||||
.\" **************************
|
||||
.\" Name Section
|
||||
.\" **************************
|
||||
.SH NAME
|
||||
.
|
||||
orterun, mpirun, mpiexec \- Execute serial and parallel jobs in Open MPI.
|
||||
|
||||
.B Note:
|
||||
\fImpirun\fP, \fImpiexec\fP, and \fIorterun\fP are all exact synonyms for each
|
||||
other. Using any of the names will result in exactly identical behavior.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Synopsis Section
|
||||
.\" **************************
|
||||
.SH SYNOPSIS
|
||||
.
|
||||
.PP
|
||||
Single Process Multiple Data (SPMD) Model:
|
||||
|
||||
.B mpirun
|
||||
.R [ options ]
|
||||
.B <program>
|
||||
.R [ <args> ]
|
||||
.
|
||||
|
||||
Multiple Instruction Multiple Data (MIMD) Model:
|
||||
|
||||
.B mpirun
|
||||
.R [ global_options ]
|
||||
[ local_options1 ]
|
||||
.B <program1>
|
||||
.R [ <args1> ] :
|
||||
[ local_options2 ]
|
||||
.B <program2>
|
||||
.R [ <args2> ] :
|
||||
... :
|
||||
[ local_optionsN ]
|
||||
.B <programN>
|
||||
.R [ <argsN> ]
|
||||
.P
|
||||
|
||||
Note that in both models, invoking \fImpirun\fR via an absolute path
|
||||
name is equivalent to specifying the \fI--prefix\fR option with a
|
||||
\fI<dir>\fR value equivalent to the directory where \fImpirun\fR
|
||||
resides, minus its last subdirectory. For example:
|
||||
|
||||
\fBshell$\fP /usr/local/bin/mpirun ...
|
||||
|
||||
is equivalent to
|
||||
|
||||
\fBshell$\fP mpirun --prefix /usr/local
|
||||
|
||||
.
|
||||
.\" **************************
|
||||
.\" Quick Summary Section
|
||||
.\" **************************
|
||||
.SH QUICK SUMMARY
|
||||
.
|
||||
If you are simply looking for how to run an MPI application, you
|
||||
probably want to use a command line of the following form:
|
||||
|
||||
\fBshell$\fP mpirun [ -np X ] [ --hostfile <filename> ] <program>
|
||||
|
||||
This will run X copies of \fI<program>\fR in your current run-time
|
||||
environment (if running under a supported resource manager, Open MPI's
|
||||
\fImpirun\fR will usually automatically use the corresponding resource manager
|
||||
process starter, as opposed to, for example, \fIrsh\fR or \fIssh\fR,
|
||||
which require the use of a hostfile, or will default to running all X
|
||||
copies on the localhost), scheduling (by default) in a round-robin fashion by
|
||||
CPU slot. See the rest of this page for more details.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Options Section
|
||||
.\" **************************
|
||||
.SH OPTIONS
|
||||
.
|
||||
.I mpirun
|
||||
will send the name of the directory where it was invoked on the local
|
||||
node to each of the remote nodes, and attempt to change to that
|
||||
directory. See the "Current Working Directory" section below for further
|
||||
details.
|
||||
.\"
|
||||
.\" Start options listing
|
||||
.\" Indent 10 characters from start of first column to start of second column
|
||||
.TP 10
|
||||
.B <args>
|
||||
Pass these run-time arguments to every new process. These must always
|
||||
be the last arguments to \fImpirun\fP. If an app context file is used,
|
||||
\fI<args>\fP will be ignored.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B <program>
|
||||
The program executable. This is identified as the first non-recognized argument
|
||||
to mpirun.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -aborted\fR,\fP --aborted \fR<#>\fP
|
||||
Set the maximum number of aborted processes to display.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --app \fR<appfile>\fP
|
||||
Provide an appfile, ignoring all other command line options.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -bynode\fR,\fP --bynode
|
||||
Allocate (map) the processes by node in a round-robin scheme.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -byslot\fR,\fP --byslot
|
||||
Allocate (map) the processes by slot in a round-robin scheme. This is the
|
||||
default.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -c \fR<#>\fP
|
||||
Synonym for \fI-np\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -debug\fR,\fP --debug
|
||||
Invoke the user-level debugger indicated by the \fIorte_base_user_debugger\fP
|
||||
MCA parameter.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -debugger\fR,\fP --debugger
|
||||
Sequence of debuggers to search for when \fI--debug\fP is used (i.e.
|
||||
a synonym for \fIorte_base_user_debugger\fP MCA parameter).
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -gmca\fR,\fP --gmca \fR<key> <value>\fP
|
||||
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
|
||||
the parameter name; \fI<value>\fP is the parameter value.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -h\fR,\fP --help
|
||||
Display help for this command
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -H \fR<host1,host2,...,hostN>\fP
|
||||
Synonym for \fI-host\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -host\fR,\fP --host \fR<host1,host2,...,hostN>\fP
|
||||
List of hosts on which to invoke processes.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -hostfile\fR,\fP --hostfile \fR<hostfile>\fP
|
||||
Provide a hostfile to use.
|
||||
.\" JJH - Should have man page for how to format a hostfile properly.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -machinefile\fR,\fP --machinefile \fR<machinefile>\fP
|
||||
Synonym for \fI-hostfile\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -mca\fR,\fP --mca <key> <value>
|
||||
Send arguments to various MCA modules. See the "MCA" section, below.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -n\fR,\fP --n \fR<#>\fP
|
||||
Synonym for \fI-np\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -nolocal\fR,\fP --nolocal
|
||||
Do not run any copies of the launched application on the same node as
|
||||
orterun is running. This option will override listing the localhost
|
||||
with \fB--host\fR or any other host-specifying mechanism.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -nooversubscribe\fR,\fP --nooversubscribe
|
||||
Do not oversubscribe any nodes; error (without starting any processes)
|
||||
if the requested number of processes would cause oversubscription.
|
||||
This option implicitly sets "max_slots" equal to the "slots" value for
|
||||
each node.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -np \fR<#>\fP
|
||||
Run this many copies of the program on the given nodes. This option
|
||||
indicates that the specified file is an executable program and not an
|
||||
application context. If no value is provided for the number of copies to
|
||||
execute (i.e., neither the "-np" nor its synonyms are provided on the command
|
||||
line), Open MPI will automatically execute a copy of the program on
|
||||
each process slot (see below for description of a "process slot"). This
|
||||
feature, however, can only be used in the SPMD model and will return an
|
||||
error (without beginning execution of the application) otherwise.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -nw\fR,\fP --nw
|
||||
Launch the processes and do not wait for their completion. mpirun will
|
||||
complete as soon as successful launch occurs.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -path\fR,\fP --path \fR<path>\fP
|
||||
<path> that will be used when attempting to locate requested executables.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --prefix \fR<dir>\fP
|
||||
Prefix directory that will be used to set the \fIPATH\fR and
|
||||
\fILD_LIBRARY_PATH\fR on the remote node before invoking Open MPI or
|
||||
the target process. See the "Remote Execution" section, below.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -q\fR,\fP --quiet
|
||||
Suppress informative messages from orterun during application execution.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --tmpdir \fR<dir>\fP
|
||||
Set the root for the session directory tree for mpirun only.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -tv\fR,\fP --tv
|
||||
Launch processes under the TotalView debugger.
|
||||
Deprecated backwards compatibility flag. Synonym for \fI--debug\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --universe \fR<username@hostname:universe_name>\fP
|
||||
For this application, set the universe name as:
|
||||
username@hostname:universe_name
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -v\fR,\fP --verbose
|
||||
Be verbose
|
||||
.TP
|
||||
.B -V\fR,\fP --version
|
||||
Print version number. If no other arguments are given, this will also
|
||||
cause orterun to exit.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -wd \fR<dir>\fP
|
||||
Change to the directory <dir> before the user's program executes.
|
||||
See the "Current Working Directory" section for notes on relative paths.
|
||||
.B Note:
|
||||
If the \fI-wd\fP option appears both on the command line and in an
|
||||
application context, the context will take precedence over the command line.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -x \fR<env>\fP
|
||||
Export the specified environment variables to the remote nodes before
|
||||
executing the program. Existing environment variables can be
|
||||
specified (see the Examples section, below), or new variable names
|
||||
specified with corresponding values. The parser for the \fI-x\fP
|
||||
option is not very sophisticated; it does not even understand quoted
|
||||
values. Users are advised to set variables in the environment, and
|
||||
then use \fI-x\fP to export (not define) them.
|
||||
.
|
||||
.
|
||||
.P
|
||||
The following options are useful for developers; they are not generally
|
||||
useful to most ORTE and/or MPI users:
|
||||
.
|
||||
.TP
|
||||
.B -d\fR,\fP --debug-devel
|
||||
Enable debugging of the OpenRTE (the run-time layer in Open MPI).
|
||||
This is not generally useful for most users.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --debug-daemons
|
||||
Enable debugging of any OpenRTE daemons used by this application.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --debug-daemons-file
|
||||
Enable debugging of any OpenRTE daemons used by this application, storing
|
||||
output in files.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --no-daemonize
|
||||
Do not detach OpenRTE daemons used by this application.
|
||||
.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Description Section
|
||||
.\" **************************
|
||||
.SH DESCRIPTION
|
||||
.
|
||||
One invocation of \fImpirun\fP starts an MPI application running under Open
|
||||
MPI. If the application is single process multiple data (SPMD), the application
|
||||
can be specified on the \fImpirun\fP command line.
|
||||
|
||||
If the application is multiple instruction multiple data (MIMD), consisting of
|
||||
multiple programs, the set of programs and arguments can be specified in one of
|
||||
two ways: Extended Command Line Arguments, and Application Context.
|
||||
.PP
|
||||
An application context describes the MIMD program set including all arguments
|
||||
in a separate file.
|
||||
.\"See appcontext(5) for a description of the application context syntax.
|
||||
This file essentially contains multiple \fImpirun\fP command lines, less the
|
||||
command name itself. The ability to specify different options for different
|
||||
instantiations of a program is another reason to use an application context.
|
||||
.PP
|
||||
Extended command line arguments allow for the description of the application
|
||||
layout on the command line using colons (\fI:\fP) to separate the specification
|
||||
of programs and arguments. Some options are globally set across all specified
|
||||
programs (e.g. --hostfile), while others are specific to a single program
|
||||
(e.g. -np).
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Process Slots
|
||||
.
|
||||
Open MPI uses "slots" to represent a potential location for a process.
|
||||
Hence, a node with 2 slots means that 2 processes can be launched on
|
||||
that node. For performance, the community typically equates a "slot"
|
||||
with a physical CPU, thus ensuring that any process assigned to that
|
||||
slot has a dedicated processor. This is not, however, a requirement for
|
||||
the operation of Open MPI.
|
||||
.PP
|
||||
Slots can be specified in hostfiles after the hostname. For example:
|
||||
.
|
||||
.TP 4
|
||||
host1.example.com slots=4
|
||||
Indicates that there are 4 process slots on host1.
|
||||
.
|
||||
.PP
|
||||
If no slots value is specified, then Open MPI will automatically assign
|
||||
a default value of "slots=1" to that host.
|
||||
.
|
||||
.PP
|
||||
When running under resource managers (e.g., SLURM, Torque, etc.), Open
|
||||
MPI will obtain both the hostnames and the number of slots directly
|
||||
from the resource manager. For example, if running under a SLURM job,
|
||||
Open MPI will automatically receive the hosts that SLURM has allocated
|
||||
to the job as well as how many slots on each node that SLURM says
|
||||
are usable - in most high-performance environments, the slots will
|
||||
equate to the number of processors on the node.
|
||||
.
|
||||
.PP
|
||||
When deciding where to launch processes, Open MPI will first fill up
|
||||
all available slots before oversubscribing (see "Location
|
||||
Nomenclature", below, for more details on the scheduling algorithms
|
||||
available). Unless told otherwise, Open MPI will arbitrarily
|
||||
oversubscribe nodes. For example, if the only node available is the
|
||||
localhost, Open MPI will run as many processes as specified by the
|
||||
-n (or one of its variants) command line option on the
|
||||
localhost (although they may run quite slowly, since they'll all be
|
||||
competing for CPU and other resources).
|
||||
.
|
||||
.PP
|
||||
Limits can be placed on oversubscription with the "max_slots"
|
||||
attribute in the hostfile. For example:
|
||||
.
|
||||
.TP 4
|
||||
host2.example.com slots=4 max_slots=6
|
||||
Indicates that there are 4 process slots on host2. Further, Open MPI
|
||||
is limited to launching a maximum of 6 processes on host2.
|
||||
.
|
||||
.TP
|
||||
host3.example.com slots=2 max_slots=2
|
||||
Indicates that there are 2 process slots on host3 and that no
|
||||
oversubscription is allowed (similar to the \fI--nooversubscribe\fR
|
||||
option).
|
||||
.
|
||||
.TP
|
||||
host4.example.com max_slots=2
|
||||
Shorthand; same as listing "slots=2 max_slots=2".
|
||||
.
|
||||
.
|
||||
.PP
|
||||
Note that Open MPI's support for resource managers does not currently
|
||||
set the "max_slots" values for hosts. If you wish to prevent
|
||||
oversubscription in such scenarios, use the \fI--nooversubscribe\fR
|
||||
option.
|
||||
.
|
||||
.PP
|
||||
In scenarios where the user wishes to launch an application across
|
||||
all available slots by not providing a "-n" option on the mpirun
|
||||
command line, Open MPI will launch a process on each process slot
|
||||
for each host within the provided environment. For example, if a
|
||||
hostfile has been provided, then Open MPI will spawn processes
|
||||
on each identified host up to the "slots=x" limit if oversubscription
|
||||
is not allowed. If oversubscription is allowed (the default), then
|
||||
Open MPI will spawn processes on each host up to the "max_slots=y" limit
|
||||
if that value is provided. In all cases, the "-bynode" and "-byslot"
|
||||
mapping directives will be enforced to ensure proper placement of
|
||||
process ranks.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Location Nomenclature
|
||||
.
|
||||
As described above, \fImpirun\fP can specify arbitrary locations in
|
||||
the current Open MPI universe. Locations can be specified either by
|
||||
CPU or by node.
|
||||
|
||||
.B Note:
|
||||
This nomenclature does not force Open MPI to bind processes to CPUs --
|
||||
specifying a location "by CPU" is really a convenience mechanism for
|
||||
SMPs that ultimately maps down to a specific node.
|
||||
.PP
|
||||
Specifying locations by node will launch one copy of an executable per
|
||||
specified node.
|
||||
Using the \fI--bynode\fP option tells Open MPI to use all available nodes.
|
||||
Using the \fI--byslot\fP option tells Open MPI to use all slots on an available
|
||||
node before allocating resources on the next available node.
|
||||
For example:
|
||||
.
|
||||
.TP 4
|
||||
mpirun --bynode -np 4 a.out
|
||||
Runs one copy of the executable
|
||||
.I a.out
|
||||
on all available nodes in the Open MPI universe. MPI_COMM_WORLD rank 0
|
||||
will be on node0, rank 1 will be on node1, etc., regardless of how many slots
|
||||
are available on each of the nodes.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun --byslot -np 4 a.out
|
||||
Runs one copy of the executable
|
||||
.I a.out
|
||||
on each slot on a given node before running the executable on other available
|
||||
nodes.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Specifying Hosts
|
||||
.
|
||||
Hosts can be specified in a number of ways, the most common of which is in a
|
||||
\&'hostfile' or 'machinefile'. If our hostfile contains the following information:
|
||||
.
|
||||
.
|
||||
|
||||
\fBshell$\fP cat my-hostfile
|
||||
node00 slots=2
|
||||
node01 slots=2
|
||||
node02 slots=2
|
||||
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun --hostfile my-hostfile -np 3 a.out
|
||||
This will run one copy of the executable
|
||||
.I a.out
|
||||
on hosts node00,node01, and node02.
|
||||
.
|
||||
.
|
||||
.PP
|
||||
Another method for specifying hosts is directly on the command line. Here you
|
||||
can include and exclude hosts from the set of hosts to run on. For example:
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 3 --host a a.out
|
||||
Runs three copies of the executable
|
||||
.I a.out
|
||||
on host a.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 3 --host a,b,c a.out
|
||||
Runs one copy of the executable
|
||||
.I a.out
|
||||
on hosts a, b, and c.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 3 --hostfile my-hostfile --host node00 a.out
|
||||
Runs three copies of the executable
|
||||
.I a.out
|
||||
on host node00.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 3 --hostfile my-hostfile --host node10 a.out
|
||||
This will prompt an error since node10 is not in my-hostfile; mpirun will
|
||||
abort.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
shell$ mpirun -np 1 --host a hostname : -np 2 --host b,c uptime
|
||||
Runs one copy of the executable
|
||||
.I hostname
|
||||
on host a. And runs one copy of the executable
|
||||
.I uptime
|
||||
on hosts b and c.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS No Local Launch
|
||||
.
|
||||
Using the \fB--nolocal\fR option to orterun tells the system to not
|
||||
launch any of the application processes on the same node that orterun
|
||||
is running. While orterun typically blocks and consumes few system
|
||||
resources, this option can be helpful for launching very large jobs
|
||||
where orterun may actually need to use noticeable amounts of memory
|
||||
and/or processing time. \fB--nolocal\fR allows orterun to run without
|
||||
sharing the local node with the launched applications, and likewise
|
||||
allows the launched applications to run unhindered by orterun's system
|
||||
usage.
|
||||
.PP
|
||||
Note that \fB--nolocal\fR will override any other specification to
|
||||
launch the application on the local node. It will disqualify the
|
||||
localhost from being capable of running any processes in the
|
||||
application.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
shell$ mpirun -np 1 --host localhost --nolocal hostname
|
||||
This example will result in an error because orterun will not find
|
||||
anywhere to launch the application.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS No Oversubscription
|
||||
.
|
||||
Using the \fI--nooversubscribe\fR option causes Open MPI to implicitly
|
||||
set the "max_slots" value to be the same as the "slots" value for each
|
||||
node. This can be especially helpful when running jobs under a
|
||||
resource manager because Open MPI currently only sets the "slots"
|
||||
value for each node that it obtains from the resource manager.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Application Context or Executable Program?
|
||||
.
|
||||
To distinguish the two different forms, \fImpirun\fP
|
||||
looks on the command line for \fI--app\fP option. If
|
||||
it is specified, then the file named on the command line is
|
||||
assumed to be an application context. If it is not
|
||||
specified, then the file is assumed to be an executable program.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Locating Files
|
||||
.
|
||||
If \fIno\fP relative or absolute path is specified for a file, Open MPI
|
||||
will look for files by searching the directories in the user's PATH environment
|
||||
variable as defined on the source node(s).
|
||||
.PP
|
||||
If a relative directory is specified, it must be relative to the initial
|
||||
working directory determined by the specific starter used. For example when
|
||||
using the rsh or ssh starters, the initial directory is $HOME by default. Other
|
||||
starters may set the initial directory to the current working directory from
|
||||
the invocation of \fImpirun\fP.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Current Working Directory
|
||||
.
|
||||
The \fI\-wd\fP mpirun option allows the user to change to an arbitrary
|
||||
directory before their program is invoked. It can also be used in application
|
||||
context files to specify working directories on specific nodes and/or
|
||||
for specific applications.
|
||||
.PP
|
||||
If the \fI\-wd\fP option appears both in a context file and on the command line,
|
||||
the context file directory will override the command line value.
|
||||
.PP
|
||||
If the \fI-wd\fP option is specified, Open MPI will attempt to change to the
|
||||
specified directory on all of the remote nodes. If this fails, \fImpirun\fP
|
||||
will abort.
|
||||
.PP
|
||||
If the \fI-wd\fP option is \fBnot\fP specified, Open MPI will send the
|
||||
directory name where \fImpirun\fP was invoked to each of the remote nodes. The
|
||||
remote nodes will try to change to that directory. If they are unable (e.g., if
|
||||
the directory does not exist on that node), then Open MPI will use the default
|
||||
directory determined by the starter.
|
||||
.PP
|
||||
All directory changing occurs before the user's program is invoked; it
|
||||
does not wait until \fIMPI_INIT\fP is called.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Standard I/O
|
||||
.
|
||||
Open MPI directs UNIX standard input to /dev/null on all processes
|
||||
except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0 process
|
||||
inherits standard input from \fImpirun\fP.
|
||||
.B Note:
|
||||
The node that invoked \fImpirun\fP need not be the same as the node where the
|
||||
MPI_COMM_WORLD rank 0 process resides. Open MPI handles the redirection of
|
||||
\fImpirun\fP's standard input to the rank 0 process.
|
||||
.PP
|
||||
Open MPI directs UNIX standard output and error from remote nodes to the node
|
||||
that invoked \fImpirun\fP and prints it on the standard output/error of
|
||||
\fImpirun\fP.
|
||||
Local processes inherit the standard output/error of \fImpirun\fP and transfer
|
||||
to it directly.
|
||||
.PP
|
||||
Thus it is possible to redirect standard I/O for Open MPI applications by
|
||||
using the typical shell redirection procedure on \fImpirun\fP.
|
||||
|
||||
\fBshell$\fP mpirun -np 2 my_app < my_input > my_output
|
||||
|
||||
Note that in this example \fIonly\fP the MPI_COMM_WORLD rank 0 process will
|
||||
receive the stream from \fImy_input\fP on stdin. The stdin on all the other
|
||||
nodes will be tied to /dev/null. However, the stdout from all nodes will
|
||||
be collected into the \fImy_output\fP file.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Signal Propagation
|
||||
.
|
||||
When orterun receives a SIGTERM or SIGINT, it will attempt to kill
|
||||
the entire job by sending all processes in the job a SIGTERM, waiting
|
||||
a small number of seconds, then sending all processes in the job a
|
||||
SIGKILL.
|
||||
.
|
||||
SIGUSR1 and SIGUSR2 signals received by orterun are propagated to
|
||||
all processes in the job. Other signals are not currently propagated
|
||||
by orterun.
|
||||
.
|
||||
.
|
||||
.SS Process Termination / Signal Handling
|
||||
.
|
||||
During the run of an MPI application, if any rank dies abnormally
|
||||
(either exiting before invoking \fIMPI_FINALIZE\fP, or dying as the result of a
|
||||
signal), \fImpirun\fP will print out an error message and kill the rest of the
|
||||
MPI application.
|
||||
.PP
|
||||
User signal handlers should probably avoid trying to cleanup MPI state
|
||||
(Open MPI is, currently, neither thread-safe nor async-signal-safe).
|
||||
For example, if a segmentation fault occurs in \fIMPI_SEND\fP (perhaps because
|
||||
a bad buffer was passed in) and a user signal handler is invoked, if this user
|
||||
handler attempts to invoke \fIMPI_FINALIZE\fP, Bad Things could happen since
|
||||
Open MPI was already "in" MPI when the error occurred. Since \fImpirun\fP
|
||||
will notice that the process died due to a signal, it is probably not
|
||||
necessary to clean up MPI state (and it is safest for the user to clean up only non-MPI state).
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Process Environment
|
||||
.
|
||||
Processes in the MPI application inherit their environment from the
|
||||
Open RTE daemon upon the node on which they are running. The
|
||||
environment is typically inherited from the user's shell. On remote
|
||||
nodes, the exact environment is determined by the boot MCA module
|
||||
used. The \fIrsh\fR launch module, for example, uses either
|
||||
\fIrsh\fR/\fIssh\fR to launch the Open RTE daemon on remote nodes, and
|
||||
typically executes one or more of the user's shell-setup files before
|
||||
launching the Open RTE daemon. When running dynamically linked
|
||||
applications which require the \fILD_LIBRARY_PATH\fR environment
|
||||
variable to be set, care must be taken to ensure that it is correctly
|
||||
set when booting Open MPI.
|
||||
.PP
|
||||
See the "Remote Execution" section for more details.
|
||||
.
|
||||
.
|
||||
.SS Remote Execution
|
||||
.
|
||||
Open MPI requires that the \fIPATH\fR environment variable be set to
|
||||
find executables on remote nodes (this is typically only necessary in
|
||||
\fIrsh\fR- or \fIssh\fR-based environments -- batch/scheduled
|
||||
environments typically copy the current environment to the execution
|
||||
of remote jobs, so if the current environment has \fIPATH\fR and/or
|
||||
\fILD_LIBRARY_PATH\fR set properly, the remote nodes will also have it
|
||||
set properly). If Open MPI was compiled with shared library support,
|
||||
it may also be necessary to have the \fILD_LIBRARY_PATH\fR environment
|
||||
variable set on remote nodes as well (especially to find the shared
|
||||
libraries required to run user MPI applications).
|
||||
.PP
|
||||
However, it is not always desirable or possible to edit shell
|
||||
startup files to set \fIPATH\fR and/or \fILD_LIBRARY_PATH\fR. The
|
||||
\fI--prefix\fR option is provided for some simple configurations where
|
||||
this is not possible.
|
||||
.PP
|
||||
The \fI--prefix\fR option takes a single argument: the base directory
|
||||
on the remote node where Open MPI is installed. Open MPI will use
|
||||
this directory to set the remote \fIPATH\fR and \fILD_LIBRARY_PATH\fR
|
||||
before executing any Open MPI or user applications. This allows
|
||||
running Open MPI jobs without having pre-configured the \fIPATH\fR and
|
||||
\fILD_LIBRARY_PATH\fR on the remote nodes.
|
||||
.PP
|
||||
Open MPI adds the basename of the current
|
||||
node's "bindir" (the directory where Open MPI's executables are
|
||||
installed) to the prefix and uses that to set the \fIPATH\fR on the
|
||||
remote node. Similarly, Open MPI adds the basename of the current
|
||||
node's "libdir" (the directory where Open MPI's libraries are
|
||||
installed) to the prefix and uses that to set the
|
||||
\fILD_LIBRARY_PATH\fR on the remote node. For example:
|
||||
.TP 15
|
||||
Local bindir:
|
||||
/local/node/directory/bin
|
||||
.TP
|
||||
Local libdir:
|
||||
/local/node/directory/lib64
|
||||
.PP
|
||||
If the following command line is used:
|
||||
|
||||
\fBshell$\fP mpirun --prefix /remote/node/directory
|
||||
|
||||
Open MPI will add "/remote/node/directory/bin" to the \fIPATH\fR
|
||||
and "/remote/node/directory/lib64" to the \fILD_LIBRARY_PATH\fR on the
|
||||
remote node before attempting to execute anything.
|
||||
.PP
|
||||
Note that \fI--prefix\fR can be set on a per-context basis, allowing
|
||||
for different values for different nodes.
|
||||
.PP
|
||||
The \fI--prefix\fR option is not sufficient if the installation paths
|
||||
on the remote node are different than the local node (e.g., if "/lib"
|
||||
is used on the local node, but "/lib64" is used on the remote node),
|
||||
or if the installation paths are something other than a subdirectory
|
||||
under a common prefix.
|
||||
.PP
|
||||
Note that executing \fImpirun\fR via an absolute pathname is
|
||||
equivalent to specifying \fI--prefix\fR without the last subdirectory
|
||||
in the absolute pathname to \fImpirun\fR. For example:
|
||||
|
||||
\fBshell$\fP /usr/local/bin/mpirun ...
|
||||
|
||||
is equivalent to
|
||||
|
||||
\fBshell$\fP mpirun --prefix /usr/local
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Exported Environment Variables
|
||||
.
|
||||
All environment variables that are named in the form OMPI_* will automatically
|
||||
be exported to new processes on the local and remote nodes.
|
||||
The \fI\-x\fP option to \fImpirun\fP can be used to export specific environment
|
||||
variables to the new processes. While the syntax of the \fI\-x\fP
|
||||
option allows the definition of new variables, note that the parser
|
||||
for this option is currently not very sophisticated - it does not even
|
||||
understand quoted values. Users are advised to set variables in the
|
||||
environment and use \fI\-x\fP to export them; not to define them.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS MCA (Modular Component Architecture)
|
||||
.
|
||||
The \fI-mca\fP switch allows the passing of parameters to various MCA modules.
|
||||
.\" Open MPI's MCA modules are described in detail in ompimca(7).
|
||||
MCA modules have direct impact on MPI programs because they allow tunable
|
||||
parameters to be set at run time (such as which BTL communication device driver
|
||||
to use, what parameters to pass to that BTL, etc.).
|
||||
.PP
|
||||
The \fI-mca\fP switch takes two arguments: \fI<key>\fP and \fI<value>\fP.
|
||||
The \fI<key>\fP argument generally specifies which MCA module will receive the value.
|
||||
For example, the \fI<key>\fP "btl" is used to select which BTL to be used for
|
||||
transporting MPI messages. The \fI<value>\fP argument is the value that is
|
||||
passed.
|
||||
For example:
|
||||
.
|
||||
.TP 4
|
||||
mpirun -mca btl tcp,self -np 1 foo
|
||||
Tells Open MPI to use the "tcp" and "self" BTLs, and to run a single copy of
|
||||
"foo" on an allocated node.
|
||||
.
|
||||
.TP
|
||||
mpirun -mca btl self -np 1 foo
|
||||
Tells Open MPI to use the "self" BTL, and to run a single copy of "foo" on an
|
||||
allocated node.
|
||||
.\" And so on. Open MPI's BTL MCA modules are described in ompimca_btl(7).
|
||||
.PP
|
||||
The \fI-mca\fP switch can be used multiple times to specify different
|
||||
\fI<key>\fP and/or \fI<value>\fP arguments. If the same \fI<key>\fP is
|
||||
specified more than once, the \fI<value>\fPs are concatenated with a comma
|
||||
(",") separating them.
|
||||
.PP
|
||||
.B Note:
|
||||
The \fI-mca\fP switch is simply a shortcut for setting environment variables.
|
||||
The same effect may be accomplished by setting corresponding environment
|
||||
variables before running \fImpirun\fP.
|
||||
The form of the environment variables that Open MPI sets is:
|
||||
|
||||
OMPI_MCA_<key>=<value>
|
||||
.PP
|
||||
Note that the \fI-mca\fP switch overrides any previously set environment
|
||||
variables. Also note that unknown \fI<key>\fP arguments are still set as
|
||||
environment variables -- they are not checked (by \fImpirun\fP) for correctness.
|
||||
Illegal or incorrect \fI<value>\fP arguments may or may not be reported -- it
|
||||
depends on the specific MCA module.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Examples Section
|
||||
.\" **************************
|
||||
.SH EXAMPLES
|
||||
Be sure to also see the examples in the "Location Nomenclature" section, above.
|
||||
.
|
||||
.TP 4
|
||||
mpirun -np 1 prog1
|
||||
Load and execute prog1 on one node. Search the user's $PATH for the
|
||||
executable file on each node.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 8 --byslot prog1
|
||||
Run 8 copies of prog1 wherever Open MPI wants to run them.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
mpirun -np 4 -mca btl ib,tcp,self prog1
|
||||
Run 4 copies of prog1 using the "ib", "tcp", and "self" BTL's for the transport
|
||||
of MPI messages.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Diagnostics Section
|
||||
.\" **************************
|
||||
.
|
||||
.\" .SH DIAGNOSTICS
|
||||
.\".TP 4
|
||||
.\"Error Msg:
|
||||
.\"Description
|
||||
.
|
||||
.\" **************************
|
||||
.\" Return Value Section
|
||||
.\" **************************
|
||||
.
|
||||
.SH RETURN VALUE
|
||||
.
|
||||
\fImpirun\fP returns 0 if all ranks started by \fImpirun\fP exit after calling
|
||||
MPI_FINALIZE. A non-zero value is returned if an internal error occurred in
|
||||
mpirun, or one or more ranks exited before calling MPI_FINALIZE. If an
|
||||
internal error occurred in mpirun, the corresponding error code is returned.
|
||||
In the event that one or more ranks exit before calling MPI_FINALIZE, the
|
||||
return value of the rank of the process that \fImpirun\fP first notices died
|
||||
before calling MPI_FINALIZE will be returned. Note that, in general, this will
|
||||
be the first rank that died but is not guaranteed to be so.
|
||||
.PP
|
||||
However, note that if the \fI-nw\fP switch is used, the return value from
|
||||
mpirun does not indicate the exit status of the ranks.
|
||||
.
|
||||
.\" **************************
|
||||
.\" See Also Section
|
||||
.\" **************************
|
||||
.
|
||||
.\" .SH SEE ALSO
|
||||
.\" orted(1)
|
332
orte/tools/ortekill/ortekill.c
Обычный файл
332
orte/tools/ortekill/ortekill.c
Обычный файл
@ -0,0 +1,332 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_PARAM_H
|
||||
#include <sys/param.h>
|
||||
#endif
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif /* HAVE_SYS_TYPES_H */
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif /* HAVE_SYS_WAIT_H */
|
||||
#ifdef HAVE_LIBGEN_H
|
||||
#include <libgen.h>
|
||||
#endif
|
||||
|
||||
#include "opal/event/event.h"
|
||||
#include "opal/install_dirs.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/cmd_line.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/trace.h"
|
||||
#include "opal/version.h"
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "orte/class/orte_pointer_array.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/sys_info.h"
|
||||
#include "orte/util/universe_setup_file_io.h"
|
||||
#include "orte/util/pre_condition_transports.h"
|
||||
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/pls/pls.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
#include "orte/mca/schema/schema.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
/*
|
||||
* Globals
|
||||
*/
|
||||
static orte_jobid_t jobid = ORTE_JOBID_INVALID;
|
||||
static char *orterun_basename = NULL;
|
||||
|
||||
/*
|
||||
* setup globals for catching orterun command line options
|
||||
*/
|
||||
struct globals_t {
|
||||
bool help;
|
||||
bool version;
|
||||
bool verbose;
|
||||
bool quiet;
|
||||
bool exit;
|
||||
bool no_wait_for_job_completion;
|
||||
bool by_node;
|
||||
bool by_slot;
|
||||
bool per_node;
|
||||
bool no_oversubscribe;
|
||||
bool debugger;
|
||||
bool no_local_schedule;
|
||||
bool displaymapatlaunch;
|
||||
int num_procs;
|
||||
int exit_status;
|
||||
char *hostfile;
|
||||
char *env_val;
|
||||
char *appfile;
|
||||
char *wdir;
|
||||
char *path;
|
||||
opal_mutex_t lock;
|
||||
opal_condition_t cond;
|
||||
} orterun_globals;
|
||||
static bool globals_init = false;
|
||||
|
||||
|
||||
opal_cmd_line_init_t cmd_line_init[] = {
|
||||
/* Various "obvious" options */
|
||||
{ NULL, NULL, NULL, 'h', NULL, "help", 0,
|
||||
&orterun_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"This help message" },
|
||||
{ NULL, NULL, NULL, 'V', NULL, "version", 0,
|
||||
&orterun_globals.version, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Print version and exit" },
|
||||
{ NULL, NULL, NULL, 'v', NULL, "verbose", 0,
|
||||
&orterun_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Be verbose" },
|
||||
{ NULL, NULL, NULL, 'q', NULL, "quiet", 0,
|
||||
&orterun_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Suppress helpful messages" },
|
||||
|
||||
/* Use an appfile */
|
||||
{ NULL, NULL, NULL, '\0', NULL, "app", 1,
|
||||
&orterun_globals.appfile, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide an appfile; ignore all other command line options" },
|
||||
|
||||
/* Number of processes; -c, -n, --n, -np, and --np are all
|
||||
synonyms */
|
||||
{ NULL, NULL, NULL, 'c', "np", "np", 1,
|
||||
&orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Number of processes to run" },
|
||||
{ NULL, NULL, NULL, '\0', "n", "n", 1,
|
||||
&orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Number of processes to run" },
|
||||
|
||||
/* Set a hostfile */
|
||||
{ "rds", "hostfile", "path", '\0', "hostfile", "hostfile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a hostfile" },
|
||||
{ "rds", "hostfile", "path", '\0', "machinefile", "machinefile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a hostfile" },
|
||||
|
||||
/* Don't wait for the process to finish before exiting */
|
||||
{ NULL, NULL, NULL, '\0', "nw", "nw", 0,
|
||||
&orterun_globals.no_wait_for_job_completion, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Launch the processes and do not wait for their completion (i.e., let orterun complete as soon a successful launch occurs)" },
|
||||
|
||||
/* Export environment variables; potentially used multiple times,
|
||||
so it does not make sense to set into a variable */
|
||||
{ NULL, NULL, NULL, 'x', NULL, NULL, 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL,
|
||||
"Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" },
|
||||
|
||||
/* Specific mapping (C, cX, N, nX) */
|
||||
#if 0
|
||||
/* JJH --map is not currently implemented so don't advertise it until it is */
|
||||
{ NULL, NULL, NULL, '\0', NULL, "map", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Mapping of processes to nodes / CPUs" },
|
||||
#endif
|
||||
{ NULL, NULL, NULL, '\0', "bynode", "bynode", 0,
|
||||
&orterun_globals.by_node, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Whether to allocate/map processes round-robin by node" },
|
||||
{ NULL, NULL, NULL, '\0', "byslot", "byslot", 0,
|
||||
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Whether to allocate/map processes round-robin by slot (the default)" },
|
||||
{ NULL, NULL, NULL, '\0', "pernode", "pernode", 0,
|
||||
&orterun_globals.per_node, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"If no number of process is specified, this will cause one process per available node to be executed" },
|
||||
{ NULL, NULL, NULL, '\0', "nooversubscribe", "nooversubscribe", 0,
|
||||
&orterun_globals.no_oversubscribe, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Nodes are not to be oversubscribed, even if the system supports such operation"},
|
||||
{ NULL, NULL, NULL, '\0', "display-map-at-launch", "display-map-at-launch", 0,
|
||||
&orterun_globals.displaymapatlaunch, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display the process map just before launch"},
|
||||
|
||||
/* mpiexec-like arguments */
|
||||
{ NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
|
||||
&orterun_globals.wdir, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Set the working directory of the started processes" },
|
||||
{ NULL, NULL, NULL, '\0', "path", "path", 1,
|
||||
&orterun_globals.path, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"PATH to be used to look for executables to start processes" },
|
||||
/* These arguments can be specified multiple times */
|
||||
#if 0
|
||||
/* JMS: Removed because it's not really implemented */
|
||||
{ NULL, NULL, NULL, '\0', "arch", "arch", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Architecture to start processes on" },
|
||||
#endif
|
||||
{ NULL, NULL, NULL, 'H', "host", "host", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"List of hosts to invoke processes on" },
|
||||
|
||||
/* OSC mpiexec-like arguments */
|
||||
{ NULL, NULL, NULL, '\0', "nolocal", "nolocal", 0,
|
||||
&orterun_globals.no_local_schedule, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Do not run any MPI applications on the local node" },
|
||||
|
||||
/* User-level debugger arguments */
|
||||
{ NULL, NULL, NULL, '\0', "tv", "tv", 0,
|
||||
&orterun_globals.debugger, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Deprecated backwards compatibility flag; synonym for \"--debug\"" },
|
||||
{ NULL, NULL, NULL, '\0', "debug", "debug", 0,
|
||||
&orterun_globals.debugger, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter" },
|
||||
{ "orte", "base", "user_debugger", '\0', "debugger", "debugger", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Sequence of debuggers to search for when \"--debug\" is used" },
|
||||
|
||||
/* OpenRTE arguments */
|
||||
{ "orte", "debug", NULL, 'd', NULL, "debug-devel", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Enable debugging of OpenRTE" },
|
||||
|
||||
{ "orte", "debug", "daemons", '\0', NULL, "debug-daemons", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Enable debugging of any OpenRTE daemons used by this application" },
|
||||
|
||||
{ "orte", "debug", "daemons_file", '\0', NULL, "debug-daemons-file", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Enable debugging of any OpenRTE daemons used by this application, storing output in files" },
|
||||
|
||||
{ "orte", "no_daemonize", NULL, '\0', NULL, "no-daemonize", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Do not detach OpenRTE daemons used by this application" },
|
||||
|
||||
{ "universe", NULL, NULL, '\0', NULL, "universe", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Set the universe name as username@hostname:universe_name for this application" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "tmpdir", 1,
|
||||
&orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Set the root for the session directory tree for orterun ONLY" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "prefix", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Prefix where Open MPI is installed on remote nodes" },
|
||||
{ NULL, NULL, NULL, '\0', NULL, "noprefix", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Disable automatic --prefix behavior" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||
};
|
||||
|
||||
#if !defined(__WINDOWS__)
|
||||
extern char** environ;
|
||||
#endif /* !defined(__WINDOWS__) */
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int rc;
|
||||
int id, iparam;
|
||||
|
||||
/* Setup MCA params */
|
||||
|
||||
mca_base_param_init();
|
||||
orte_register_params(false);
|
||||
|
||||
/* find our basename (the name of the executable) so that we can
|
||||
use it in pretty-print error messages */
|
||||
orterun_basename = opal_basename(argv[0]);
|
||||
|
||||
/* Initialize our Open RTE environment */
|
||||
/* Set the flag telling orte_init that I am NOT a
|
||||
* singleton, but am "infrastructure" - prevents setting
|
||||
* up incorrect infrastructure that only a singleton would
|
||||
* require
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_init(true))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:init-failure", true,
|
||||
"orte_init()", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* check for daemon flags and push them into the environment
|
||||
* since this isn't being automatically done
|
||||
*/
|
||||
id = mca_base_param_reg_int_name("orte_debug", "daemons",
|
||||
"Whether to debug the ORTE daemons or not",
|
||||
false, false, (int)false, &iparam);
|
||||
if (iparam) {
|
||||
char *tmp = mca_base_param_environ_variable("orte", "debug", "daemons");
|
||||
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:environ", false,
|
||||
orterun_basename, tmp, "1", rc);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
free(tmp);
|
||||
}
|
||||
id = mca_base_param_reg_int_name("orte", "debug",
|
||||
"Top-level ORTE debug switch",
|
||||
false, false, 0, &iparam);
|
||||
if (iparam) {
|
||||
char *tmp = mca_base_param_environ_variable("orte", NULL, "debug");
|
||||
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:environ", false,
|
||||
orterun_basename, tmp, "1", rc);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
free(tmp);
|
||||
}
|
||||
id = mca_base_param_reg_int_name("orte_debug", "daemons_file",
|
||||
"Whether want stdout/stderr of daemons to go to a file or not",
|
||||
false, false, 0, &iparam);
|
||||
if (iparam) {
|
||||
char *tmp = mca_base_param_environ_variable("orte", "debug",
|
||||
"daemons_file");
|
||||
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:environ", false,
|
||||
orterun_basename, tmp, "1", rc);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
orte_finalize();
|
||||
free(orterun_basename);
|
||||
return rc;
|
||||
}
|
@ -113,7 +113,6 @@ struct globals_t {
|
||||
bool no_oversubscribe;
|
||||
bool debugger;
|
||||
bool no_local_schedule;
|
||||
bool displaymapatlaunch;
|
||||
bool reuse_daemons;
|
||||
int num_procs;
|
||||
int exit_status;
|
||||
@ -201,7 +200,7 @@ opal_cmd_line_init_t cmd_line_init[] = {
|
||||
&orterun_globals.no_oversubscribe, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Nodes are not to be oversubscribed, even if the system supports such operation"},
|
||||
{ NULL, NULL, NULL, '\0', "display-map-at-launch", "display-map-at-launch", 0,
|
||||
&orterun_globals.displaymapatlaunch, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display the process map just before launch"},
|
||||
|
||||
/* mpiexec-like arguments */
|
||||
@ -419,6 +418,20 @@ int orterun(int argc, char *argv[])
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
id = mca_base_param_reg_int_name("rmaps_base", "display_map",
|
||||
"Whether to display the process map after it is computed",
|
||||
false, false, (int)false, &iparam);
|
||||
if (iparam) {
|
||||
char *tmp = mca_base_param_environ_variable("rmaps", "base", "display_map");
|
||||
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:environ", false,
|
||||
orterun_basename, tmp, "1", rc);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
/* pre-condition any network transports that require it */
|
||||
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(apps, num_apps))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -432,14 +445,6 @@ int orterun(int argc, char *argv[])
|
||||
/* construct the list of attributes */
|
||||
OBJ_CONSTRUCT(&attributes, opal_list_t);
|
||||
|
||||
if (orterun_globals.displaymapatlaunch) {
|
||||
if (ORTE_SUCCESS != (rc = orte_rmgr.add_attribute(&attributes, ORTE_RMAPS_DISPLAY_AFTER_MAP,
|
||||
ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:attr-failed", false,
|
||||
orterun_basename, NULL, NULL, rc);
|
||||
}
|
||||
}
|
||||
|
||||
/** setup callbacks for abort signals */
|
||||
opal_signal_set(&term_handler, SIGTERM,
|
||||
abort_signal_callback, &term_handler);
|
||||
@ -858,7 +863,6 @@ static int init_globals(void)
|
||||
orterun_globals.no_oversubscribe = false;
|
||||
orterun_globals.debugger = false;
|
||||
orterun_globals.no_local_schedule = false;
|
||||
orterun_globals.displaymapatlaunch = false;
|
||||
orterun_globals.num_procs = 0;
|
||||
orterun_globals.exit_status = 0;
|
||||
if( NULL != orterun_globals.hostfile )
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user