Need to revise the display-map-at-launch option so it is active not only for the initial launch, but applies to any subsequent comm_spawn events too.
Add placeholders for the new orte tools. These don't actually do anything yet - in fact, I have set the .ompi_ignore so that you won't compile them (I have set a .ompi_unignore for me). Please let me know if you encounter any trouble with this - the ompi_ignore's should protect everyone. This commit was SVN r12616.
Этот коммит содержится в:
родитель
5ddcb8a652
Коммит
ca5b4358fa
@ -1125,7 +1125,10 @@ AC_CONFIG_FILES([
|
|||||||
orte/etc/Makefile
|
orte/etc/Makefile
|
||||||
|
|
||||||
orte/tools/console/Makefile
|
orte/tools/console/Makefile
|
||||||
|
orte/tools/orteboot/Makefile
|
||||||
orte/tools/orted/Makefile
|
orte/tools/orted/Makefile
|
||||||
|
orte/tools/ortehalt/Makefile
|
||||||
|
orte/tools/ortekill/Makefile
|
||||||
orte/tools/orteprobe/Makefile
|
orte/tools/orteprobe/Makefile
|
||||||
orte/tools/orterun/Makefile
|
orte/tools/orterun/Makefile
|
||||||
orte/tools/wrappers/Makefile
|
orte/tools/wrappers/Makefile
|
||||||
|
@ -94,6 +94,10 @@ int orte_odls_base_open(void)
|
|||||||
orte_odls_globals.output = -1;
|
orte_odls_globals.output = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mca_base_param_reg_int_name("odls_base", "sigkill_timeout",
|
||||||
|
"Time to wait for a process to die after issuing a kill signal to it",
|
||||||
|
false, false, 1, &orte_odls_globals.timeout_before_sigkill);
|
||||||
|
|
||||||
/* register the daemon cmd data type */
|
/* register the daemon cmd data type */
|
||||||
tmp = ORTE_DAEMON_CMD;
|
tmp = ORTE_DAEMON_CMD;
|
||||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_odls_pack_daemon_cmd,
|
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_odls_pack_daemon_cmd,
|
||||||
|
@ -52,6 +52,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
|
|||||||
opal_list_t working_attrs;
|
opal_list_t working_attrs;
|
||||||
opal_list_item_t *item;
|
opal_list_item_t *item;
|
||||||
orte_jobid_t *jptr, parent_job=ORTE_JOBID_INVALID;
|
orte_jobid_t *jptr, parent_job=ORTE_JOBID_INVALID;
|
||||||
|
orte_job_map_t *map;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
/* check the attributes to see if anything in the environment
|
/* check the attributes to see if anything in the environment
|
||||||
@ -200,6 +201,13 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* if we wanted to display the map, now is the time to do it */
|
||||||
|
if (NULL != orte_rmgr.find_attribute(attributes, ORTE_RMAPS_DISPLAY_AFTER_MAP)) {
|
||||||
|
orte_rmaps.get_job_map(&map, job);
|
||||||
|
orte_dss.dump(0, map, ORTE_JOB_MAP);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -416,10 +416,6 @@ static int orte_rmgr_proxy_spawn_job(
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
if (NULL != orte_rmgr.find_attribute(attributes, ORTE_RMAPS_DISPLAY_AFTER_MAP)) {
|
|
||||||
orte_rmaps.get_job_map(&map, *jobid);
|
|
||||||
orte_dss.dump(0, map, ORTE_JOB_MAP);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flags & ORTE_RMGR_SETUP_TRIGS) {
|
if (flags & ORTE_RMGR_SETUP_TRIGS) {
|
||||||
|
@ -372,10 +372,6 @@ static int orte_rmgr_urm_spawn_job(
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
if (NULL != orte_rmgr.find_attribute(attributes, ORTE_RMAPS_DISPLAY_AFTER_MAP)) {
|
|
||||||
orte_rmaps.get_job_map(&map, *jobid);
|
|
||||||
orte_dss.dump(0, map, ORTE_JOB_MAP);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flags & ORTE_RMGR_SETUP_TRIGS) {
|
if (flags & ORTE_RMGR_SETUP_TRIGS) {
|
||||||
|
@ -22,7 +22,10 @@
|
|||||||
|
|
||||||
SUBDIRS += \
|
SUBDIRS += \
|
||||||
tools/console \
|
tools/console \
|
||||||
|
tools/orteboot \
|
||||||
tools/orted \
|
tools/orted \
|
||||||
|
tools/ortehalt \
|
||||||
|
tools/ortekill \
|
||||||
tools/orteprobe \
|
tools/orteprobe \
|
||||||
tools/orterun \
|
tools/orterun \
|
||||||
tools/wrappers \
|
tools/wrappers \
|
||||||
@ -31,7 +34,10 @@ SUBDIRS += \
|
|||||||
|
|
||||||
DIST_SUBDIRS += \
|
DIST_SUBDIRS += \
|
||||||
tools/console \
|
tools/console \
|
||||||
|
tools/orteboot \
|
||||||
tools/orted \
|
tools/orted \
|
||||||
|
tools/ortehalt \
|
||||||
|
tools/ortekill \
|
||||||
tools/orteprobe \
|
tools/orteprobe \
|
||||||
tools/orterun \
|
tools/orterun \
|
||||||
tools/wrappers \
|
tools/wrappers \
|
||||||
|
0
orte/tools/orteboot/.ompi_ignore
Обычный файл
0
orte/tools/orteboot/.ompi_ignore
Обычный файл
1
orte/tools/orteboot/.ompi_unignore
Обычный файл
1
orte/tools/orteboot/.ompi_unignore
Обычный файл
@ -0,0 +1 @@
|
|||||||
|
rhc
|
39
orte/tools/orteboot/Makefile.am
Обычный файл
39
orte/tools/orteboot/Makefile.am
Обычный файл
@ -0,0 +1,39 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
libs = \
|
||||||
|
$(top_builddir)/orte/liborte.la
|
||||||
|
|
||||||
|
orteboot_SOURCES = \
|
||||||
|
orteboot.c
|
||||||
|
|
||||||
|
orteboot_LDADD = $(libs)
|
||||||
|
orteboot_DEPENDENCIES = $(libs)
|
||||||
|
|
||||||
|
if OMPI_INSTALL_BINARIES
|
||||||
|
|
||||||
|
bin_PROGRAMS = orteboot
|
||||||
|
|
||||||
|
dist_pkgdata_DATA = help-orteboot.txt
|
||||||
|
|
||||||
|
# AM 1.9.6 seems to have a bug in its dependencies for install-man if
|
||||||
|
#dist_ and nodist_ are used, so explicitly add to EXTRA_DIST...
|
||||||
|
man_MANS = orteboot.1
|
||||||
|
EXTRA_DIST = orteboot.1
|
||||||
|
|
||||||
|
endif
|
130
orte/tools/orteboot/help-orteboot.txt
Обычный файл
130
orte/tools/orteboot/help-orteboot.txt
Обычный файл
@ -0,0 +1,130 @@
|
|||||||
|
# -*- text -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
# This is the US/English general help file for Open RTE's orterun.
|
||||||
|
#
|
||||||
|
[orterun:init-failure]
|
||||||
|
Open RTE was unable to initialize properly. The error occurred while
|
||||||
|
attempting to %s. Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
[orterun:usage]
|
||||||
|
%s (%s) %s
|
||||||
|
|
||||||
|
Usage: %s [OPTION]... [PROGRAM]...
|
||||||
|
Start the given program using Open RTE
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
Report bugs to %s
|
||||||
|
[orterun:version]
|
||||||
|
%s (%s) %s
|
||||||
|
|
||||||
|
Report bugs to %s
|
||||||
|
[orterun:allocate-resources]
|
||||||
|
%s was unable to allocate enough resources to start your application.
|
||||||
|
This might be a transient error (too many nodes in the cluster were
|
||||||
|
unavailable at the time of the request) or a permanent error (you
|
||||||
|
requested more nodes than exist in your cluster).
|
||||||
|
|
||||||
|
While probably only useful to Open RTE developers, the error returned
|
||||||
|
was %d.
|
||||||
|
[orterun:error-spawning]
|
||||||
|
%s was unable to start the specified application. An attempt has been
|
||||||
|
made to clean up all processes that did start. The error returned was
|
||||||
|
%d.
|
||||||
|
[orterun:appfile-not-found]
|
||||||
|
Unable to open the appfile:
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
Double check that this file exists and is readable.
|
||||||
|
[orterun:executable-not-specified]
|
||||||
|
No executable was specified on the %s command line.
|
||||||
|
|
||||||
|
Aborting.
|
||||||
|
[orterun:multi-apps-and-zero-np]
|
||||||
|
%s found multiple applications specified on the command line, with
|
||||||
|
at least one that failed to specify the number of processes to execute.
|
||||||
|
When specifying multiple applications, you must specify how many processes
|
||||||
|
of each to launch via the -np argument.
|
||||||
|
[orterun:nothing-to-do]
|
||||||
|
%s could not find anything to do.
|
||||||
|
|
||||||
|
It is possible that you forgot to specify how many processes to run
|
||||||
|
via the "-np" argument.
|
||||||
|
[orterun:call-failed]
|
||||||
|
%s encountered a %s call failure. This should not happen, and
|
||||||
|
usually indicates an error within the operating system itself.
|
||||||
|
Specifically, the following error occurred:
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
The only other available information that may be helpful is the errno
|
||||||
|
that was returned: %d.
|
||||||
|
[orterun:environ]
|
||||||
|
%s was unable to set
|
||||||
|
%s = %s
|
||||||
|
in the environment. Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
[orterun:precondition]
|
||||||
|
%s was unable to precondition transports
|
||||||
|
Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
[orterun:attr-failed]
|
||||||
|
%s was unable to define an attribute
|
||||||
|
Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
[orterun:proc-aborted]
|
||||||
|
%s noticed that job rank %lu with PID %lu on node %s exited on signal %d.
|
||||||
|
[orterun:abnormal-exit]
|
||||||
|
WARNING: %s encountered an abnormal exit.
|
||||||
|
|
||||||
|
This means that %s exited before it received notification that all
|
||||||
|
started processes had terminated. You should double check and ensure
|
||||||
|
that there are no runaway processes still executing.
|
||||||
|
[orterun:empty-prefix]
|
||||||
|
A prefix was supplied to %s that only contained slashes.
|
||||||
|
|
||||||
|
This is a fatal error; %s will now abort. No processes were launched.
|
||||||
|
#
|
||||||
|
[debugger-mca-param-not-found]
|
||||||
|
Internal error -- the orte_base_debugger MCA parameter was not able to
|
||||||
|
be found. Please contact the Open RTE developers; this should not
|
||||||
|
happen.
|
||||||
|
#
|
||||||
|
[debugger-orte_base_user_debugger-empty]
|
||||||
|
The MCA parameter "orte_base_user_debugger" was empty, indicating that
|
||||||
|
no user-level debuggers have been defined. Please set this MCA
|
||||||
|
parameter to a value and try again.
|
||||||
|
#
|
||||||
|
[debugger-not-found]
|
||||||
|
A suitable debugger could not be found in your PATH. Check the values
|
||||||
|
specified in the orte_base_user_debugger MCA parameter for the list of
|
||||||
|
debuggers that was searched.
|
||||||
|
#
|
||||||
|
[debugger-exec-failed]
|
||||||
|
%s was unable to launch the specified debugger. This is what was
|
||||||
|
launched:
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
Things to check:
|
||||||
|
|
||||||
|
- Ensure that the debugger is installed properly
|
||||||
|
- Ensure that the "%s" executable is in your path
|
||||||
|
- Ensure that any required licenses are available to run the debugger
|
||||||
|
#
|
||||||
|
[orterun:daemon-die]
|
||||||
|
%s was unable to cleanly terminate the daemons for this job. Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
|
851
orte/tools/orteboot/orteboot.1
Обычный файл
851
orte/tools/orteboot/orteboot.1
Обычный файл
@ -0,0 +1,851 @@
|
|||||||
|
.\"
|
||||||
|
.\" Man page for ORTE's orterun command
|
||||||
|
.\"
|
||||||
|
.\" .TH name section center-footer left-footer center-header
|
||||||
|
.TH MPIRUN 1 "March 2006" "Open MPI" "OPEN MPI COMMANDS"
|
||||||
|
.\" **************************
|
||||||
|
.\" Name Section
|
||||||
|
.\" **************************
|
||||||
|
.SH NAME
|
||||||
|
.
|
||||||
|
orterun, mpirun, mpiexec \- Execute serial and parallel jobs in Open MPI.
|
||||||
|
|
||||||
|
.B Note:
|
||||||
|
\fImpirun\fP, \fImpiexec\fP, and \fIorterun\fP are all exact synonyms for each
|
||||||
|
other. Using any of the names will result in exactly identical behavior.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Synopsis Section
|
||||||
|
.\" **************************
|
||||||
|
.SH SYNOPSIS
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
Single Process Multiple Data (SPMD) Model:
|
||||||
|
|
||||||
|
.B mpirun
|
||||||
|
.R [ options ]
|
||||||
|
.B <program>
|
||||||
|
.R [ <args> ]
|
||||||
|
.
|
||||||
|
|
||||||
|
Multiple Instruction Multiple Data (MIMD) Model:
|
||||||
|
|
||||||
|
.B mpirun
|
||||||
|
.R [ global_options ]
|
||||||
|
[ local_options1 ]
|
||||||
|
.B <program1>
|
||||||
|
.R [ <args1> ] :
|
||||||
|
[ local_options2 ]
|
||||||
|
.B <program2>
|
||||||
|
.R [ <args2> ] :
|
||||||
|
... :
|
||||||
|
[ local_optionsN ]
|
||||||
|
.B <programN>
|
||||||
|
.R [ <argsN> ]
|
||||||
|
.P
|
||||||
|
|
||||||
|
Note that in both models, invoking \fImpirun\fR via an absolute path
|
||||||
|
name is equivalent to specifying the \fI--prefix\fR option with a
|
||||||
|
\fI<dir>\fR value equivalent to the directory where \fImpirun\fR
|
||||||
|
resides, minus its last subdirectory. For example:
|
||||||
|
|
||||||
|
\fBshell$\fP /usr/local/bin/mpirun ...
|
||||||
|
|
||||||
|
is equivalent to
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun --prefix /usr/local
|
||||||
|
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Quick Summary Section
|
||||||
|
.\" **************************
|
||||||
|
.SH QUICK SUMMARY
|
||||||
|
.
|
||||||
|
If you are simply looking for how to run an MPI application, you
|
||||||
|
probably want to use a command line of the following form:
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun [ -np X ] [ --hostfile <filename> ] <program>
|
||||||
|
|
||||||
|
This will run X copies of \fI<program>\fR in your current run-time
|
||||||
|
environment (if running under a supported resource manager, Open MPI's
|
||||||
|
\fImpirun\fR will usually automatically use the corresponding resource manager
|
||||||
|
process starter, as opposed to, for example, \fIrsh\fR or \fIssh\fR,
|
||||||
|
which require the use of a hostfile, or will default to running all X
|
||||||
|
copies on the localhost), scheduling (by default) in a round-robin fashion by
|
||||||
|
CPU slot. See the rest of this page for more details.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Options Section
|
||||||
|
.\" **************************
|
||||||
|
.SH OPTIONS
|
||||||
|
.
|
||||||
|
.I mpirun
|
||||||
|
will send the name of the directory where it was invoked on the local
|
||||||
|
node to each of the remote nodes, and attempt to change to that
|
||||||
|
directory. See the "Current Working Directory" section below for further
|
||||||
|
details.
|
||||||
|
.\"
|
||||||
|
.\" Start options listing
|
||||||
|
.\" Indent 10 chacters from start of first column to start of second column
|
||||||
|
.TP 10
|
||||||
|
.B <args>
|
||||||
|
Pass these run-time arguments to every new process. These must always
|
||||||
|
be the last arguments to \fImpirun\fP. If an app context file is used,
|
||||||
|
\fI<args>\fP will be ignored.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B <program>
|
||||||
|
The program executable. This is identified as the first non-recognized argument
|
||||||
|
to mpirun.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -aborted\fR,\fP --aborted \fR<#>\fP
|
||||||
|
Set the maximum number of aborted processes to display.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --app \fR<appfile>\fP
|
||||||
|
Provide an appfile, ignoring all other command line options.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -bynode\fR,\fP --bynode
|
||||||
|
Allocate (map) the processes by node in a round-robin scheme.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -byslot\fR,\fP --byslot
|
||||||
|
Allocate (map) the processes by slot in a round-robin scheme. This is the
|
||||||
|
default.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -c \fR<#>\fP
|
||||||
|
Synonym for \fI-np\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -debug\fR,\fP --debug
|
||||||
|
Invoke the user-level debugger indicated by the \fIorte_base_user_debugger\fP
|
||||||
|
MCA parameter.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -debugger\fR,\fP --debugger
|
||||||
|
Sequence of debuggers to search for when \fI--debug\fP is used (i.e.
|
||||||
|
a synonym for \fIorte_base_user_debugger\fP MCA parameter).
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -gmca\fR,\fP --gmca \fR<key> <value>\fP
|
||||||
|
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
|
||||||
|
the parameter name; \fI<value>\fP is the parameter value.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -h\fR,\fP --help
|
||||||
|
Display help for this command
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -H \fR<host1,host2,...,hostN>\fP
|
||||||
|
Synonym for \fI-host\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -host\fR,\fP --host \fR<host1,host2,...,hostN>\fP
|
||||||
|
List of hosts on which to invoke processes.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -hostfile\fR,\fP --hostfile \fR<hostfile>\fP
|
||||||
|
Provide a hostfile to use.
|
||||||
|
.\" JJH - Should have man page for how to format a hostfile properly.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -machinefile\fR,\fP --machinefile \fR<machinefile>\fP
|
||||||
|
Synonym for \fI-hostfile\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -mca\fR,\fP --mca <key> <value>
|
||||||
|
Send arguments to various MCA modules. See the "MCA" section, below.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -n\fR,\fP --n \fR<#>\fP
|
||||||
|
Synonym for \fI-np\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -nolocal\fR,\fP --nolocal
|
||||||
|
Do not run any copies of the launched application on the same node as
|
||||||
|
orterun is running. This option will override listing the localhost
|
||||||
|
with \fB--host\fR or any other host-specifying mechanism.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -nooversubscribe\fR,\fP --nooversubscribe
|
||||||
|
Do not oversubscribe any nodes; error (without starting any processes)
|
||||||
|
if the requested number of processes would cause oversubscription.
|
||||||
|
This option implicitly sets "max_slots" equal to the "slots" value for
|
||||||
|
each node.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -np \fR<#>\fP
|
||||||
|
Run this many copies of the program on the given nodes. This option
|
||||||
|
indicates that the specified file is an executable program and not an
|
||||||
|
application context. If no value is provided for the number of copies to
|
||||||
|
execute (i.e., neither the "-np" nor its synonyms are provided on the command
|
||||||
|
line), Open MPI will automatically execute a copy of the program on
|
||||||
|
each process slot (see below for description of a "process slot"). This
|
||||||
|
feature, however, can only be used in the SPMD model and will return an
|
||||||
|
error (without beginning execution of the application) otherwise.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -nw\fR,\fP --nw
|
||||||
|
Launch the processes and do not wait for their completion. mpirun will
|
||||||
|
complete as soon as successful launch occurs.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -path\fR,\fP --path \fR<path>\fP
|
||||||
|
<path> that will be used when attempting to locate requested executables.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --prefix \fR<dir>\fP
|
||||||
|
Prefix directory that will be used to set the \fIPATH\fR and
|
||||||
|
\fILD_LIBRARY_PATH\fR on the remote node before invoking Open MPI or
|
||||||
|
the target process. See the "Remote Execution" section, below.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -q\fR,\fP --quiet
|
||||||
|
Suppress informative messages from orterun during application execution.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --tmpdir \fR<dir>\fP
|
||||||
|
Set the root for the session directory tree for mpirun only.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -tv\fR,\fP --tv
|
||||||
|
Launch processes under the TotalView debugger.
|
||||||
|
Deprecated backwards compatibility flag. Synonym for \fI--debug\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --universe \fR<username@hostname:universe_name>\fP
|
||||||
|
For this application, set the universe name as:
|
||||||
|
username@hostname:universe_name
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -v\fR,\fP --verbose
|
||||||
|
Be verbose
|
||||||
|
.TP
|
||||||
|
.B -V\fR,\fP --version
|
||||||
|
Print version number. If no other arguments are given, this will also
|
||||||
|
cause orterun to exit.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -wd \fR<dir>\fP
|
||||||
|
Change to the directory <dir> before the user's program executes.
|
||||||
|
See the "Current Working Directory" section for notes on relative paths.
|
||||||
|
.B Note:
|
||||||
|
If the \fI-wd\fP option appears both on the command line and in an
|
||||||
|
application context, the context will take precedence over the command line.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -x \fR<env>\fP
|
||||||
|
Export the specified environment variables to the remote nodes before
|
||||||
|
executing the program. Existing environment variables can be
|
||||||
|
specified (see the Examples section, below), or new variable names
|
||||||
|
specified with corresponding values. The parser for the \fI-x\fP
|
||||||
|
option is not very sophisticated; it does not even understand quoted
|
||||||
|
values. Users are advised to set variables in the environment, and
|
||||||
|
then use \fI-x\fP to export (not define) them.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.P
|
||||||
|
The following options are useful for developers; they are not generally
|
||||||
|
useful to most ORTE and/or MPI users:
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -d\fR,\fP --debug-devel
|
||||||
|
Enable debugging of the OpenRTE (the run-time layer in Open MPI).
|
||||||
|
This is not generally useful for most users.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --debug-daemons
|
||||||
|
Enable debugging of any OpenRTE daemons used by this application.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --debug-daemons-file
|
||||||
|
Enable debugging of any OpenRTE daemons used by this application, storing
|
||||||
|
output in files.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --no-daemonize
|
||||||
|
Do not detach OpenRTE daemons used by this application.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Description Section
|
||||||
|
.\" **************************
|
||||||
|
.SH DESCRIPTION
|
||||||
|
.
|
||||||
|
One invocation of \fImpirun\fP starts an MPI application running under Open
|
||||||
|
MPI. If the application is single process multiple data (SPMD), the application
|
||||||
|
can be specified on the \fImpirun\fP command line.
|
||||||
|
|
||||||
|
If the application is multiple instruction multiple data (MIMD), comprising
|
||||||
|
multiple programs, the set of programs and arguments can be specified in one of
|
||||||
|
two ways: Extended Command Line Arguments, and Application Context.
|
||||||
|
.PP
|
||||||
|
An application context describes the MIMD program set including all arguments
|
||||||
|
in a separate file.
|
||||||
|
.\"See appcontext(5) for a description of the application context syntax.
|
||||||
|
This file essentially contains multiple \fImpirun\fP command lines, less the
|
||||||
|
command name itself. The ability to specify different options for different
|
||||||
|
instantiations of a program is another reason to use an application context.
|
||||||
|
.PP
|
||||||
|
Extended command line arguments allow for the description of the application
|
||||||
|
layout on the command line using colons (\fI:\fP) to separate the specification
|
||||||
|
of programs and arguments. Some options are globally set across all specified
|
||||||
|
programs (e.g. --hostfile), while others are specific to a single program
|
||||||
|
(e.g. -np).
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Process Slots
|
||||||
|
.
|
||||||
|
Open MPI uses "slots" to represent a potential location for a process.
|
||||||
|
Hence, a node with 2 slots means that 2 processes can be launched on
|
||||||
|
that node. For performance, the community typically equates a "slot"
|
||||||
|
with a physical CPU, thus ensuring that any process assigned to that
|
||||||
|
slot has a dedicated processor. This is not, however, a requirement for
|
||||||
|
the operation of Open MPI.
|
||||||
|
.PP
|
||||||
|
Slots can be specified in hostfiles after the hostname. For example:
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
host1.example.com slots=4
|
||||||
|
Indicates that there are 4 process slots on host1.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
If no slots value is specified, then Open MPI will automatically assign
|
||||||
|
a default value of "slots=1" to that host.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
When running under resource managers (e.g., SLURM, Torque, etc.), Open
|
||||||
|
MPI will obtain both the hostnames and the number of slots directly
|
||||||
|
from the resource manager. For example, if running under a SLURM job,
|
||||||
|
Open MPI will automatically receive the hosts that SLURM has allocated
|
||||||
|
to the job as well as how many slots on each node that SLURM says
|
||||||
|
are usable - in most high-performance environments, the slots will
|
||||||
|
equate to the number of processors on the node.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
When deciding where to launch processes, Open MPI will first fill up
|
||||||
|
all available slots before oversubscribing (see "Location
|
||||||
|
Nomenclature", below, for more details on the scheduling algorithms
|
||||||
|
available). Unless told otherwise, Open MPI will arbitrarily
|
||||||
|
oversubscribe nodes. For example, if the only node available is the
|
||||||
|
localhost, Open MPI will run as many processes as specified by the
|
||||||
|
-n (or one of its variants) command line option on the
|
||||||
|
localhost (although they may run quite slowly, since they'll all be
|
||||||
|
competing for CPU and other resources).
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
Limits can be placed on oversubscription with the "max_slots"
|
||||||
|
attribute in the hostfile. For example:
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
host2.example.com slots=4 max_slots=6
|
||||||
|
Indicates that there are 4 process slots on host2. Further, Open MPI
|
||||||
|
is limited to launching a maximum of 6 processes on host2.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
host3.example.com slots=2 max_slots=2
|
||||||
|
Indicates that there are 2 process slots on host3 and that no
|
||||||
|
oversubscription is allowed (similar to the \fI--nooversubscribe\fR
|
||||||
|
option).
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
host4.example.com max_slots=2
|
||||||
|
Shorthand; same as listing "slots=2 max_slots=2".
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
Note that Open MPI's support for resource managers does not currently
|
||||||
|
set the "max_slots" values for hosts. If you wish to prevent
|
||||||
|
oversubscription in such scenarios, use the \fI--nooversubscribe\fR
|
||||||
|
option.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
In scenarios where the user wishes to launch an application across
|
||||||
|
all available slots by not providing a "-n" option on the mpirun
|
||||||
|
command line, Open MPI will launch a process on each process slot
|
||||||
|
for each host within the provided environment. For example, if a
|
||||||
|
hostfile has been provided, then Open MPI will spawn processes
|
||||||
|
on each identified host up to the "slots=x" limit if oversubscription
|
||||||
|
is not allowed. If oversubscription is allowed (the default), then
|
||||||
|
Open MPI will spawn processes on each host up to the "max_slots=y" limit
|
||||||
|
if that value is provided. In all cases, the "-bynode" and "-byslot"
|
||||||
|
mapping directives will be enforced to ensure proper placement of
|
||||||
|
process ranks.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Location Nomenclature
|
||||||
|
.
|
||||||
|
As described above, \fImpirun\fP can specify arbitrary locations in
|
||||||
|
the current Open MPI universe. Locations can be specified either by
|
||||||
|
CPU or by node.
|
||||||
|
|
||||||
|
.B Note:
|
||||||
|
This nomenclature does not force Open MPI to bind processes to CPUs --
|
||||||
|
specifying a location "by CPU" is really a convenience mechanism for
|
||||||
|
SMPs that ultimately maps down to a specific node.
|
||||||
|
.PP
|
||||||
|
Specifying locations by node will launch one copy of an executable per
|
||||||
|
specified node.
|
||||||
|
Using the \fI--bynode\fP option tells Open MPI to use all available nodes.
|
||||||
|
Using the \fI--byslot\fP option tells Open MPI to use all slots on an available
|
||||||
|
node before allocating resources on the next available node.
|
||||||
|
For example:
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
mpirun --bynode -np 4 a.out
|
||||||
|
Runs one copy of the executable
|
||||||
|
.I a.out
|
||||||
|
on all available nodes in the Open MPI universe. MPI_COMM_WORLD rank 0
|
||||||
|
will be on node0, rank 1 will be on node1, etc., regardless of how many slots
|
||||||
|
are available on each of the nodes.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun --byslot -np 4 a.out
|
||||||
|
Runs one copy of the executable
|
||||||
|
.I a.out
|
||||||
|
on each slot on a given node before running the executable on other available
|
||||||
|
nodes.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Specifying Hosts
|
||||||
|
.
|
||||||
|
Hosts can be specified in a number of ways, the most common of which is in a
|
||||||
|
'hostfile' or 'machinefile'. If our hostfile contains the following information:
|
||||||
|
.
|
||||||
|
.
|
||||||
|
|
||||||
|
\fBshell$\fP cat my-hostfile
|
||||||
|
node00 slots=2
|
||||||
|
node01 slots=2
|
||||||
|
node02 slots=2
|
||||||
|
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun --hostfile my-hostfile -np 3 a.out
|
||||||
|
This will run one copy of the executable
|
||||||
|
.I a.out
|
||||||
|
on hosts node00, node01, and node02.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
Another method for specifying hosts is directly on the command line. Here one
|
||||||
|
can include and exclude hosts from the set of hosts to run on. For example:
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 3 --host a a.out
|
||||||
|
Runs three copies of the executable
|
||||||
|
.I a.out
|
||||||
|
on host a.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 3 --host a,b,c a.out
|
||||||
|
Runs one copy of the executable
|
||||||
|
.I a.out
|
||||||
|
on hosts a, b, and c.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 3 --hostfile my-hostfile --host node00 a.out
|
||||||
|
Runs three copies of the executable
|
||||||
|
.I a.out
|
||||||
|
on host node00.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 3 --hostfile my-hostfile --host node10 a.out
|
||||||
|
This will prompt an error since node10 is not in my-hostfile; mpirun will
|
||||||
|
abort.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
shell$ mpirun -np 1 --host a hostname : -np 2 --host b,c uptime
|
||||||
|
Runs one copy of the executable
|
||||||
|
.I hostname
|
||||||
|
on host a. And runs one copy of the executable
|
||||||
|
.I uptime
|
||||||
|
on hosts b and c.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS No Local Launch
|
||||||
|
.
|
||||||
|
Using the \fB--nolocal\fR option to orterun tells the system to not
|
||||||
|
launch any of the application processes on the same node that orterun
|
||||||
|
is running. While orterun typically blocks and consumes few system
|
||||||
|
resources, this option can be helpful for launching very large jobs
|
||||||
|
where orterun may actually need to use noticeable amounts of memory
|
||||||
|
and/or processing time. \fB--nolocal\fR allows orterun to run without
|
||||||
|
sharing the local node with the launched applications, and likewise
|
||||||
|
allows the launched applications to run unhindered by orterun's system
|
||||||
|
usage.
|
||||||
|
.PP
|
||||||
|
Note that \fB--nolocal\fR will override any other specification to
|
||||||
|
launch the application on the local node. It will disqualify the
|
||||||
|
localhost from being capable of running any processes in the
|
||||||
|
application.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
shell$ mpirun -np 1 --host localhost --nolocal hostname
|
||||||
|
This example will result in an error because orterun will not find
|
||||||
|
anywhere to launch the application.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS No Oversubscription
|
||||||
|
.
|
||||||
|
Using the \fI--nooversubscribe\fR option causes Open MPI to implicitly
|
||||||
|
set the "max_slots" value to be the same as the "slots" value for each
|
||||||
|
node. This can be especially helpful when running jobs under a
|
||||||
|
resource manager because Open MPI currently only sets the "slots"
|
||||||
|
value for each node that it obtains from the resource manager.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Application Context or Executable Program?
|
||||||
|
.
|
||||||
|
To distinguish the two different forms, \fImpirun\fP
|
||||||
|
looks on the command line for \fI--app\fP option. If
|
||||||
|
it is specified, then the file named on the command line is
|
||||||
|
assumed to be an application context. If it is not
|
||||||
|
specified, then the file is assumed to be an executable program.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Locating Files
|
||||||
|
.
|
||||||
|
If \fIno\fP relative or absolute path is specified for a file, Open MPI
|
||||||
|
will look for files by searching the directories in the user's PATH environment
|
||||||
|
variable as defined on the source node(s).
|
||||||
|
.PP
|
||||||
|
If a relative directory is specified, it must be relative to the initial
|
||||||
|
working directory determined by the specific starter used. For example when
|
||||||
|
using the rsh or ssh starters, the initial directory is $HOME by default. Other
|
||||||
|
starters may set the initial directory to the current working directory from
|
||||||
|
the invocation of \fImpirun\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Current Working Directory
|
||||||
|
.
|
||||||
|
The \fI\-wd\fP mpirun option allows the user to change to an arbitrary
|
||||||
|
directory before their program is invoked. It can also be used in application
|
||||||
|
context files to specify working directories on specific nodes and/or
|
||||||
|
for specific applications.
|
||||||
|
.PP
|
||||||
|
If the \fI\-wd\fP option appears both in a context file and on the command line,
|
||||||
|
the context file directory will override the command line value.
|
||||||
|
.PP
|
||||||
|
If the \fI-wd\fP option is specified, Open MPI will attempt to change to the
|
||||||
|
specified directory on all of the remote nodes. If this fails, \fImpirun\fP
|
||||||
|
will abort.
|
||||||
|
.PP
|
||||||
|
If the \fI-wd\fP option is \fBnot\fP specified, Open MPI will send the
|
||||||
|
directory name where \fImpirun\fP was invoked to each of the remote nodes. The
|
||||||
|
remote nodes will try to change to that directory. If they are unable (e.g., if
|
||||||
|
the directory does not exist on that node), then Open MPI will use the default
|
||||||
|
directory determined by the starter.
|
||||||
|
.PP
|
||||||
|
All directory changing occurs before the user's program is invoked; it
|
||||||
|
does not wait until \fIMPI_INIT\fP is called.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Standard I/O
|
||||||
|
.
|
||||||
|
Open MPI directs UNIX standard input to /dev/null on all processes
|
||||||
|
except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0 process
|
||||||
|
inherits standard input from \fImpirun\fP.
|
||||||
|
.B Note:
|
||||||
|
The node that invoked \fImpirun\fP need not be the same as the node where the
|
||||||
|
MPI_COMM_WORLD rank 0 process resides. Open MPI handles the redirection of
|
||||||
|
\fImpirun\fP's standard input to the rank 0 process.
|
||||||
|
.PP
|
||||||
|
Open MPI directs UNIX standard output and error from remote nodes to the node
|
||||||
|
that invoked \fImpirun\fP and prints it on the standard output/error of
|
||||||
|
\fImpirun\fP.
|
||||||
|
Local processes inherit the standard output/error of \fImpirun\fP and transfer
|
||||||
|
to it directly.
|
||||||
|
.PP
|
||||||
|
Thus it is possible to redirect standard I/O for Open MPI applications by
|
||||||
|
using the typical shell redirection procedure on \fImpirun\fP.
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun -np 2 my_app < my_input > my_output
|
||||||
|
|
||||||
|
Note that in this example \fIonly\fP the MPI_COMM_WORLD rank 0 process will
|
||||||
|
receive the stream from \fImy_input\fP on stdin. The stdin on all the other
|
||||||
|
nodes will be tied to /dev/null. However, the stdout from all nodes will
|
||||||
|
be collected into the \fImy_output\fP file.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Signal Propagation
|
||||||
|
.
|
||||||
|
When orterun receives a SIGTERM or SIGINT, it will attempt to kill
|
||||||
|
the entire job by sending all processes in the job a SIGTERM, waiting
|
||||||
|
a small number of seconds, then sending all processes in the job a
|
||||||
|
SIGKILL.
|
||||||
|
.
|
||||||
|
SIGUSR1 and SIGUSR2 signals received by orterun are propagated to
|
||||||
|
all processes in the job. Other signals are not currently propagated
|
||||||
|
by orterun.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Process Termination / Signal Handling
|
||||||
|
.
|
||||||
|
During the run of an MPI application, if any rank dies abnormally
|
||||||
|
(either exiting before invoking \fIMPI_FINALIZE\fP, or dying as the result of a
|
||||||
|
signal), \fImpirun\fP will print out an error message and kill the rest of the
|
||||||
|
MPI application.
|
||||||
|
.PP
|
||||||
|
User signal handlers should probably avoid trying to cleanup MPI state
|
||||||
|
(Open MPI is, currently, neither thread-safe nor async-signal-safe).
|
||||||
|
For example, if a segmentation fault occurs in \fIMPI_SEND\fP (perhaps because
|
||||||
|
a bad buffer was passed in) and a user signal handler is invoked, if this user
|
||||||
|
handler attempts to invoke \fIMPI_FINALIZE\fP, Bad Things could happen since
|
||||||
|
Open MPI was already "in" MPI when the error occurred. Since \fImpirun\fP
|
||||||
|
will notice that the process died due to a signal, it is probably not
|
||||||
|
necessary (and safest) for the user to only clean up non-MPI state.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Process Environment
|
||||||
|
.
|
||||||
|
Processes in the MPI application inherit their environment from the
|
||||||
|
Open RTE daemon upon the node on which they are running. The
|
||||||
|
environment is typically inherited from the user's shell. On remote
|
||||||
|
nodes, the exact environment is determined by the boot MCA module
|
||||||
|
used. The \fIrsh\fR launch module, for example, uses either
|
||||||
|
\fIrsh\fR/\fIssh\fR to launch the Open RTE daemon on remote nodes, and
|
||||||
|
typically executes one or more of the user's shell-setup files before
|
||||||
|
launching the Open RTE daemon. When running dynamically linked
|
||||||
|
applications which require the \fILD_LIBRARY_PATH\fR environment
|
||||||
|
variable to be set, care must be taken to ensure that it is correctly
|
||||||
|
set when booting Open MPI.
|
||||||
|
.PP
|
||||||
|
See the "Remote Execution" section for more details.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Remote Execution
|
||||||
|
.
|
||||||
|
Open MPI requires that the \fIPATH\fR environment variable be set to
|
||||||
|
find executables on remote nodes (this is typically only necessary in
|
||||||
|
\fIrsh\fR- or \fIssh\fR-based environments -- batch/scheduled
|
||||||
|
environments typically copy the current environment to the execution
|
||||||
|
of remote jobs, so if the current environment has \fIPATH\fR and/or
|
||||||
|
\fILD_LIBRARY_PATH\fR set properly, the remote nodes will also have it
|
||||||
|
set properly). If Open MPI was compiled with shared library support,
|
||||||
|
it may also be necessary to have the \fILD_LIBRARY_PATH\fR environment
|
||||||
|
variable set on remote nodes as well (especially to find the shared
|
||||||
|
libraries required to run user MPI applications).
|
||||||
|
.PP
|
||||||
|
However, it is not always desirable or possible to edit shell
|
||||||
|
startup files to set \fIPATH\fR and/or \fILD_LIBRARY_PATH\fR. The
|
||||||
|
\fI--prefix\fR option is provided for some simple configurations where
|
||||||
|
this is not possible.
|
||||||
|
.PP
|
||||||
|
The \fI--prefix\fR option takes a single argument: the base directory
|
||||||
|
on the remote node where Open MPI is installed. Open MPI will use
|
||||||
|
this directory to set the remote \fIPATH\fR and \fILD_LIBRARY_PATH\fR
|
||||||
|
before executing any Open MPI or user applications. This allows
|
||||||
|
running Open MPI jobs without having pre-configured the \fIPATH\fR and
|
||||||
|
\fILD_LIBRARY_PATH\fR on the remote nodes.
|
||||||
|
.PP
|
||||||
|
Open MPI adds the basename of the current
|
||||||
|
node's "bindir" (the directory where Open MPI's executables are
|
||||||
|
installed) to the prefix and uses that to set the \fIPATH\fR on the
|
||||||
|
remote node. Similarly, Open MPI adds the basename of the current
|
||||||
|
node's "libdir" (the directory where Open MPI's libraries are
|
||||||
|
installed) to the prefix and uses that to set the
|
||||||
|
\fILD_LIBRARY_PATH\fR on the remote node. For example:
|
||||||
|
.TP 15
|
||||||
|
Local bindir:
|
||||||
|
/local/node/directory/bin
|
||||||
|
.TP
|
||||||
|
Local libdir:
|
||||||
|
/local/node/directory/lib64
|
||||||
|
.PP
|
||||||
|
If the following command line is used:
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun --prefix /remote/node/directory
|
||||||
|
|
||||||
|
Open MPI will add "/remote/node/directory/bin" to the \fIPATH\fR
|
||||||
|
and "/remote/node/directory/lib64" to the \fILD_LIBRARY_PATH\fR on the
|
||||||
|
remote node before attempting to execute anything.
|
||||||
|
.PP
|
||||||
|
Note that \fI--prefix\fR can be set on a per-context basis, allowing
|
||||||
|
for different values for different nodes.
|
||||||
|
.PP
|
||||||
|
The \fI--prefix\fR option is not sufficient if the installation paths
|
||||||
|
on the remote node are different than the local node (e.g., if "/lib"
|
||||||
|
is used on the local node, but "/lib64" is used on the remote node),
|
||||||
|
or if the installation paths are something other than a subdirectory
|
||||||
|
under a common prefix.
|
||||||
|
.PP
|
||||||
|
Note that executing \fImpirun\fR via an absolute pathname is
|
||||||
|
equivalent to specifying \fI--prefix\fR without the last subdirectory
|
||||||
|
in the absolute pathname to \fImpirun\fR. For example:
|
||||||
|
|
||||||
|
\fBshell$\fP /usr/local/bin/mpirun ...
|
||||||
|
|
||||||
|
is equivalent to
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun --prefix /usr/local
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Exported Environment Variables
|
||||||
|
.
|
||||||
|
All environment variables that are named in the form OMPI_* will automatically
|
||||||
|
be exported to new processes on the local and remote nodes.
|
||||||
|
The \fI\-x\fP option to \fImpirun\fP can be used to export specific environment
|
||||||
|
variables to the new processes. While the syntax of the \fI\-x\fP
|
||||||
|
option allows the definition of new variables, note that the parser
|
||||||
|
for this option is currently not very sophisticated - it does not even
|
||||||
|
understand quoted values. Users are advised to set variables in the
|
||||||
|
environment and use \fI\-x\fP to export them; not to define them.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS MCA (Modular Component Architecture)
|
||||||
|
.
|
||||||
|
The \fI-mca\fP switch allows the passing of parameters to various MCA modules.
|
||||||
|
.\" Open MPI's MCA modules are described in detail in ompimca(7).
|
||||||
|
MCA modules have direct impact on MPI programs because they allow tunable
|
||||||
|
parameters to be set at run time (such as which BTL communication device driver
|
||||||
|
to use, what parameters to pass to that BTL, etc.).
|
||||||
|
.PP
|
||||||
|
The \fI-mca\fP switch takes two arguments: \fI<key>\fP and \fI<value>\fP.
|
||||||
|
The \fI<key>\fP argument generally specifies which MCA module will receive the value.
|
||||||
|
For example, the \fI<key>\fP "btl" is used to select which BTL to be used for
|
||||||
|
transporting MPI messages. The \fI<value>\fP argument is the value that is
|
||||||
|
passed.
|
||||||
|
For example:
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
mpirun -mca btl tcp,self -np 1 foo
|
||||||
|
Tells Open MPI to use the "tcp" and "self" BTLs, and to run a single copy of
|
||||||
|
"foo" on an allocated node.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -mca btl self -np 1 foo
|
||||||
|
Tells Open MPI to use the "self" BTL, and to run a single copy of "foo" on an
|
||||||
|
allocated node.
|
||||||
|
.\" And so on. Open MPI's BTL MCA modules are described in ompimca_btl(7).
|
||||||
|
.PP
|
||||||
|
The \fI-mca\fP switch can be used multiple times to specify different
|
||||||
|
\fI<key>\fP and/or \fI<value>\fP arguments. If the same \fI<key>\fP is
|
||||||
|
specified more than once, the \fI<value>\fPs are concatenated with a comma
|
||||||
|
(",") separating them.
|
||||||
|
.PP
|
||||||
|
.B Note:
|
||||||
|
The \fI-mca\fP switch is simply a shortcut for setting environment variables.
|
||||||
|
The same effect may be accomplished by setting corresponding environment
|
||||||
|
variables before running \fImpirun\fP.
|
||||||
|
The form of the environment variables that Open MPI sets is:
|
||||||
|
|
||||||
|
OMPI_<key>=<value>
|
||||||
|
.PP
|
||||||
|
Note that the \fI-mca\fP switch overrides any previously set environment
|
||||||
|
variables. Also note that unknown \fI<key>\fP arguments are still set as
|
||||||
|
environment variables -- they are not checked (by \fImpirun\fP) for correctness.
|
||||||
|
Illegal or incorrect \fI<value>\fP arguments may or may not be reported -- it
|
||||||
|
depends on the specific MCA module.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Examples Section
|
||||||
|
.\" **************************
|
||||||
|
.SH EXAMPLES
|
||||||
|
Be sure to also see the examples in the "Location Nomenclature" section, above.
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
mpirun -np 1 prog1
|
||||||
|
Load and execute prog1 on one node. Search the user's $PATH for the
|
||||||
|
executable file on each node.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 8 --byslot prog1
|
||||||
|
Run 8 copies of prog1 wherever Open MPI wants to run them.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 4 -mca btl ib,tcp,self prog1
|
||||||
|
Run 4 copies of prog1 using the "ib", "tcp", and "self" BTL's for the transport
|
||||||
|
of MPI messages.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Diagnostics Section
|
||||||
|
.\" **************************
|
||||||
|
.
|
||||||
|
.\" .SH DIAGNOSTICS
|
||||||
|
.\".TP 4
|
||||||
|
.\"Error Msg:
|
||||||
|
.\"Description
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Return Value Section
|
||||||
|
.\" **************************
|
||||||
|
.
|
||||||
|
.SH RETURN VALUE
|
||||||
|
.
|
||||||
|
\fImpirun\fP returns 0 if all ranks started by \fImpirun\fP exit after calling
|
||||||
|
MPI_FINALIZE. A non-zero value is returned if an internal error occurred in
|
||||||
|
mpirun, or one or more ranks exited before calling MPI_FINALIZE. If an
|
||||||
|
internal error occurred in mpirun, the corresponding error code is returned.
|
||||||
|
In the event that one or more ranks exit before calling MPI_FINALIZE, the
|
||||||
|
return value of the rank of the process that \fImpirun\fP first notices died
|
||||||
|
before calling MPI_FINALIZE will be returned. Note that, in general, this will
|
||||||
|
be the first rank that died but is not guaranteed to be so.
|
||||||
|
.PP
|
||||||
|
However, note that if the \fI-nw\fP switch is used, the return value from
|
||||||
|
mpirun does not indicate the exit status of the ranks.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" See Also Section
|
||||||
|
.\" **************************
|
||||||
|
.
|
||||||
|
.\" .SH SEE ALSO
|
||||||
|
.\" orted(1)
|
348
orte/tools/orteboot/orteboot.c
Обычный файл
348
orte/tools/orteboot/orteboot.c
Обычный файл
@ -0,0 +1,348 @@
|
|||||||
|
/* -*- C -*-
|
||||||
|
*
|
||||||
|
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#ifdef HAVE_UNISTD_H
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_SYS_PARAM_H
|
||||||
|
#include <sys/param.h>
|
||||||
|
#endif
|
||||||
|
#include <errno.h>
|
||||||
|
#include <signal.h>
|
||||||
|
#include <ctype.h>
|
||||||
|
#ifdef HAVE_SYS_TYPES_H
|
||||||
|
#include <sys/types.h>
|
||||||
|
#endif /* HAVE_SYS_TYPES_H */
|
||||||
|
#ifdef HAVE_SYS_WAIT_H
|
||||||
|
#include <sys/wait.h>
|
||||||
|
#endif /* HAVE_SYS_WAIT_H */
|
||||||
|
#ifdef HAVE_LIBGEN_H
|
||||||
|
#include <libgen.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "opal/event/event.h"
|
||||||
|
#include "opal/install_dirs.h"
|
||||||
|
#include "opal/mca/base/base.h"
|
||||||
|
#include "opal/threads/condition.h"
|
||||||
|
#include "opal/util/argv.h"
|
||||||
|
#include "opal/util/basename.h"
|
||||||
|
#include "opal/util/cmd_line.h"
|
||||||
|
#include "opal/util/opal_environ.h"
|
||||||
|
#include "opal/util/output.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
|
#include "opal/util/trace.h"
|
||||||
|
#include "opal/version.h"
|
||||||
|
|
||||||
|
#include "orte/orte_constants.h"
|
||||||
|
|
||||||
|
#include "orte/class/orte_pointer_array.h"
|
||||||
|
#include "orte/util/proc_info.h"
|
||||||
|
#include "orte/util/sys_info.h"
|
||||||
|
#include "orte/util/universe_setup_file_io.h"
|
||||||
|
#include "orte/util/pre_condition_transports.h"
|
||||||
|
|
||||||
|
#include "orte/mca/ns/ns.h"
|
||||||
|
#include "orte/mca/gpr/gpr.h"
|
||||||
|
#include "orte/mca/pls/pls.h"
|
||||||
|
#include "orte/mca/rmaps/rmaps_types.h"
|
||||||
|
#include "orte/mca/rmgr/rmgr.h"
|
||||||
|
#include "orte/mca/schema/schema.h"
|
||||||
|
#include "orte/mca/smr/smr.h"
|
||||||
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
|
|
||||||
|
#include "orte/runtime/runtime.h"
|
||||||
|
#include "orte/runtime/orte_wait.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Globals
|
||||||
|
*/
|
||||||
|
static orte_jobid_t jobid = ORTE_JOBID_INVALID;
|
||||||
|
static char *orteboot_basename = NULL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* setup globals for catching orteboot command line options
|
||||||
|
*/
|
||||||
|
struct globals_t {
|
||||||
|
bool help;
|
||||||
|
bool version;
|
||||||
|
bool verbose;
|
||||||
|
bool quiet;
|
||||||
|
bool exit;
|
||||||
|
char *hostfile;
|
||||||
|
char *wdir;
|
||||||
|
opal_mutex_t lock;
|
||||||
|
opal_condition_t cond;
|
||||||
|
} orteboot_globals;
|
||||||
|
|
||||||
|
|
||||||
|
opal_cmd_line_init_t cmd_line_init[] = {
|
||||||
|
/* Various "obvious" options */
|
||||||
|
{ NULL, NULL, NULL, 'h', NULL, "help", 0,
|
||||||
|
&orteboot_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"This help message" },
|
||||||
|
{ NULL, NULL, NULL, 'V', NULL, "version", 0,
|
||||||
|
&orteboot_globals.version, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Print version and exit" },
|
||||||
|
{ NULL, NULL, NULL, 'v', NULL, "verbose", 0,
|
||||||
|
&orteboot_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Be verbose" },
|
||||||
|
{ NULL, NULL, NULL, 'q', NULL, "quiet", 0,
|
||||||
|
&orteboot_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Suppress helpful messages" },
|
||||||
|
|
||||||
|
/* Set a hostfile */
|
||||||
|
{ "rds", "hostfile", "path", '\0', "hostfile", "hostfile", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Provide a hostfile" },
|
||||||
|
{ "rds", "hostfile", "path", '\0', "machinefile", "machinefile", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Provide a hostfile" },
|
||||||
|
|
||||||
|
/* mpiexec-like arguments */
|
||||||
|
{ NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
|
||||||
|
&orteboot_globals.wdir, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Set the working directory of the started processes" },
|
||||||
|
|
||||||
|
/* These arguments can be specified multiple times */
|
||||||
|
{ NULL, NULL, NULL, 'H', "host", "host", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"List of hosts to invoke processes on" },
|
||||||
|
|
||||||
|
/* OpenRTE arguments */
|
||||||
|
{ "orte", "debug", NULL, 'd', NULL, "debug-devel", 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Enable debugging of OpenRTE" },
|
||||||
|
|
||||||
|
{ "orte", "debug", "daemons", '\0', NULL, "debug-daemons", 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Enable debugging of any OpenRTE daemons used by this application" },
|
||||||
|
|
||||||
|
{ "orte", "debug", "daemons_file", '\0', NULL, "debug-daemons-file", 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Enable debugging of any OpenRTE daemons used by this application, storing output in files" },
|
||||||
|
|
||||||
|
{ "orte", "no_daemonize", NULL, '\0', NULL, "no-daemonize", 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Do not detach OpenRTE daemons used by this application" },
|
||||||
|
|
||||||
|
{ "universe", NULL, NULL, '\0', NULL, "universe", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Set the universe name as username@hostname:universe_name for this application" },
|
||||||
|
|
||||||
|
{ NULL, NULL, NULL, '\0', NULL, "tmpdir", 1,
|
||||||
|
&orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Set the root for the session directory tree for orteboot ONLY" },
|
||||||
|
|
||||||
|
{ NULL, NULL, NULL, '\0', NULL, "prefix", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Prefix where Open MPI is installed on remote nodes" },
|
||||||
|
{ NULL, NULL, NULL, '\0', NULL, "noprefix", 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Disable automatic --prefix behavior" },
|
||||||
|
|
||||||
|
/* End of list */
|
||||||
|
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||||
|
};
|
||||||
|
|
||||||
|
/* The process environment is declared by hand on every platform except
 * Windows builds. */
#ifndef __WINDOWS__
extern char **environ;
#endif /* !__WINDOWS__ */
|
||||||
|
/*
|
||||||
|
* Local functions
|
||||||
|
*/
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
orte_app_context_t *app;
|
||||||
|
int rc, ret;
|
||||||
|
int id, iparam;
|
||||||
|
opal_list_t attributes;
|
||||||
|
opal_cmd_line_t cmd_line;
|
||||||
|
|
||||||
|
OBJ_CONSTRUCT(&orteboot_globals.lock, opal_mutex_t);
|
||||||
|
OBJ_CONSTRUCT(&orteboot_globals.cond, opal_condition_t);
|
||||||
|
orteboot_globals.hostfile = NULL;
|
||||||
|
orteboot_globals.wdir = NULL;
|
||||||
|
orteboot_globals.help = false;
|
||||||
|
orteboot_globals.version = false;
|
||||||
|
orteboot_globals.verbose = false;
|
||||||
|
orteboot_globals.exit = false;
|
||||||
|
|
||||||
|
/* Setup MCA params */
|
||||||
|
mca_base_param_init();
|
||||||
|
|
||||||
|
/* find our basename (the name of the executable) so that we can
|
||||||
|
* use it in pretty-print error messages
|
||||||
|
*/
|
||||||
|
orteboot_basename = opal_basename(argv[0]);
|
||||||
|
|
||||||
|
/* Setup and parse the command line */
|
||||||
|
opal_cmd_line_create(&cmd_line, cmd_line_init);
|
||||||
|
mca_base_cmd_line_setup(&cmd_line);
|
||||||
|
if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(&cmd_line, true,
|
||||||
|
argc, argv))) {
|
||||||
|
char *args = NULL;
|
||||||
|
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
||||||
|
opal_show_help("help-orteboot.txt", "orteboot:usage", false,
|
||||||
|
argv[0], args);
|
||||||
|
free(args);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* print version if requested. Do this before check for help so
|
||||||
|
that --version --help works as one might expect. */
|
||||||
|
if (orteboot_globals.version &&
|
||||||
|
!(1 == argc || orteboot_globals.help)) {
|
||||||
|
char *project_name = NULL;
|
||||||
|
if (0 == strcmp(orteboot_basename, "ompiboot")) {
|
||||||
|
project_name = "Open MPI";
|
||||||
|
} else {
|
||||||
|
project_name = "OpenRTE";
|
||||||
|
}
|
||||||
|
opal_show_help("help-orteboot.txt", "orteboot:version", false,
|
||||||
|
orteboot_basename, project_name, OPAL_VERSION,
|
||||||
|
PACKAGE_BUGREPORT);
|
||||||
|
/* if we were the only argument, exit */
|
||||||
|
if (2 == argc) exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check for help request */
|
||||||
|
if (1 == argc || orteboot_globals.help) {
|
||||||
|
char *args = NULL;
|
||||||
|
char *project_name = NULL;
|
||||||
|
if (0 == strcmp(orteboot_basename, "ompiboot")) {
|
||||||
|
project_name = "Open MPI";
|
||||||
|
} else {
|
||||||
|
project_name = "OpenRTE";
|
||||||
|
}
|
||||||
|
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
||||||
|
opal_show_help("help-orteboot.txt", "orteboot:usage", false,
|
||||||
|
orteboot_basename, project_name, OPAL_VERSION,
|
||||||
|
orteboot_basename, args,
|
||||||
|
PACKAGE_BUGREPORT);
|
||||||
|
free(args);
|
||||||
|
|
||||||
|
/* If someone asks for help, that should be all we do */
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* check for daemon flags and push them into the environment
|
||||||
|
* since this isn't being automatically done
|
||||||
|
*/
|
||||||
|
id = mca_base_param_reg_int_name("orte_debug", "daemons",
|
||||||
|
"Whether to debug the ORTE daemons or not",
|
||||||
|
false, false, (int)false, &iparam);
|
||||||
|
if (iparam) {
|
||||||
|
char *tmp = mca_base_param_environ_variable("orte", "debug", "daemons");
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||||
|
opal_show_help("help-orteboot.txt", "orteboot:environ", false,
|
||||||
|
orteboot_basename, tmp, "1", rc);
|
||||||
|
free(tmp);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
free(tmp);
|
||||||
|
}
|
||||||
|
id = mca_base_param_reg_int_name("orte", "debug",
|
||||||
|
"Top-level ORTE debug switch",
|
||||||
|
false, false, 0, &iparam);
|
||||||
|
if (iparam) {
|
||||||
|
char *tmp = mca_base_param_environ_variable("orte", NULL, "debug");
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||||
|
opal_show_help("help-orteboot.txt", "orteboot:environ", false,
|
||||||
|
orteboot_basename, tmp, "1", rc);
|
||||||
|
free(tmp);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
free(tmp);
|
||||||
|
}
|
||||||
|
id = mca_base_param_reg_int_name("orte_debug", "daemons_file",
|
||||||
|
"Whether want stdout/stderr of daemons to go to a file or not",
|
||||||
|
false, false, 0, &iparam);
|
||||||
|
if (iparam) {
|
||||||
|
char *tmp = mca_base_param_environ_variable("orte", "debug",
|
||||||
|
"daemons_file");
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||||
|
opal_show_help("help-orteboot.txt", "orteboot:environ", false,
|
||||||
|
orteboot_basename, tmp, "1", rc);
|
||||||
|
free(tmp);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
free(tmp);
|
||||||
|
}
|
||||||
|
id = mca_base_param_reg_int_name("orte", "no_daemonize",
|
||||||
|
"Whether to properly daemonize the ORTE daemons or not",
|
||||||
|
false, false, 0, &iparam);
|
||||||
|
if (iparam) {
|
||||||
|
char *tmp = mca_base_param_environ_variable("orte", "no_daemonize", NULL);
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||||
|
opal_show_help("help-orteboot.txt", "orteboot:environ", false,
|
||||||
|
orteboot_basename, tmp, "1", rc);
|
||||||
|
free(tmp);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
free(tmp);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Intialize our Open RTE environment */
|
||||||
|
/* Set the flag telling orte_init that I am NOT a
|
||||||
|
* singleton, but am "infrastructure" - prevents setting
|
||||||
|
* up incorrect infrastructure that only a singleton would
|
||||||
|
* require
|
||||||
|
*/
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_init(true))) {
|
||||||
|
opal_show_help("help-orteboot.txt", "orteboot:init-failure", true,
|
||||||
|
"orte_init()", rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Prep to start the virtual machine */
|
||||||
|
/* construct the list of attributes */
|
||||||
|
OBJ_CONSTRUCT(&attributes, opal_list_t);
|
||||||
|
|
||||||
|
orte_rmgr.add_attribute(&attributes, ORTE_RMAPS_PERNODE, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_NO_OVERRIDE);
|
||||||
|
|
||||||
|
/* Create the app - in this case, that's just a no_op to get the daemons launched */
|
||||||
|
app = OBJ_NEW(orte_app_context_t);
|
||||||
|
if (NULL == app) {
|
||||||
|
opal_show_help("help-orteboot.txt", "orteboot:call-failed",
|
||||||
|
true, orteboot_basename, "system", "malloc returned NULL", errno);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Spawn the job */
|
||||||
|
|
||||||
|
rc = orte_rmgr.spawn_job(&app, 1, &jobid, 0, NULL, NULL, 0, &attributes);
|
||||||
|
if (ORTE_SUCCESS != rc) {
|
||||||
|
/* JMS show_help */
|
||||||
|
opal_output(0, "%s: spawn failed with errno=%d\n", orteboot_basename, rc);
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&attributes);
|
||||||
|
|
||||||
|
|
||||||
|
orte_finalize();
|
||||||
|
free(orteboot_basename);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
0
orte/tools/ortehalt/.ompi_ignore
Обычный файл
0
orte/tools/ortehalt/.ompi_ignore
Обычный файл
1
orte/tools/ortehalt/.ompi_unignore
Обычный файл
1
orte/tools/ortehalt/.ompi_unignore
Обычный файл
@ -0,0 +1 @@
|
|||||||
|
rhc
|
39
orte/tools/ortehalt/Makefile.am
Обычный файл
39
orte/tools/ortehalt/Makefile.am
Обычный файл
@ -0,0 +1,39 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
libs = \
|
||||||
|
$(top_builddir)/orte/liborte.la
|
||||||
|
|
||||||
|
ortehalt_SOURCES = \
|
||||||
|
ortehalt.c
|
||||||
|
|
||||||
|
ortehalt_LDADD = $(libs)
|
||||||
|
ortehalt_DEPENDENCIES = $(libs)
|
||||||
|
|
||||||
|
if OMPI_INSTALL_BINARIES
|
||||||
|
|
||||||
|
bin_PROGRAMS = ortehalt
|
||||||
|
|
||||||
|
dist_pkgdata_DATA = help-ortehalt.txt
|
||||||
|
|
||||||
|
# AM 1.9.6 seems to have a bug in its dependencies for install-man if
|
||||||
|
#dist_ and nodist_ are used, so explicitly add to EXTRA_DIST...
|
||||||
|
man_MANS = ortehalt.1
|
||||||
|
EXTRA_DIST = ortehalt.1
|
||||||
|
|
||||||
|
endif
|
130
orte/tools/ortehalt/help-ortehalt.txt
Обычный файл
130
orte/tools/ortehalt/help-ortehalt.txt
Обычный файл
@ -0,0 +1,130 @@
|
|||||||
|
# -*- text -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
# This is the US/English general help file for Open RTE's orterun.
|
||||||
|
#
|
||||||
|
[orterun:init-failure]
|
||||||
|
Open RTE was unable to initialize properly. The error occurred while
|
||||||
|
attempting to %s. Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
[orterun:usage]
|
||||||
|
%s (%s) %s
|
||||||
|
|
||||||
|
Usage: %s [OPTION]... [PROGRAM]...
|
||||||
|
Start the given program using Open RTE
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
Report bugs to %s
|
||||||
|
[orterun:version]
|
||||||
|
%s (%s) %s
|
||||||
|
|
||||||
|
Report bugs to %s
|
||||||
|
[orterun:allocate-resources]
|
||||||
|
%s was unable to allocate enough resources to start your application.
|
||||||
|
This might be a transient error (too many nodes in the cluster were
|
||||||
|
unavailable at the time of the request) or a permanent error (you
|
||||||
|
requested more nodes than exist in your cluster).
|
||||||
|
|
||||||
|
While probably only useful to Open RTE developers, the error returned
|
||||||
|
was %d.
|
||||||
|
[orterun:error-spawning]
|
||||||
|
%s was unable to start the specified application. An attempt has been
|
||||||
|
made to clean up all processes that did start. The error returned was
|
||||||
|
%d.
|
||||||
|
[orterun:appfile-not-found]
|
||||||
|
Unable to open the appfile:
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
Double check that this file exists and is readable.
|
||||||
|
[orterun:executable-not-specified]
|
||||||
|
No executable was specified on the %s command line.
|
||||||
|
|
||||||
|
Aborting.
|
||||||
|
[orterun:multi-apps-and-zero-np]
|
||||||
|
%s found multiple applications specified on the command line, with
|
||||||
|
at least one that failed to specify the number of processes to execute.
|
||||||
|
When specifying multiple applications, you must specify how many processes
|
||||||
|
of each to launch via the -np argument.
|
||||||
|
[orterun:nothing-to-do]
|
||||||
|
%s could not find anything to do.
|
||||||
|
|
||||||
|
It is possible that you forgot to specify how many processes to run
|
||||||
|
via the "-np" argument.
|
||||||
|
[orterun:call-failed]
|
||||||
|
%s encountered a %s call failure. This should not happen, and
|
||||||
|
usually indicates an error within the operating system itself.
|
||||||
|
Specifically, the following error occurred:
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
The only other available information that may be helpful is the errno
|
||||||
|
that was returned: %d.
|
||||||
|
[orterun:environ]
|
||||||
|
%s was unable to set
|
||||||
|
%s = %s
|
||||||
|
in the environment. Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
[orterun:precondition]
|
||||||
|
%s was unable to precondition transports
|
||||||
|
Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
[orterun:attr-failed]
|
||||||
|
%s was unable to define an attribute
|
||||||
|
Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
[orterun:proc-aborted]
|
||||||
|
%s noticed that job rank %lu with PID %lu on node %s exited on signal %d.
|
||||||
|
[orterun:abnormal-exit]
|
||||||
|
WARNING: %s encountered an abnormal exit.
|
||||||
|
|
||||||
|
This means that %s exited before it received notification that all
|
||||||
|
started processes had terminated. You should double check and ensure
|
||||||
|
that there are no runaway processes still executing.
|
||||||
|
[orterun:empty-prefix]
|
||||||
|
A prefix was supplied to %s that only contained slashes.
|
||||||
|
|
||||||
|
This is a fatal error; %s will now abort. No processes were launched.
|
||||||
|
#
|
||||||
|
[debugger-mca-param-not-found]
|
||||||
|
Internal error -- the orte_base_debugger MCA parameter was not able to
|
||||||
|
be found. Please contact the Open RTE developers; this should not
|
||||||
|
happen.
|
||||||
|
#
|
||||||
|
[debugger-orte_base_user_debugger-empty]
|
||||||
|
The MCA parameter "orte_base_user_debugger" was empty, indicating that
|
||||||
|
no user-level debuggers have been defined. Please set this MCA
|
||||||
|
parameter to a value and try again.
|
||||||
|
#
|
||||||
|
[debugger-not-found]
|
||||||
|
A suitable debugger could not be found in your PATH. Check the values
|
||||||
|
specified in the orte_base_user_debugger MCA parameter for the list of
|
||||||
|
debuggers that was searched.
|
||||||
|
#
|
||||||
|
[debugger-exec-failed]
|
||||||
|
%s was unable to launch the specified debugger. This is what was
|
||||||
|
launched:
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
Things to check:
|
||||||
|
|
||||||
|
- Ensure that the debugger is installed properly
|
||||||
|
- Ensure that the "%s" executable is in your path
|
||||||
|
- Ensure that any required licenses are available to run the debugger
|
||||||
|
#
|
||||||
|
[orterun:daemon-die]
|
||||||
|
%s was unable to cleanly terminate the daemons for this job. Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
|
851
orte/tools/ortehalt/ortehalt.1
Обычный файл
851
orte/tools/ortehalt/ortehalt.1
Обычный файл
@ -0,0 +1,851 @@
|
|||||||
|
.\"
|
||||||
|
.\" Man page for ORTE's orterun command
|
||||||
|
.\"
|
||||||
|
.\" .TH name section center-footer left-footer center-header
|
||||||
|
.TH MPIRUN 1 "March 2006" "Open MPI" "OPEN MPI COMMANDS"
|
||||||
|
.\" **************************
|
||||||
|
.\" Name Section
|
||||||
|
.\" **************************
|
||||||
|
.SH NAME
|
||||||
|
.
|
||||||
|
orterun, mpirun, mpiexec \- Execute serial and parallel jobs in Open MPI.
|
||||||
|
|
||||||
|
.B Note:
|
||||||
|
\fImpirun\fP, \fImpiexec\fP, and \fIorterun\fP are all exact synonyms for each
|
||||||
|
other. Using any of the names will result in exactly identical behavior.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Synopsis Section
|
||||||
|
.\" **************************
|
||||||
|
.SH SYNOPSIS
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
Single Process Multiple Data (SPMD) Model:
|
||||||
|
|
||||||
|
.B mpirun
|
||||||
|
.R [ options ]
|
||||||
|
.B <program>
|
||||||
|
.R [ <args> ]
|
||||||
|
.
|
||||||
|
|
||||||
|
Multiple Instruction Multiple Data (MIMD) Model:
|
||||||
|
|
||||||
|
.B mpirun
|
||||||
|
.R [ global_options ]
|
||||||
|
[ local_options1 ]
|
||||||
|
.B <program1>
|
||||||
|
.R [ <args1> ] :
|
||||||
|
[ local_options2 ]
|
||||||
|
.B <program2>
|
||||||
|
.R [ <args2> ] :
|
||||||
|
... :
|
||||||
|
[ local_optionsN ]
|
||||||
|
.B <programN>
|
||||||
|
.R [ <argsN> ]
|
||||||
|
.P
|
||||||
|
|
||||||
|
Note that in both models, invoking \fImpirun\fR via an absolute path
|
||||||
|
name is equivalent to specifying the \fI--prefix\fR option with a
|
||||||
|
\fI<dir>\fR value equivalent to the directory where \fImpirun\fR
|
||||||
|
resides, minus its last subdirectory. For example:
|
||||||
|
|
||||||
|
\fBshell$\fP /usr/local/bin/mpirun ...
|
||||||
|
|
||||||
|
is equivalent to
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun --prefix /usr/local
|
||||||
|
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Quick Summary Section
|
||||||
|
.\" **************************
|
||||||
|
.SH QUICK SUMMARY
|
||||||
|
.
|
||||||
|
If you are simply looking for how to run an MPI application, you
|
||||||
|
probably want to use a command line of the following form:
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun [ -np X ] [ --hostfile <filename> ] <program>
|
||||||
|
|
||||||
|
This will run X copies of \fI<program>\fR in your current run-time
|
||||||
|
environment (if running under a supported resource manager, Open MPI's
|
||||||
|
\fImpirun\fR will usually automatically use the corresponding resource manager
|
||||||
|
process starter, as opposed to, for example, \fIrsh\fR or \fIssh\fR,
|
||||||
|
which require the use of a hostfile, or will default to running all X
|
||||||
|
copies on the localhost), scheduling (by default) in a round-robin fashion by
|
||||||
|
CPU slot. See the rest of this page for more details.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Options Section
|
||||||
|
.\" **************************
|
||||||
|
.SH OPTIONS
|
||||||
|
.
|
||||||
|
.I mpirun
|
||||||
|
will send the name of the directory where it was invoked on the local
|
||||||
|
node to each of the remote nodes, and attempt to change to that
|
||||||
|
directory. See the "Current Working Directory" section below for further
|
||||||
|
details.
|
||||||
|
.\"
|
||||||
|
.\" Start options listing
|
||||||
|
.\" Indent 10 characters from start of first column to start of second column
|
||||||
|
.TP 10
|
||||||
|
.B <args>
|
||||||
|
Pass these run-time arguments to every new process. These must always
|
||||||
|
be the last arguments to \fImpirun\fP. If an app context file is used,
|
||||||
|
\fI<args>\fP will be ignored.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B <program>
|
||||||
|
The program executable. This is identified as the first non-recognized argument
|
||||||
|
to mpirun.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -aborted\fR,\fP --aborted \fR<#>\fP
|
||||||
|
Set the maximum number of aborted processes to display.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --app \fR<appfile>\fP
|
||||||
|
Provide an appfile, ignoring all other command line options.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -bynode\fR,\fP --bynode
|
||||||
|
Allocate (map) the processes by node in a round-robin scheme.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -byslot\fR,\fP --byslot
|
||||||
|
Allocate (map) the processes by slot in a round-robin scheme. This is the
|
||||||
|
default.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -c \fR<#>\fP
|
||||||
|
Synonym for \fI-np\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -debug\fR,\fP --debug
|
||||||
|
Invoke the user-level debugger indicated by the \fIorte_base_user_debugger\fP
|
||||||
|
MCA parameter.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -debugger\fR,\fP --debugger
|
||||||
|
Sequence of debuggers to search for when \fI--debug\fP is used (i.e.
|
||||||
|
a synonym for \fIorte_base_user_debugger\fP MCA parameter).
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -gmca\fR,\fP --gmca \fR<key> <value>\fP
|
||||||
|
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
|
||||||
|
the parameter name; \fI<value>\fP is the parameter value.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -h\fR,\fP --help
|
||||||
|
Display help for this command
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -H \fR<host1,host2,...,hostN>\fP
|
||||||
|
Synonym for \fI-host\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -host\fR,\fP --host \fR<host1,host2,...,hostN>\fP
|
||||||
|
List of hosts on which to invoke processes.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -hostfile\fR,\fP --hostfile \fR<hostfile>\fP
|
||||||
|
Provide a hostfile to use.
|
||||||
|
.\" JJH - Should have man page for how to format a hostfile properly.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -machinefile\fR,\fP --machinefile \fR<machinefile>\fP
|
||||||
|
Synonym for \fI-hostfile\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -mca\fR,\fP --mca <key> <value>
|
||||||
|
Send arguments to various MCA modules. See the "MCA" section, below.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -n\fR,\fP --n \fR<#>\fP
|
||||||
|
Synonym for \fI-np\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -nolocal\fR,\fP --nolocal
|
||||||
|
Do not run any copies of the launched application on the same node as
|
||||||
|
orterun is running. This option will override listing the localhost
|
||||||
|
with \fB--host\fR or any other host-specifying mechanism.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -nooversubscribe\fR,\fP --nooversubscribe
|
||||||
|
Do not oversubscribe any nodes; error (without starting any processes)
|
||||||
|
if the requested number of processes would cause oversubscription.
|
||||||
|
This option implicitly sets "max_slots" equal to the "slots" value for
|
||||||
|
each node.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -np \fR<#>\fP
|
||||||
|
Run this many copies of the program on the given nodes. This option
|
||||||
|
indicates that the specified file is an executable program and not an
|
||||||
|
application context. If no value is provided for the number of copies to
|
||||||
|
execute (i.e., neither the "-np" nor its synonyms are provided on the command
|
||||||
|
line), Open MPI will automatically execute a copy of the program on
|
||||||
|
each process slot (see below for description of a "process slot"). This
|
||||||
|
feature, however, can only be used in the SPMD model and will return an
|
||||||
|
error (without beginning execution of the application) otherwise.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -nw\fR,\fP --nw
|
||||||
|
Launch the processes and do not wait for their completion. mpirun will
|
||||||
|
complete as soon as successful launch occurs.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -path\fR,\fP --path \fR<path>\fP
|
||||||
|
<path> that will be used when attempting to locate requested executables.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --prefix \fR<dir>\fP
|
||||||
|
Prefix directory that will be used to set the \fIPATH\fR and
|
||||||
|
\fILD_LIBRARY_PATH\fR on the remote node before invoking Open MPI or
|
||||||
|
the target process. See the "Remote Execution" section, below.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -q\fR,\fP --quiet
|
||||||
|
Suppress informative messages from orterun during application execution.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --tmpdir \fR<dir>\fP
|
||||||
|
Set the root for the session directory tree for mpirun only.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -tv\fR,\fP --tv
|
||||||
|
Launch processes under the TotalView debugger.
|
||||||
|
Deprecated backwards compatibility flag. Synonym for \fI--debug\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --universe \fR<username@hostname:universe_name>\fP
|
||||||
|
For this application, set the universe name as:
|
||||||
|
username@hostname:universe_name
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -v\fR,\fP --verbose
|
||||||
|
Be verbose
|
||||||
|
.TP
|
||||||
|
.B -V\fR,\fP --version
|
||||||
|
Print version number. If no other arguments are given, this will also
|
||||||
|
cause orterun to exit.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -wd \fR<dir>\fP
|
||||||
|
Change to the directory <dir> before the user's program executes.
|
||||||
|
See the "Current Working Directory" section for notes on relative paths.
|
||||||
|
.B Note:
|
||||||
|
If the \fI-wd\fP option appears both on the command line and in an
|
||||||
|
application context, the context will take precedence over the command line.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -x \fR<env>\fP
|
||||||
|
Export the specified environment variables to the remote nodes before
|
||||||
|
executing the program. Existing environment variables can be
|
||||||
|
specified (see the Examples section, below), or new variable names
|
||||||
|
specified with corresponding values. The parser for the \fI-x\fP
|
||||||
|
option is not very sophisticated; it does not even understand quoted
|
||||||
|
values. Users are advised to set variables in the environment, and
|
||||||
|
then use \fI-x\fP to export (not define) them.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.P
|
||||||
|
The following options are useful for developers; they are not generally
|
||||||
|
useful to most ORTE and/or MPI users:
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -d\fR,\fP --debug-devel
|
||||||
|
Enable debugging of the OpenRTE (the run-time layer in Open MPI).
|
||||||
|
This is not generally useful for most users.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --debug-daemons
|
||||||
|
Enable debugging of any OpenRTE daemons used by this application.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --debug-daemons-file
|
||||||
|
Enable debugging of any OpenRTE daemons used by this application, storing
|
||||||
|
output in files.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --no-daemonize
|
||||||
|
Do not detach OpenRTE daemons used by this application.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Description Section
|
||||||
|
.\" **************************
|
||||||
|
.SH DESCRIPTION
|
||||||
|
.
|
||||||
|
One invocation of \fImpirun\fP starts an MPI application running under Open
|
||||||
|
MPI. If the application is single process multiple data (SPMD), the application
|
||||||
|
can be specified on the \fImpirun\fP command line.
|
||||||
|
|
||||||
|
If the application is multiple instruction multiple data (MIMD), comprising
|
||||||
|
multiple programs, the set of programs and arguments can be specified in one of
|
||||||
|
two ways: Extended Command Line Arguments, and Application Context.
|
||||||
|
.PP
|
||||||
|
An application context describes the MIMD program set including all arguments
|
||||||
|
in a separate file.
|
||||||
|
.\"See appcontext(5) for a description of the application context syntax.
|
||||||
|
This file essentially contains multiple \fImpirun\fP command lines, less the
|
||||||
|
command name itself. The ability to specify different options for different
|
||||||
|
instantiations of a program is another reason to use an application context.
|
||||||
|
.PP
|
||||||
|
Extended command line arguments allow for the description of the application
|
||||||
|
layout on the command line using colons (\fI:\fP) to separate the specification
|
||||||
|
of programs and arguments. Some options are globally set across all specified
|
||||||
|
programs (e.g. --hostfile), while others are specific to a single program
|
||||||
|
(e.g. -np).
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Process Slots
|
||||||
|
.
|
||||||
|
Open MPI uses "slots" to represent a potential location for a process.
|
||||||
|
Hence, a node with 2 slots means that 2 processes can be launched on
|
||||||
|
that node. For performance, the community typically equates a "slot"
|
||||||
|
with a physical CPU, thus ensuring that any process assigned to that
|
||||||
|
slot has a dedicated processor. This is not, however, a requirement for
|
||||||
|
the operation of Open MPI.
|
||||||
|
.PP
|
||||||
|
Slots can be specified in hostfiles after the hostname. For example:
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
host1.example.com slots=4
|
||||||
|
Indicates that there are 4 process slots on host1.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
If no slots value is specified, then Open MPI will automatically assign
|
||||||
|
a default value of "slots=1" to that host.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
When running under resource managers (e.g., SLURM, Torque, etc.), Open
|
||||||
|
MPI will obtain both the hostnames and the number of slots directly
|
||||||
|
from the resource manager. For example, if running under a SLURM job,
|
||||||
|
Open MPI will automatically receive the hosts that SLURM has allocated
|
||||||
|
to the job as well as how many slots on each node that SLURM says
|
||||||
|
are usable - in most high-performance environments, the slots will
|
||||||
|
equate to the number of processors on the node.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
When deciding where to launch processes, Open MPI will first fill up
|
||||||
|
all available slots before oversubscribing (see "Location
|
||||||
|
Nomenclature", below, for more details on the scheduling algorithms
|
||||||
|
available). Unless told otherwise, Open MPI will arbitrarily
|
||||||
|
oversubscribe nodes. For example, if the only node available is the
|
||||||
|
localhost, Open MPI will run as many processes as specified by the
|
||||||
|
-n (or one of its variants) command line option on the
|
||||||
|
localhost (although they may run quite slowly, since they'll all be
|
||||||
|
competing for CPU and other resources).
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
Limits can be placed on oversubscription with the "max_slots"
|
||||||
|
attribute in the hostfile. For example:
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
host2.example.com slots=4 max_slots=6
|
||||||
|
Indicates that there are 4 process slots on host2. Further, Open MPI
|
||||||
|
is limited to launching a maximum of 6 processes on host2.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
host3.example.com slots=2 max_slots=2
|
||||||
|
Indicates that there are 2 process slots on host3 and that no
|
||||||
|
oversubscription is allowed (similar to the \fI--nooversubscribe\fR
|
||||||
|
option).
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
host4.example.com max_slots=2
|
||||||
|
Shorthand; same as listing "slots=2 max_slots=2".
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
Note that Open MPI's support for resource managers does not currently
|
||||||
|
set the "max_slots" values for hosts. If you wish to prevent
|
||||||
|
oversubscription in such scenarios, use the \fI--nooversubscribe\fR
|
||||||
|
option.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
In scenarios where the user wishes to launch an application across
|
||||||
|
all available slots by not providing a "-n" option on the mpirun
|
||||||
|
command line, Open MPI will launch a process on each process slot
|
||||||
|
for each host within the provided environment. For example, if a
|
||||||
|
hostfile has been provided, then Open MPI will spawn processes
|
||||||
|
on each identified host up to the "slots=x" limit if oversubscription
|
||||||
|
is not allowed. If oversubscription is allowed (the default), then
|
||||||
|
Open MPI will spawn processes on each host up to the "max_slots=y" limit
|
||||||
|
if that value is provided. In all cases, the "-bynode" and "-byslot"
|
||||||
|
mapping directives will be enforced to ensure proper placement of
|
||||||
|
process ranks.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Location Nomenclature
|
||||||
|
.
|
||||||
|
As described above, \fImpirun\fP can specify arbitrary locations in
|
||||||
|
the current Open MPI universe. Locations can be specified either by
|
||||||
|
CPU or by node.
|
||||||
|
|
||||||
|
.B Note:
|
||||||
|
This nomenclature does not force Open MPI to bind processes to CPUs --
|
||||||
|
specifying a location "by CPU" is really a convenience mechanism for
|
||||||
|
SMPs that ultimately maps down to a specific node.
|
||||||
|
.PP
|
||||||
|
Specifying locations by node will launch one copy of an executable per
|
||||||
|
specified node.
|
||||||
|
Using the \fI--bynode\fP option tells Open MPI to use all available nodes.
|
||||||
|
Using the \fI--byslot\fP option tells Open MPI to use all slots on an available
|
||||||
|
node before allocating resources on the next available node.
|
||||||
|
For example:
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
mpirun --bynode -np 4 a.out
|
||||||
|
Runs one copy of the executable
|
||||||
|
.I a.out
|
||||||
|
on all available nodes in the Open MPI universe. MPI_COMM_WORLD rank 0
|
||||||
|
will be on node0, rank 1 will be on node1, etc., regardless of how many slots
|
||||||
|
are available on each of the nodes.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun --byslot -np 4 a.out
|
||||||
|
Runs one copy of the executable
|
||||||
|
.I a.out
|
||||||
|
on each slot on a given node before running the executable on other available
|
||||||
|
nodes.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Specifying Hosts
|
||||||
|
.
|
||||||
|
Hosts can be specified in a number of ways. The most common is in a
|
||||||
|
\&'hostfile' or 'machinefile'. If our hostfile contains the following information:
|
||||||
|
.
|
||||||
|
.
|
||||||
|
|
||||||
|
\fBshell$\fP cat my-hostfile
|
||||||
|
node00 slots=2
|
||||||
|
node01 slots=2
|
||||||
|
node02 slots=2
|
||||||
|
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun --hostfile my-hostfile -np 3 a.out
|
||||||
|
This will run one copy of the executable
|
||||||
|
.I a.out
|
||||||
|
on hosts node00, node01, and node02.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
Another method for specifying hosts is directly on the command line. Here one
|
||||||
|
can include and exclude hosts from the set of hosts to run on. For example:
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 3 --host a a.out
|
||||||
|
Runs three copies of the executable
|
||||||
|
.I a.out
|
||||||
|
on host a.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 3 --host a,b,c a.out
|
||||||
|
Runs one copy of the executable
|
||||||
|
.I a.out
|
||||||
|
on hosts a, b, and c.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 3 --hostfile my-hostfile --host node00 a.out
|
||||||
|
Runs three copies of the executable
|
||||||
|
.I a.out
|
||||||
|
on host node00.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 3 --hostfile my-hostfile --host node10 a.out
|
||||||
|
This will prompt an error since node10 is not in my-hostfile; mpirun will
|
||||||
|
abort.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
shell$ mpirun -np 1 --host a hostname : -np 2 --host b,c uptime
|
||||||
|
Runs one copy of the executable
|
||||||
|
.I hostname
|
||||||
|
on host a. And runs one copy of the executable
|
||||||
|
.I uptime
|
||||||
|
on hosts b and c.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS No Local Launch
|
||||||
|
.
|
||||||
|
Using the \fB--nolocal\fR option to orterun tells the system to not
|
||||||
|
launch any of the application processes on the same node that orterun
|
||||||
|
is running. While orterun typically blocks and consumes few system
|
||||||
|
resources, this option can be helpful for launching very large jobs
|
||||||
|
where orterun may actually need to use noticeable amounts of memory
|
||||||
|
and/or processing time. \fB--nolocal\fR allows orterun to run without
|
||||||
|
sharing the local node with the launched applications, and likewise
|
||||||
|
allows the launched applications to run unhindered by orterun's system
|
||||||
|
usage.
|
||||||
|
.PP
|
||||||
|
Note that \fB--nolocal\fR will override any other specification to
|
||||||
|
launch the application on the local node. It will disqualify the
|
||||||
|
localhost from being capable of running any processes in the
|
||||||
|
application.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
shell$ mpirun -np 1 --host localhost --nolocal hostname
|
||||||
|
This example will result in an error because orterun will not find
|
||||||
|
anywhere to launch the application.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS No Oversubscription
|
||||||
|
.
|
||||||
|
Using the \fI--nooversubscribe\fR option causes Open MPI to implicitly
|
||||||
|
set the "max_slots" value to be the same as the "slots" value for each
|
||||||
|
node. This can be especially helpful when running jobs under a
|
||||||
|
resource manager because Open MPI currently only sets the "slots"
|
||||||
|
value for each node that it obtains from the resource manager.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Application Context or Executable Program?
|
||||||
|
.
|
||||||
|
To distinguish the two different forms, \fImpirun\fP
|
||||||
|
looks on the command line for \fI--app\fP option. If
|
||||||
|
it is specified, then the file named on the command line is
|
||||||
|
assumed to be an application context. If it is not
|
||||||
|
specified, then the file is assumed to be an executable program.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Locating Files
|
||||||
|
.
|
||||||
|
If \fIno\fP relative or absolute path is specified for a file, Open MPI
|
||||||
|
will look for files by searching the directories in the user's PATH environment
|
||||||
|
variable as defined on the source node(s).
|
||||||
|
.PP
|
||||||
|
If a relative directory is specified, it must be relative to the initial
|
||||||
|
working directory determined by the specific starter used. For example when
|
||||||
|
using the rsh or ssh starters, the initial directory is $HOME by default. Other
|
||||||
|
starters may set the initial directory to the current working directory from
|
||||||
|
the invocation of \fImpirun\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Current Working Directory
|
||||||
|
.
|
||||||
|
The \fI\-wd\fP mpirun option allows the user to change to an arbitrary
|
||||||
|
directory before their program is invoked. It can also be used in application
|
||||||
|
context files to specify working directories on specific nodes and/or
|
||||||
|
for specific applications.
|
||||||
|
.PP
|
||||||
|
If the \fI\-wd\fP option appears both in a context file and on the command line,
|
||||||
|
the context file directory will override the command line value.
|
||||||
|
.PP
|
||||||
|
If the \fI-wd\fP option is specified, Open MPI will attempt to change to the
|
||||||
|
specified directory on all of the remote nodes. If this fails, \fImpirun\fP
|
||||||
|
will abort.
|
||||||
|
.PP
|
||||||
|
If the \fI-wd\fP option is \fBnot\fP specified, Open MPI will send the
|
||||||
|
directory name where \fImpirun\fP was invoked to each of the remote nodes. The
|
||||||
|
remote nodes will try to change to that directory. If they are unable (e.g., if
|
||||||
|
the directory does not exist on that node), then Open MPI will use the default
|
||||||
|
directory determined by the starter.
|
||||||
|
.PP
|
||||||
|
All directory changing occurs before the user's program is invoked; it
|
||||||
|
does not wait until \fIMPI_INIT\fP is called.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Standard I/O
|
||||||
|
.
|
||||||
|
Open MPI directs UNIX standard input to /dev/null on all processes
|
||||||
|
except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0 process
|
||||||
|
inherits standard input from \fImpirun\fP.
|
||||||
|
.B Note:
|
||||||
|
The node that invoked \fImpirun\fP need not be the same as the node where the
|
||||||
|
MPI_COMM_WORLD rank 0 process resides. Open MPI handles the redirection of
|
||||||
|
\fImpirun\fP's standard input to the rank 0 process.
|
||||||
|
.PP
|
||||||
|
Open MPI directs UNIX standard output and error from remote nodes to the node
|
||||||
|
that invoked \fImpirun\fP and prints it on the standard output/error of
|
||||||
|
\fImpirun\fP.
|
||||||
|
Local processes inherit the standard output/error of \fImpirun\fP and transfer
|
||||||
|
to it directly.
|
||||||
|
.PP
|
||||||
|
Thus it is possible to redirect standard I/O for Open MPI applications by
|
||||||
|
using the typical shell redirection procedure on \fImpirun\fP.
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun -np 2 my_app < my_input > my_output
|
||||||
|
|
||||||
|
Note that in this example \fIonly\fP the MPI_COMM_WORLD rank 0 process will
|
||||||
|
receive the stream from \fImy_input\fP on stdin. The stdin on all the other
|
||||||
|
nodes will be tied to /dev/null. However, the stdout from all nodes will
|
||||||
|
be collected into the \fImy_output\fP file.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Signal Propagation
|
||||||
|
.
|
||||||
|
When orterun receives a SIGTERM or SIGINT, it will attempt to kill
|
||||||
|
the entire job by sending all processes in the job a SIGTERM, waiting
|
||||||
|
a small number of seconds, then sending all processes in the job a
|
||||||
|
SIGKILL.
|
||||||
|
.
|
||||||
|
SIGUSR1 and SIGUSR2 signals received by orterun are propagated to
|
||||||
|
all processes in the job. Other signals are not currently propagated
|
||||||
|
by orterun.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Process Termination / Signal Handling
|
||||||
|
.
|
||||||
|
During the run of an MPI application, if any rank dies abnormally
|
||||||
|
(either exiting before invoking \fIMPI_FINALIZE\fP, or dying as the result of a
|
||||||
|
signal), \fImpirun\fP will print out an error message and kill the rest of the
|
||||||
|
MPI application.
|
||||||
|
.PP
|
||||||
|
User signal handlers should probably avoid trying to cleanup MPI state
|
||||||
|
(Open MPI is, currently, neither thread-safe nor async-signal-safe).
|
||||||
|
For example, if a segmentation fault occurs in \fIMPI_SEND\fP (perhaps because
|
||||||
|
a bad buffer was passed in) and a user signal handler is invoked, if this user
|
||||||
|
handler attempts to invoke \fIMPI_FINALIZE\fP, Bad Things could happen since
|
||||||
|
Open MPI was already "in" MPI when the error occurred. Since \fImpirun\fP
|
||||||
|
will notice that the process died due to a signal, it is probably not
|
||||||
|
necessary to clean up MPI state; it is safest for the user to only clean up non-MPI state.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Process Environment
|
||||||
|
.
|
||||||
|
Processes in the MPI application inherit their environment from the
|
||||||
|
Open RTE daemon upon the node on which they are running. The
|
||||||
|
environment is typically inherited from the user's shell. On remote
|
||||||
|
nodes, the exact environment is determined by the boot MCA module
|
||||||
|
used. The \fIrsh\fR launch module, for example, uses either
|
||||||
|
\fIrsh\fR/\fIssh\fR to launch the Open RTE daemon on remote nodes, and
|
||||||
|
typically executes one or more of the user's shell-setup files before
|
||||||
|
launching the Open RTE daemon. When running dynamically linked
|
||||||
|
applications which require the \fILD_LIBRARY_PATH\fR environment
|
||||||
|
variable to be set, care must be taken to ensure that it is correctly
|
||||||
|
set when booting Open MPI.
|
||||||
|
.PP
|
||||||
|
See the "Remote Execution" section for more details.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Remote Execution
|
||||||
|
.
|
||||||
|
Open MPI requires that the \fIPATH\fR environment variable be set to
|
||||||
|
find executables on remote nodes (this is typically only necessary in
|
||||||
|
\fIrsh\fR- or \fIssh\fR-based environments -- batch/scheduled
|
||||||
|
environments typically copy the current environment to the execution
|
||||||
|
of remote jobs, so if the current environment has \fIPATH\fR and/or
|
||||||
|
\fILD_LIBRARY_PATH\fR set properly, the remote nodes will also have it
|
||||||
|
set properly). If Open MPI was compiled with shared library support,
|
||||||
|
it may also be necessary to have the \fILD_LIBRARY_PATH\fR environment
|
||||||
|
variable set on remote nodes as well (especially to find the shared
|
||||||
|
libraries required to run user MPI applications).
|
||||||
|
.PP
|
||||||
|
However, it is not always desirable or possible to edit shell
|
||||||
|
startup files to set \fIPATH\fR and/or \fILD_LIBRARY_PATH\fR. The
|
||||||
|
\fI--prefix\fR option is provided for some simple configurations where
|
||||||
|
this is not possible.
|
||||||
|
.PP
|
||||||
|
The \fI--prefix\fR option takes a single argument: the base directory
|
||||||
|
on the remote node where Open MPI is installed. Open MPI will use
|
||||||
|
this directory to set the remote \fIPATH\fR and \fILD_LIBRARY_PATH\fR
|
||||||
|
before executing any Open MPI or user applications. This allows
|
||||||
|
running Open MPI jobs without having pre-configured the \fIPATH\fR and
|
||||||
|
\fILD_LIBRARY_PATH\fR on the remote nodes.
|
||||||
|
.PP
|
||||||
|
Open MPI adds the basename of the current
|
||||||
|
node's "bindir" (the directory where Open MPI's executables are
|
||||||
|
installed) to the prefix and uses that to set the \fIPATH\fR on the
|
||||||
|
remote node. Similarly, Open MPI adds the basename of the current
|
||||||
|
node's "libdir" (the directory where Open MPI's libraries are
|
||||||
|
installed) to the prefix and uses that to set the
|
||||||
|
\fILD_LIBRARY_PATH\fR on the remote node. For example:
|
||||||
|
.TP 15
|
||||||
|
Local bindir:
|
||||||
|
/local/node/directory/bin
|
||||||
|
.TP
|
||||||
|
Local libdir:
|
||||||
|
/local/node/directory/lib64
|
||||||
|
.PP
|
||||||
|
If the following command line is used:
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun --prefix /remote/node/directory
|
||||||
|
|
||||||
|
Open MPI will add "/remote/node/directory/bin" to the \fIPATH\fR
|
||||||
|
and "/remote/node/directory/lib64" to the \fILD_LIBRARY_PATH\fR on the
|
||||||
|
remote node before attempting to execute anything.
|
||||||
|
.PP
|
||||||
|
Note that \fI--prefix\fR can be set on a per-context basis, allowing
|
||||||
|
for different values for different nodes.
|
||||||
|
.PP
|
||||||
|
The \fI--prefix\fR option is not sufficient if the installation paths
|
||||||
|
on the remote node are different than the local node (e.g., if "/lib"
|
||||||
|
is used on the local node, but "/lib64" is used on the remote node),
|
||||||
|
or if the installation paths are something other than a subdirectory
|
||||||
|
under a common prefix.
|
||||||
|
.PP
|
||||||
|
Note that executing \fImpirun\fR via an absolute pathname is
|
||||||
|
equivalent to specifying \fI--prefix\fR without the last subdirectory
|
||||||
|
in the absolute pathname to \fImpirun\fR. For example:
|
||||||
|
|
||||||
|
\fBshell$\fP /usr/local/bin/mpirun ...
|
||||||
|
|
||||||
|
is equivalent to
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun --prefix /usr/local
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Exported Environment Variables
|
||||||
|
.
|
||||||
|
All environment variables that are named in the form OMPI_* will automatically
|
||||||
|
be exported to new processes on the local and remote nodes.
|
||||||
|
The \fI\-x\fP option to \fImpirun\fP can be used to export specific environment
|
||||||
|
variables to the new processes. While the syntax of the \fI\-x\fP
|
||||||
|
option allows the definition of new variables, note that the parser
|
||||||
|
for this option is currently not very sophisticated - it does not even
|
||||||
|
understand quoted values. Users are advised to set variables in the
|
||||||
|
environment and use \fI\-x\fP to export them; not to define them.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS MCA (Modular Component Architecture)
|
||||||
|
.
|
||||||
|
The \fI-mca\fP switch allows the passing of parameters to various MCA modules.
|
||||||
|
.\" Open MPI's MCA modules are described in detail in ompimca(7).
|
||||||
|
MCA modules have direct impact on MPI programs because they allow tunable
|
||||||
|
parameters to be set at run time (such as which BTL communication device driver
|
||||||
|
to use, what parameters to pass to that BTL, etc.).
|
||||||
|
.PP
|
||||||
|
The \fI-mca\fP switch takes two arguments: \fI<key>\fP and \fI<value>\fP.
|
||||||
|
The \fI<key>\fP argument generally specifies which MCA module will receive the value.
|
||||||
|
For example, the \fI<key>\fP "btl" is used to select which BTL to be used for
|
||||||
|
transporting MPI messages. The \fI<value>\fP argument is the value that is
|
||||||
|
passed.
|
||||||
|
For example:
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
mpirun -mca btl tcp,self -np 1 foo
|
||||||
|
Tells Open MPI to use the "tcp" and "self" BTLs, and to run a single copy of
|
||||||
|
"foo" on an allocated node.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -mca btl self -np 1 foo
|
||||||
|
Tells Open MPI to use the "self" BTL, and to run a single copy of "foo" on an
|
||||||
|
allocated node.
|
||||||
|
.\" And so on. Open MPI's BTL MCA modules are described in ompimca_btl(7).
|
||||||
|
.PP
|
||||||
|
The \fI-mca\fP switch can be used multiple times to specify different
|
||||||
|
\fI<key>\fP and/or \fI<value>\fP arguments. If the same \fI<key>\fP is
|
||||||
|
specified more than once, the \fI<value>\fPs are concatenated with a comma
|
||||||
|
(",") separating them.
|
||||||
|
.PP
|
||||||
|
.B Note:
|
||||||
|
The \fI-mca\fP switch is simply a shortcut for setting environment variables.
|
||||||
|
The same effect may be accomplished by setting corresponding environment
|
||||||
|
variables before running \fImpirun\fP.
|
||||||
|
The form of the environment variables that Open MPI sets is:
|
||||||
|
|
||||||
|
OMPI_MCA_<key>=<value>
|
||||||
|
.PP
|
||||||
|
Note that the \fI-mca\fP switch overrides any previously set environment
|
||||||
|
variables. Also note that unknown \fI<key>\fP arguments are still set as
|
||||||
|
environment variables -- they are not checked (by \fImpirun\fP) for correctness.
|
||||||
|
Illegal or incorrect \fI<value>\fP arguments may or may not be reported -- it
|
||||||
|
depends on the specific MCA module.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Examples Section
|
||||||
|
.\" **************************
|
||||||
|
.SH EXAMPLES
|
||||||
|
Be sure to also see the examples in the "Location Nomenclature" section, above.
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
mpirun -np 1 prog1
|
||||||
|
Load and execute prog1 on one node. Search the user's $PATH for the
|
||||||
|
executable file on each node.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 8 --byslot prog1
|
||||||
|
Run 8 copies of prog1 wherever Open MPI wants to run them.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 4 -mca btl ib,tcp,self prog1
|
||||||
|
Run 4 copies of prog1 using the "ib", "tcp", and "self" BTLs for the transport
|
||||||
|
of MPI messages.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Diagnostics Section
|
||||||
|
.\" **************************
|
||||||
|
.
|
||||||
|
.\" .SH DIAGNOSTICS
|
||||||
|
.\".TP 4
|
||||||
|
.\"Error Msg:
|
||||||
|
.\"Description
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Return Value Section
|
||||||
|
.\" **************************
|
||||||
|
.
|
||||||
|
.SH RETURN VALUE
|
||||||
|
.
|
||||||
|
\fImpirun\fP returns 0 if all ranks started by \fImpirun\fP exit after calling
|
||||||
|
MPI_FINALIZE. A non-zero value is returned if an internal error occurred in
|
||||||
|
mpirun, or one or more ranks exited before calling MPI_FINALIZE. If an
|
||||||
|
internal error occurred in mpirun, the corresponding error code is returned.
|
||||||
|
In the event that one or more ranks exit before calling MPI_FINALIZE, the
|
||||||
|
return value of the rank of the process that \fImpirun\fP first notices died
|
||||||
|
before calling MPI_FINALIZE will be returned. Note that, in general, this will
|
||||||
|
be the first rank that died but is not guaranteed to be so.
|
||||||
|
.PP
|
||||||
|
However, note that if the \fI-nw\fP switch is used, the return value from
|
||||||
|
mpirun does not indicate the exit status of the ranks.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" See Also Section
|
||||||
|
.\" **************************
|
||||||
|
.
|
||||||
|
.\" .SH SEE ALSO
|
||||||
|
.\" orted(1)
|
177
orte/tools/ortehalt/ortehalt.c
Обычный файл
177
orte/tools/ortehalt/ortehalt.c
Обычный файл
@ -0,0 +1,177 @@
|
|||||||
|
/* -*- C -*-
|
||||||
|
*
|
||||||
|
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
#include "orte/orte_constants.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#ifdef HAVE_UNISTD_H
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_SYS_PARAM_H
|
||||||
|
#include <sys/param.h>
|
||||||
|
#endif
|
||||||
|
#include <errno.h>
|
||||||
|
#include <signal.h>
|
||||||
|
#include <ctype.h>
|
||||||
|
#ifdef HAVE_SYS_TYPES_H
|
||||||
|
#include <sys/types.h>
|
||||||
|
#endif /* HAVE_SYS_TYPES_H */
|
||||||
|
#ifdef HAVE_SYS_WAIT_H
|
||||||
|
#include <sys/wait.h>
|
||||||
|
#endif /* HAVE_SYS_WAIT_H */
|
||||||
|
#ifdef HAVE_LIBGEN_H
|
||||||
|
#include <libgen.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "opal/event/event.h"
|
||||||
|
#include "opal/install_dirs.h"
|
||||||
|
#include "opal/mca/base/base.h"
|
||||||
|
#include "opal/threads/condition.h"
|
||||||
|
#include "opal/util/argv.h"
|
||||||
|
#include "opal/util/basename.h"
|
||||||
|
#include "opal/util/cmd_line.h"
|
||||||
|
#include "opal/util/opal_environ.h"
|
||||||
|
#include "opal/util/output.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
|
#include "opal/util/trace.h"
|
||||||
|
#include "opal/version.h"
|
||||||
|
|
||||||
|
#include "orte/class/orte_pointer_array.h"
|
||||||
|
#include "orte/util/proc_info.h"
|
||||||
|
#include "orte/util/sys_info.h"
|
||||||
|
#include "orte/util/universe_setup_file_io.h"
|
||||||
|
|
||||||
|
#include "orte/mca/ns/ns.h"
|
||||||
|
#include "orte/mca/gpr/gpr.h"
|
||||||
|
#include "orte/mca/pls/pls.h"
|
||||||
|
#include "orte/mca/rmaps/rmaps_types.h"
|
||||||
|
#include "orte/mca/rmgr/rmgr.h"
|
||||||
|
#include "orte/mca/schema/schema.h"
|
||||||
|
#include "orte/mca/smr/smr.h"
|
||||||
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
|
|
||||||
|
#include "orte/runtime/runtime.h"
|
||||||
|
#include "orte/runtime/orte_wait.h"
|
||||||
|
|
||||||
|
/* Name of this executable, derived from argv[0] in main(); it is passed to
 * opal_show_help() for error messages and freed before main() returns. */
static char *orte_basename = NULL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* setup globals for catching orterun command line options
|
||||||
|
*/
|
||||||
|
struct globals_t {
|
||||||
|
bool help;
|
||||||
|
bool version;
|
||||||
|
bool verbose;
|
||||||
|
bool quiet;
|
||||||
|
bool exit;
|
||||||
|
int exit_status;
|
||||||
|
char *wdir;
|
||||||
|
char *path;
|
||||||
|
opal_mutex_t lock;
|
||||||
|
opal_condition_t cond;
|
||||||
|
} ortehalt_globals;
|
||||||
|
|
||||||
|
|
||||||
|
opal_cmd_line_init_t cmd_line_init[] = {
|
||||||
|
/* Various "obvious" options */
|
||||||
|
{ NULL, NULL, NULL, 'h', NULL, "help", 0,
|
||||||
|
&ortehalt_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"This help message" },
|
||||||
|
{ NULL, NULL, NULL, 'V', NULL, "version", 0,
|
||||||
|
&ortehalt_globals.version, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Print version and exit" },
|
||||||
|
{ NULL, NULL, NULL, 'v', NULL, "verbose", 0,
|
||||||
|
&ortehalt_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Be verbose" },
|
||||||
|
{ NULL, NULL, NULL, 'q', NULL, "quiet", 0,
|
||||||
|
&ortehalt_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Suppress helpful messages" },
|
||||||
|
|
||||||
|
/* OpenRTE arguments */
|
||||||
|
{ "orte", "debug", NULL, 'd', NULL, "debug-devel", 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Enable debugging of OpenRTE" },
|
||||||
|
|
||||||
|
{ "universe", NULL, NULL, '\0', NULL, "universe", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Set the universe name as username@hostname:universe_name for this application" },
|
||||||
|
|
||||||
|
{ NULL, NULL, NULL, '\0', NULL, "tmpdir", 1,
|
||||||
|
&orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Set the root for the session directory tree for orterun ONLY" },
|
||||||
|
|
||||||
|
/* End of list */
|
||||||
|
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||||
|
};
|
||||||
|
|
||||||
|
#if !defined(__WINDOWS__)
|
||||||
|
extern char** environ;
|
||||||
|
#endif /* !defined(__WINDOWS__) */
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
int rc;
|
||||||
|
int id, iparam;
|
||||||
|
|
||||||
|
/* Setup MCA params */
|
||||||
|
|
||||||
|
mca_base_param_init();
|
||||||
|
orte_register_params(false);
|
||||||
|
|
||||||
|
/* find our basename (the name of the executable) so that we can
|
||||||
|
use it in pretty-print error messages */
|
||||||
|
orte_basename = opal_basename(argv[0]);
|
||||||
|
|
||||||
|
/* check for daemon flags and push them into the environment
|
||||||
|
* since this isn't being automatically done
|
||||||
|
*/
|
||||||
|
id = mca_base_param_reg_int_name("orte", "debug",
|
||||||
|
"Top-level ORTE debug switch",
|
||||||
|
false, false, 0, &iparam);
|
||||||
|
if (iparam) {
|
||||||
|
char *tmp = mca_base_param_environ_variable("orte", NULL, "debug");
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||||
|
opal_show_help("help-ortehalt.txt", "ortehalt:environ", false,
|
||||||
|
orte_basename, tmp, "1", rc);
|
||||||
|
free(tmp);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
free(tmp);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Intialize our Open RTE environment */
|
||||||
|
/* Set the flag telling orte_init that I am NOT a
|
||||||
|
* singleton, but am "infrastructure" - prevents setting
|
||||||
|
* up incorrect infrastructure that only a singleton would
|
||||||
|
* require
|
||||||
|
*/
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_init(true))) {
|
||||||
|
opal_show_help("help-orterun.txt", "orterun:init-failure", true,
|
||||||
|
"orte_init()", rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
orte_finalize();
|
||||||
|
free(orte_basename);
|
||||||
|
return rc;
|
||||||
|
}
|
0
orte/tools/ortekill/.ompi_ignore
Обычный файл
0
orte/tools/ortekill/.ompi_ignore
Обычный файл
1
orte/tools/ortekill/.ompi_unignore
Обычный файл
1
orte/tools/ortekill/.ompi_unignore
Обычный файл
@ -0,0 +1 @@
|
|||||||
|
rhc
|
39
orte/tools/ortekill/Makefile.am
Обычный файл
39
orte/tools/ortekill/Makefile.am
Обычный файл
@ -0,0 +1,39 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
libs = \
|
||||||
|
$(top_builddir)/orte/liborte.la
|
||||||
|
|
||||||
|
ortekill_SOURCES = \
|
||||||
|
ortekill.c
|
||||||
|
|
||||||
|
ortekill_LDADD = $(libs)
|
||||||
|
ortekill_DEPENDENCIES = $(libs)
|
||||||
|
|
||||||
|
if OMPI_INSTALL_BINARIES
|
||||||
|
|
||||||
|
bin_PROGRAMS = ortekill
|
||||||
|
|
||||||
|
dist_pkgdata_DATA = help-ortekill.txt
|
||||||
|
|
||||||
|
# AM 1.9.6 seems to have a bug in its dependencies for install-man if
|
||||||
|
#dist_ and nodist_ are used, so explicitly add to EXTRA_DIST...
|
||||||
|
man_MANS = ortekill.1
|
||||||
|
EXTRA_DIST = ortekill.1
|
||||||
|
|
||||||
|
endif
|
130
orte/tools/ortekill/help-ortekill.txt
Обычный файл
130
orte/tools/ortekill/help-ortekill.txt
Обычный файл
@ -0,0 +1,130 @@
|
|||||||
|
# -*- text -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
# This is the US/English general help file for Open RTE's orterun.
|
||||||
|
#
|
||||||
|
[orterun:init-failure]
|
||||||
|
Open RTE was unable to initialize properly. The error occurred while
|
||||||
|
attempting to %s. Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
[orterun:usage]
|
||||||
|
%s (%s) %s
|
||||||
|
|
||||||
|
Usage: %s [OPTION]... [PROGRAM]...
|
||||||
|
Start the given program using Open RTE
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
Report bugs to %s
|
||||||
|
[orterun:version]
|
||||||
|
%s (%s) %s
|
||||||
|
|
||||||
|
Report bugs to %s
|
||||||
|
[orterun:allocate-resources]
|
||||||
|
%s was unable to allocate enough resources to start your application.
|
||||||
|
This might be a transient error (too many nodes in the cluster were
|
||||||
|
unavailable at the time of the request) or a permanent error (you
|
||||||
|
requested more nodes than exist in your cluster).
|
||||||
|
|
||||||
|
While probably only useful to Open RTE developers, the error returned
|
||||||
|
was %d.
|
||||||
|
[orterun:error-spawning]
|
||||||
|
%s was unable to start the specified application. An attempt has been
|
||||||
|
made to clean up all processes that did start. The error returned was
|
||||||
|
%d.
|
||||||
|
[orterun:appfile-not-found]
|
||||||
|
Unable to open the appfile:
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
Double check that this file exists and is readable.
|
||||||
|
[orterun:executable-not-specified]
|
||||||
|
No executable was specified on the %s command line.
|
||||||
|
|
||||||
|
Aborting.
|
||||||
|
[orterun:multi-apps-and-zero-np]
|
||||||
|
%s found multiple applications specified on the command line, with
|
||||||
|
at least one that failed to specify the number of processes to execute.
|
||||||
|
When specifying multiple applications, you must specify how many processes
|
||||||
|
of each to launch via the -np argument.
|
||||||
|
[orterun:nothing-to-do]
|
||||||
|
%s could not find anything to do.
|
||||||
|
|
||||||
|
It is possible that you forgot to specify how many processes to run
|
||||||
|
via the "-np" argument.
|
||||||
|
[orterun:call-failed]
|
||||||
|
%s encountered a %s call failure. This should not happen, and
|
||||||
|
usually indicates an error within the operating system itself.
|
||||||
|
Specifically, the following error occurred:
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
The only other available information that may be helpful is the errno
|
||||||
|
that was returned: %d.
|
||||||
|
[orterun:environ]
|
||||||
|
%s was unable to set
|
||||||
|
%s = %s
|
||||||
|
in the environment. Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
[orterun:precondition]
|
||||||
|
%s was unable to precondition transports
|
||||||
|
Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
[orterun:attr-failed]
|
||||||
|
%s was unable to define an attribute
|
||||||
|
Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
[orterun:proc-aborted]
|
||||||
|
%s noticed that job rank %lu with PID %lu on node %s exited on signal %d.
|
||||||
|
[orterun:abnormal-exit]
|
||||||
|
WARNING: %s encountered an abnormal exit.
|
||||||
|
|
||||||
|
This means that %s exited before it received notification that all
|
||||||
|
started processes had terminated. You should double check and ensure
|
||||||
|
that there are no runaway processes still executing.
|
||||||
|
[orterun:empty-prefix]
|
||||||
|
A prefix was supplied to %s that only contained slashes.
|
||||||
|
|
||||||
|
This is a fatal error; %s will now abort. No processes were launched.
|
||||||
|
#
|
||||||
|
[debugger-mca-param-not-found]
|
||||||
|
Internal error -- the orte_base_debugger MCA parameter was not able to
|
||||||
|
be found. Please contact the Open RTE developers; this should not
|
||||||
|
happen.
|
||||||
|
#
|
||||||
|
[debugger-orte_base_user_debugger-empty]
|
||||||
|
The MCA parameter "orte_base_user_debugger" was empty, indicating that
|
||||||
|
no user-level debuggers have been defined. Please set this MCA
|
||||||
|
parameter to a value and try again.
|
||||||
|
#
|
||||||
|
[debugger-not-found]
|
||||||
|
A suitable debugger could not be found in your PATH. Check the values
|
||||||
|
specified in the orte_base_user_debugger MCA parameter for the list of
|
||||||
|
debuggers that was searched.
|
||||||
|
#
|
||||||
|
[debugger-exec-failed]
|
||||||
|
%s was unable to launch the specified debugger. This is what was
|
||||||
|
launched:
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
Things to check:
|
||||||
|
|
||||||
|
- Ensure that the debugger is installed properly
|
||||||
|
- Ensure that the "%s" executable is in your path
|
||||||
|
- Ensure that any required licenses are available to run the debugger
|
||||||
|
#
|
||||||
|
[orterun:daemon-die]
|
||||||
|
%s was unable to cleanly terminate the daemons for this job. Returned value %d instead of ORTE_SUCCESS.
|
||||||
|
|
851
orte/tools/ortekill/ortekill.1
Обычный файл
851
orte/tools/ortekill/ortekill.1
Обычный файл
@ -0,0 +1,851 @@
|
|||||||
|
.\"
|
||||||
|
.\" Man page for ORTE's orterun command
|
||||||
|
.\"
|
||||||
|
.\" .TH name section center-footer left-footer center-header
|
||||||
|
.TH MPIRUN 1 "March 2006" "Open MPI" "OPEN MPI COMMANDS"
|
||||||
|
.\" **************************
|
||||||
|
.\" Name Section
|
||||||
|
.\" **************************
|
||||||
|
.SH NAME
|
||||||
|
.
|
||||||
|
orterun, mpirun, mpiexec \- Execute serial and parallel jobs in Open MPI.
|
||||||
|
|
||||||
|
.B Note:
|
||||||
|
\fImpirun\fP, \fImpiexec\fP, and \fIorterun\fP are all exact synonyms for each
|
||||||
|
other. Using any of the names will result in exactly identical behavior.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Synopsis Section
|
||||||
|
.\" **************************
|
||||||
|
.SH SYNOPSIS
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
Single Process Multiple Data (SPMD) Model:
|
||||||
|
|
||||||
|
.B mpirun
|
||||||
|
.R [ options ]
|
||||||
|
.B <program>
|
||||||
|
.R [ <args> ]
|
||||||
|
.
|
||||||
|
|
||||||
|
Multiple Instruction Multiple Data (MIMD) Model:
|
||||||
|
|
||||||
|
.B mpirun
|
||||||
|
.R [ global_options ]
|
||||||
|
[ local_options1 ]
|
||||||
|
.B <program1>
|
||||||
|
.R [ <args1> ] :
|
||||||
|
[ local_options2 ]
|
||||||
|
.B <program2>
|
||||||
|
.R [ <args2> ] :
|
||||||
|
... :
|
||||||
|
[ local_optionsN ]
|
||||||
|
.B <programN>
|
||||||
|
.R [ <argsN> ]
|
||||||
|
.P
|
||||||
|
|
||||||
|
Note that in both models, invoking \fImpirun\fR via an absolute path
|
||||||
|
name is equivalent to specifying the \fI--prefix\fR option with a
|
||||||
|
\fI<dir>\fR value equivalent to the directory where \fImpirun\fR
|
||||||
|
resides, minus its last subdirectory. For example:
|
||||||
|
|
||||||
|
\fBshell$\fP /usr/local/bin/mpirun ...
|
||||||
|
|
||||||
|
is equivalent to
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun --prefix /usr/local
|
||||||
|
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Quick Summary Section
|
||||||
|
.\" **************************
|
||||||
|
.SH QUICK SUMMARY
|
||||||
|
.
|
||||||
|
If you are simply looking for how to run an MPI application, you
|
||||||
|
probably want to use a command line of the following form:
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun [ -np X ] [ --hostfile <filename> ] <program>
|
||||||
|
|
||||||
|
This will run X copies of \fI<program>\fR in your current run-time
|
||||||
|
environment (if running under a supported resource manager, Open MPI's
|
||||||
|
\fImpirun\fR will usually automatically use the corresponding resource manager
|
||||||
|
process starter, as opposed to, for example, \fIrsh\fR or \fIssh\fR,
|
||||||
|
which require the use of a hostfile, or will default to running all X
|
||||||
|
copies on the localhost), scheduling (by default) in a round-robin fashion by
|
||||||
|
CPU slot. See the rest of this page for more details.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Options Section
|
||||||
|
.\" **************************
|
||||||
|
.SH OPTIONS
|
||||||
|
.
|
||||||
|
.I mpirun
|
||||||
|
will send the name of the directory where it was invoked on the local
|
||||||
|
node to each of the remote nodes, and attempt to change to that
|
||||||
|
directory. See the "Current Working Directory" section below for further
|
||||||
|
details.
|
||||||
|
.\"
|
||||||
|
.\" Start options listing
|
||||||
|
.\" Indent 10 characters from start of first column to start of second column
|
||||||
|
.TP 10
|
||||||
|
.B <args>
|
||||||
|
Pass these run-time arguments to every new process. These must always
|
||||||
|
be the last arguments to \fImpirun\fP. If an app context file is used,
|
||||||
|
\fI<args>\fP will be ignored.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B <program>
|
||||||
|
The program executable. This is identified as the first non-recognized argument
|
||||||
|
to mpirun.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -aborted\fR,\fP --aborted \fR<#>\fP
|
||||||
|
Set the maximum number of aborted processes to display.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --app \fR<appfile>\fP
|
||||||
|
Provide an appfile, ignoring all other command line options.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -bynode\fR,\fP --bynode
|
||||||
|
Allocate (map) the processes by node in a round-robin scheme.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -byslot\fR,\fP --byslot
|
||||||
|
Allocate (map) the processes by slot in a round-robin scheme. This is the
|
||||||
|
default.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -c \fR<#>\fP
|
||||||
|
Synonym for \fI-np\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -debug\fR,\fP --debug
|
||||||
|
Invoke the user-level debugger indicated by the \fIorte_base_user_debugger\fP
|
||||||
|
MCA parameter.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -debugger\fR,\fP --debugger
|
||||||
|
Sequence of debuggers to search for when \fI--debug\fP is used (i.e.
|
||||||
|
a synonym for \fIorte_base_user_debugger\fP MCA parameter).
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -gmca\fR,\fP --gmca \fR<key> <value>\fP
|
||||||
|
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
|
||||||
|
the parameter name; \fI<value>\fP is the parameter value.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -h\fR,\fP --help
|
||||||
|
Display help for this command
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -H \fR<host1,host2,...,hostN>\fP
|
||||||
|
Synonym for \fI-host\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -host\fR,\fP --host \fR<host1,host2,...,hostN>\fP
|
||||||
|
List of hosts on which to invoke processes.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -hostfile\fR,\fP --hostfile \fR<hostfile>\fP
|
||||||
|
Provide a hostfile to use.
|
||||||
|
.\" JJH - Should have man page for how to format a hostfile properly.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -machinefile\fR,\fP --machinefile \fR<machinefile>\fP
|
||||||
|
Synonym for \fI-hostfile\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -mca\fR,\fP --mca <key> <value>
|
||||||
|
Send arguments to various MCA modules. See the "MCA" section, below.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -n\fR,\fP --n \fR<#>\fP
|
||||||
|
Synonym for \fI-np\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -nolocal\fR,\fP --nolocal
|
||||||
|
Do not run any copies of the launched application on the same node as
|
||||||
|
orterun is running. This option will override listing the localhost
|
||||||
|
with \fB--host\fR or any other host-specifying mechanism.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -nooversubscribe\fR,\fP --nooversubscribe
|
||||||
|
Do not oversubscribe any nodes; error (without starting any processes)
|
||||||
|
if the requested number of processes would cause oversubscription.
|
||||||
|
This option implicitly sets "max_slots" equal to the "slots" value for
|
||||||
|
each node.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -np \fR<#>\fP
|
||||||
|
Run this many copies of the program on the given nodes. This option
|
||||||
|
indicates that the specified file is an executable program and not an
|
||||||
|
application context. If no value is provided for the number of copies to
|
||||||
|
execute (i.e., neither the "-np" nor its synonyms are provided on the command
|
||||||
|
line), Open MPI will automatically execute a copy of the program on
|
||||||
|
each process slot (see below for description of a "process slot"). This
|
||||||
|
feature, however, can only be used in the SPMD model and will return an
|
||||||
|
error (without beginning execution of the application) otherwise.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -nw\fR,\fP --nw
|
||||||
|
Launch the processes and do not wait for their completion. mpirun will
|
||||||
|
complete as soon as successful launch occurs.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -path\fR,\fP --path \fR<path>\fP
|
||||||
|
<path> that will be used when attempting to locate requested executables.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --prefix \fR<dir>\fP
|
||||||
|
Prefix directory that will be used to set the \fIPATH\fR and
|
||||||
|
\fILD_LIBRARY_PATH\fR on the remote node before invoking Open MPI or
|
||||||
|
the target process. See the "Remote Execution" section, below.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -q\fR,\fP --quiet
|
||||||
|
Suppress informative messages from orterun during application execution.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --tmpdir \fR<dir>\fP
|
||||||
|
Set the root for the session directory tree for mpirun only.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -tv\fR,\fP --tv
|
||||||
|
Launch processes under the TotalView debugger.
|
||||||
|
Deprecated backwards compatibility flag. Synonym for \fI--debug\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --universe \fR<username@hostname:universe_name>\fP
|
||||||
|
For this application, set the universe name as:
|
||||||
|
username@hostname:universe_name
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -v\fR,\fP --verbose
|
||||||
|
Be verbose
|
||||||
|
.TP
|
||||||
|
.B -V\fR,\fP --version
|
||||||
|
Print version number. If no other arguments are given, this will also
|
||||||
|
cause orterun to exit.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -wd \fR<dir>\fP
|
||||||
|
Change to the directory <dir> before the user's program executes.
|
||||||
|
See the "Current Working Directory" section for notes on relative paths.
|
||||||
|
.B Note:
|
||||||
|
If the \fI-wd\fP option appears both on the command line and in an
|
||||||
|
application context, the context will take precedence over the command line.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -x \fR<env>\fP
|
||||||
|
Export the specified environment variables to the remote nodes before
|
||||||
|
executing the program. Existing environment variables can be
|
||||||
|
specified (see the Examples section, below), or new variable names
|
||||||
|
specified with corresponding values. The parser for the \fI-x\fP
|
||||||
|
option is not very sophisticated; it does not even understand quoted
|
||||||
|
values. Users are advised to set variables in the environment, and
|
||||||
|
then use \fI-x\fP to export (not define) them.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.P
|
||||||
|
The following options are useful for developers; they are not generally
|
||||||
|
useful to most ORTE and/or MPI users:
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B -d\fR,\fP --debug-devel
|
||||||
|
Enable debugging of the OpenRTE (the run-time layer in Open MPI).
|
||||||
|
This is not generally useful for most users.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --debug-daemons
|
||||||
|
Enable debugging of any OpenRTE daemons used by this application.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --debug-daemons-file
|
||||||
|
Enable debugging of any OpenRTE daemons used by this application, storing
|
||||||
|
output in files.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
.B --no-daemonize
|
||||||
|
Do not detach OpenRTE daemons used by this application.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Description Section
|
||||||
|
.\" **************************
|
||||||
|
.SH DESCRIPTION
|
||||||
|
.
|
||||||
|
One invocation of \fImpirun\fP starts an MPI application running under Open
|
||||||
|
MPI. If the application is single process multiple data (SPMD), the application
|
||||||
|
can be specified on the \fImpirun\fP command line.
|
||||||
|
|
||||||
|
If the application is multiple instruction multiple data (MIMD), comprising
|
||||||
|
multiple programs, the set of programs and arguments can be specified in one of
|
||||||
|
two ways: Extended Command Line Arguments, and Application Context.
|
||||||
|
.PP
|
||||||
|
An application context describes the MIMD program set including all arguments
|
||||||
|
in a separate file.
|
||||||
|
.\"See appcontext(5) for a description of the application context syntax.
|
||||||
|
This file essentially contains multiple \fImpirun\fP command lines, less the
|
||||||
|
command name itself. The ability to specify different options for different
|
||||||
|
instantiations of a program is another reason to use an application context.
|
||||||
|
.PP
|
||||||
|
Extended command line arguments allow for the description of the application
|
||||||
|
layout on the command line using colons (\fI:\fP) to separate the specification
|
||||||
|
of programs and arguments. Some options are globally set across all specified
|
||||||
|
programs (e.g. --hostfile), while others are specific to a single program
|
||||||
|
(e.g. -np).
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Process Slots
|
||||||
|
.
|
||||||
|
Open MPI uses "slots" to represent a potential location for a process.
|
||||||
|
Hence, a node with 2 slots means that 2 processes can be launched on
|
||||||
|
that node. For performance, the community typically equates a "slot"
|
||||||
|
with a physical CPU, thus ensuring that any process assigned to that
|
||||||
|
slot has a dedicated processor. This is not, however, a requirement for
|
||||||
|
the operation of Open MPI.
|
||||||
|
.PP
|
||||||
|
Slots can be specified in hostfiles after the hostname. For example:
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
host1.example.com slots=4
|
||||||
|
Indicates that there are 4 process slots on host1.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
If no slots value is specified, then Open MPI will automatically assign
|
||||||
|
a default value of "slots=1" to that host.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
When running under resource managers (e.g., SLURM, Torque, etc.), Open
|
||||||
|
MPI will obtain both the hostnames and the number of slots directly
|
||||||
|
from the resource manager. For example, if running under a SLURM job,
|
||||||
|
Open MPI will automatically receive the hosts that SLURM has allocated
|
||||||
|
to the job as well as how many slots on each node that SLURM says
|
||||||
|
are usable - in most high-performance environments, the slots will
|
||||||
|
equate to the number of processors on the node.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
When deciding where to launch processes, Open MPI will first fill up
|
||||||
|
all available slots before oversubscribing (see "Location
|
||||||
|
Nomenclature", below, for more details on the scheduling algorithms
|
||||||
|
available). Unless told otherwise, Open MPI will arbitrarily
|
||||||
|
oversubscribe nodes. For example, if the only node available is the
|
||||||
|
localhost, Open MPI will run as many processes as specified by the
|
||||||
|
-n (or one of its variants) command line option on the
|
||||||
|
localhost (although they may run quite slowly, since they'll all be
|
||||||
|
competing for CPU and other resources).
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
Limits can be placed on oversubscription with the "max_slots"
|
||||||
|
attribute in the hostfile. For example:
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
host2.example.com slots=4 max_slots=6
|
||||||
|
Indicates that there are 4 process slots on host2. Further, Open MPI
|
||||||
|
is limited to launching a maximum of 6 processes on host2.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
host3.example.com slots=2 max_slots=2
|
||||||
|
Indicates that there are 2 process slots on host3 and that no
|
||||||
|
oversubscription is allowed (similar to the \fI--nooversubscribe\fR
|
||||||
|
option).
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
host4.example.com max_slots=2
|
||||||
|
Shorthand; same as listing "slots=2 max_slots=2".
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
Note that Open MPI's support for resource managers does not currently
|
||||||
|
set the "max_slots" values for hosts. If you wish to prevent
|
||||||
|
oversubscription in such scenarios, use the \fI--nooversubscribe\fR
|
||||||
|
option.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
In scenarios where the user wishes to launch an application across
|
||||||
|
all available slots by not providing a "-n" option on the mpirun
|
||||||
|
command line, Open MPI will launch a process on each process slot
|
||||||
|
for each host within the provided environment. For example, if a
|
||||||
|
hostfile has been provided, then Open MPI will spawn processes
|
||||||
|
on each identified host up to the "slots=x" limit if oversubscription
|
||||||
|
is not allowed. If oversubscription is allowed (the default), then
|
||||||
|
Open MPI will spawn processes on each host up to the "max_slots=y" limit
|
||||||
|
if that value is provided. In all cases, the "-bynode" and "-byslot"
|
||||||
|
mapping directives will be enforced to ensure proper placement of
|
||||||
|
process ranks.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Location Nomenclature
|
||||||
|
.
|
||||||
|
As described above, \fImpirun\fP can specify arbitrary locations in
|
||||||
|
the current Open MPI universe. Locations can be specified either by
|
||||||
|
CPU or by node.
|
||||||
|
|
||||||
|
.B Note:
|
||||||
|
This nomenclature does not force Open MPI to bind processes to CPUs --
|
||||||
|
specifying a location "by CPU" is really a convenience mechanism for
|
||||||
|
SMPs that ultimately maps down to a specific node.
|
||||||
|
.PP
|
||||||
|
Specifying locations by node will launch one copy of an executable per
|
||||||
|
specified node.
|
||||||
|
Using the \fI--bynode\fP option tells Open MPI to use all available nodes.
|
||||||
|
Using the \fI--byslot\fP option tells Open MPI to use all slots on an available
|
||||||
|
node before allocating resources on the next available node.
|
||||||
|
For example:
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
mpirun --bynode -np 4 a.out
|
||||||
|
Runs one copy of the executable
|
||||||
|
.I a.out
|
||||||
|
on all available nodes in the Open MPI universe. MPI_COMM_WORLD rank 0
|
||||||
|
will be on node0, rank 1 will be on node1, etc., regardless of how many slots
|
||||||
|
are available on each of the nodes.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun --byslot -np 4 a.out
|
||||||
|
Runs one copy of the executable
|
||||||
|
.I a.out
|
||||||
|
on each slot on a given node before running the executable on other available
|
||||||
|
nodes.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Specifying Hosts
|
||||||
|
.
|
||||||
|
Hosts can be specified in a number of ways. The most common is in a
|
||||||
|
\&'hostfile' or 'machinefile'. If our hostfile contains the following information:
|
||||||
|
.
|
||||||
|
.
|
||||||
|
|
||||||
|
\fBshell$\fP cat my-hostfile
|
||||||
|
node00 slots=2
|
||||||
|
node01 slots=2
|
||||||
|
node02 slots=2
|
||||||
|
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun --hostfile my-hostfile -np 3 a.out
|
||||||
|
This will run one copy of the executable
|
||||||
|
.I a.out
|
||||||
|
on hosts node00, node01, and node02.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.PP
|
||||||
|
Another method for specifying hosts is directly on the command line. Here one
|
||||||
|
can include and exclude hosts from the set of hosts to run on. For example:
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 3 --host a a.out
|
||||||
|
Runs three copies of the executable
|
||||||
|
.I a.out
|
||||||
|
on host a.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 3 --host a,b,c a.out
|
||||||
|
Runs one copy of the executable
|
||||||
|
.I a.out
|
||||||
|
on hosts a, b, and c.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 3 --hostfile my-hostfile --host node00 a.out
|
||||||
|
Runs three copies of the executable
|
||||||
|
.I a.out
|
||||||
|
on host node00.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 3 --hostfile my-hostfile --host node10 a.out
|
||||||
|
This will prompt an error since node10 is not in my-hostfile; mpirun will
|
||||||
|
abort.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
shell$ mpirun -np 1 --host a hostname : -np 2 --host b,c uptime
|
||||||
|
Runs one copy of the executable
|
||||||
|
.I hostname
|
||||||
|
on host a. And runs one copy of the executable
|
||||||
|
.I uptime
|
||||||
|
on hosts b and c.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS No Local Launch
|
||||||
|
.
|
||||||
|
Using the \fB--nolocal\fR option to orterun tells the system to not
|
||||||
|
launch any of the application processes on the same node that orterun
|
||||||
|
is running. While orterun typically blocks and consumes few system
|
||||||
|
resources, this option can be helpful for launching very large jobs
|
||||||
|
where orterun may actually need to use noticeable amounts of memory
|
||||||
|
and/or processing time. \fB--nolocal\fR allows orterun to run without
|
||||||
|
sharing the local node with the launched applications, and likewise
|
||||||
|
allows the launched applications to run unhindered by orterun's system
|
||||||
|
usage.
|
||||||
|
.PP
|
||||||
|
Note that \fB--nolocal\fR will override any other specification to
|
||||||
|
launch the application on the local node. It will disqualify the
|
||||||
|
localhost from being capable of running any processes in the
|
||||||
|
application.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
shell$ mpirun -np 1 --host localhost --nolocal hostname
|
||||||
|
This example will result in an error because orterun will not find
|
||||||
|
anywhere to launch the application.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS No Oversubscription
|
||||||
|
.
|
||||||
|
Using the \fI--nooversubscribe\fR option causes Open MPI to implicitly
|
||||||
|
set the "max_slots" value to be the same as the "slots" value for each
|
||||||
|
node. This can be especially helpful when running jobs under a
|
||||||
|
resource manager because Open MPI currently only sets the "slots"
|
||||||
|
value for each node that it obtains from the resource manager.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Application Context or Executable Program?
|
||||||
|
.
|
||||||
|
To distinguish the two different forms, \fImpirun\fP
|
||||||
|
looks on the command line for the \fI--app\fP option. If
|
||||||
|
it is specified, then the file named on the command line is
|
||||||
|
assumed to be an application context. If it is not
|
||||||
|
specified, then the file is assumed to be an executable program.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Locating Files
|
||||||
|
.
|
||||||
|
If \fIno\fP relative or absolute path is specified for a file, Open MPI
|
||||||
|
will look for files by searching the directories in the user's PATH environment
|
||||||
|
variable as defined on the source node(s).
|
||||||
|
.PP
|
||||||
|
If a relative directory is specified, it must be relative to the initial
|
||||||
|
working directory determined by the specific starter used. For example when
|
||||||
|
using the rsh or ssh starters, the initial directory is $HOME by default. Other
|
||||||
|
starters may set the initial directory to the current working directory from
|
||||||
|
the invocation of \fImpirun\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Current Working Directory
|
||||||
|
.
|
||||||
|
The \fI\-wd\fP mpirun option allows the user to change to an arbitrary
|
||||||
|
directory before their program is invoked. It can also be used in application
|
||||||
|
context files to specify working directories on specific nodes and/or
|
||||||
|
for specific applications.
|
||||||
|
.PP
|
||||||
|
If the \fI\-wd\fP option appears both in a context file and on the command line,
|
||||||
|
the context file directory will override the command line value.
|
||||||
|
.PP
|
||||||
|
If the \fI-wd\fP option is specified, Open MPI will attempt to change to the
|
||||||
|
specified directory on all of the remote nodes. If this fails, \fImpirun\fP
|
||||||
|
will abort.
|
||||||
|
.PP
|
||||||
|
If the \fI-wd\fP option is \fBnot\fP specified, Open MPI will send the
|
||||||
|
directory name where \fImpirun\fP was invoked to each of the remote nodes. The
|
||||||
|
remote nodes will try to change to that directory. If they are unable (e.g., if
|
||||||
|
the directory does not exist on that node), then Open MPI will use the default
|
||||||
|
directory determined by the starter.
|
||||||
|
.PP
|
||||||
|
All directory changing occurs before the user's program is invoked; it
|
||||||
|
does not wait until \fIMPI_INIT\fP is called.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Standard I/O
|
||||||
|
.
|
||||||
|
Open MPI directs UNIX standard input to /dev/null on all processes
|
||||||
|
except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0 process
|
||||||
|
inherits standard input from \fImpirun\fP.
|
||||||
|
.B Note:
|
||||||
|
The node that invoked \fImpirun\fP need not be the same as the node where the
|
||||||
|
MPI_COMM_WORLD rank 0 process resides. Open MPI handles the redirection of
|
||||||
|
\fImpirun\fP's standard input to the rank 0 process.
|
||||||
|
.PP
|
||||||
|
Open MPI directs UNIX standard output and error from remote nodes to the node
|
||||||
|
that invoked \fImpirun\fP and prints it on the standard output/error of
|
||||||
|
\fImpirun\fP.
|
||||||
|
Local processes inherit the standard output/error of \fImpirun\fP and transfer
|
||||||
|
to it directly.
|
||||||
|
.PP
|
||||||
|
Thus it is possible to redirect standard I/O for Open MPI applications by
|
||||||
|
using the typical shell redirection procedure on \fImpirun\fP.
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun -np 2 my_app < my_input > my_output
|
||||||
|
|
||||||
|
Note that in this example \fIonly\fP the MPI_COMM_WORLD rank 0 process will
|
||||||
|
receive the stream from \fImy_input\fP on stdin. The stdin on all the other
|
||||||
|
nodes will be tied to /dev/null. However, the stdout from all nodes will
|
||||||
|
be collected into the \fImy_output\fP file.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Signal Propagation
|
||||||
|
.
|
||||||
|
When orterun receives a SIGTERM or SIGINT, it will attempt to kill
|
||||||
|
the entire job by sending all processes in the job a SIGTERM, waiting
|
||||||
|
a small number of seconds, then sending all processes in the job a
|
||||||
|
SIGKILL.
|
||||||
|
.
|
||||||
|
SIGUSR1 and SIGUSR2 signals received by orterun are propagated to
|
||||||
|
all processes in the job. Other signals are not currently propagated
|
||||||
|
by orterun.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Process Termination / Signal Handling
|
||||||
|
.
|
||||||
|
During the run of an MPI application, if any rank dies abnormally
|
||||||
|
(either exiting before invoking \fIMPI_FINALIZE\fP, or dying as the result of a
|
||||||
|
signal), \fImpirun\fP will print out an error message and kill the rest of the
|
||||||
|
MPI application.
|
||||||
|
.PP
|
||||||
|
User signal handlers should probably avoid trying to cleanup MPI state
|
||||||
|
(Open MPI is, currently, neither thread-safe nor async-signal-safe).
|
||||||
|
For example, if a segmentation fault occurs in \fIMPI_SEND\fP (perhaps because
|
||||||
|
a bad buffer was passed in) and a user signal handler is invoked, if this user
|
||||||
|
handler attempts to invoke \fIMPI_FINALIZE\fP, Bad Things could happen since
|
||||||
|
Open MPI was already "in" MPI when the error occurred. Since \fImpirun\fP
|
||||||
|
will notice that the process died due to a signal, it is probably not
|
||||||
|
necessary to clean up MPI state; it is safest for the user to clean up only non-MPI state.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Process Environment
|
||||||
|
.
|
||||||
|
Processes in the MPI application inherit their environment from the
|
||||||
|
Open RTE daemon upon the node on which they are running. The
|
||||||
|
environment is typically inherited from the user's shell. On remote
|
||||||
|
nodes, the exact environment is determined by the boot MCA module
|
||||||
|
used. The \fIrsh\fR launch module, for example, uses either
|
||||||
|
\fIrsh\fR/\fIssh\fR to launch the Open RTE daemon on remote nodes, and
|
||||||
|
typically executes one or more of the user's shell-setup files before
|
||||||
|
launching the Open RTE daemon. When running dynamically linked
|
||||||
|
applications which require the \fILD_LIBRARY_PATH\fR environment
|
||||||
|
variable to be set, care must be taken to ensure that it is correctly
|
||||||
|
set when booting Open MPI.
|
||||||
|
.PP
|
||||||
|
See the "Remote Execution" section for more details.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Remote Execution
|
||||||
|
.
|
||||||
|
Open MPI requires that the \fIPATH\fR environment variable be set to
|
||||||
|
find executables on remote nodes (this is typically only necessary in
|
||||||
|
\fIrsh\fR- or \fIssh\fR-based environments -- batch/scheduled
|
||||||
|
environments typically copy the current environment to the execution
|
||||||
|
of remote jobs, so if the current environment has \fIPATH\fR and/or
|
||||||
|
\fILD_LIBRARY_PATH\fR set properly, the remote nodes will also have it
|
||||||
|
set properly). If Open MPI was compiled with shared library support,
|
||||||
|
it may also be necessary to have the \fILD_LIBRARY_PATH\fR environment
|
||||||
|
variable set on remote nodes as well (especially to find the shared
|
||||||
|
libraries required to run user MPI applications).
|
||||||
|
.PP
|
||||||
|
However, it is not always desirable or possible to edit shell
|
||||||
|
startup files to set \fIPATH\fR and/or \fILD_LIBRARY_PATH\fR. The
|
||||||
|
\fI--prefix\fR option is provided for some simple configurations where
|
||||||
|
this is not possible.
|
||||||
|
.PP
|
||||||
|
The \fI--prefix\fR option takes a single argument: the base directory
|
||||||
|
on the remote node where Open MPI is installed. Open MPI will use
|
||||||
|
this directory to set the remote \fIPATH\fR and \fILD_LIBRARY_PATH\fR
|
||||||
|
before executing any Open MPI or user applications. This allows
|
||||||
|
running Open MPI jobs without having pre-configured the \fIPATH\fR and
|
||||||
|
\fILD_LIBRARY_PATH\fR on the remote nodes.
|
||||||
|
.PP
|
||||||
|
Open MPI adds the basename of the current
|
||||||
|
node's "bindir" (the directory where Open MPI's executables are
|
||||||
|
installed) to the prefix and uses that to set the \fIPATH\fR on the
|
||||||
|
remote node. Similarly, Open MPI adds the basename of the current
|
||||||
|
node's "libdir" (the directory where Open MPI's libraries are
|
||||||
|
installed) to the prefix and uses that to set the
|
||||||
|
\fILD_LIBRARY_PATH\fR on the remote node. For example:
|
||||||
|
.TP 15
|
||||||
|
Local bindir:
|
||||||
|
/local/node/directory/bin
|
||||||
|
.TP
|
||||||
|
Local libdir:
|
||||||
|
/local/node/directory/lib64
|
||||||
|
.PP
|
||||||
|
If the following command line is used:
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun --prefix /remote/node/directory
|
||||||
|
|
||||||
|
Open MPI will add "/remote/node/directory/bin" to the \fIPATH\fR
|
||||||
|
and "/remote/node/directory/lib64" to the \fILD_LIBRARY_PATH\fR on the
|
||||||
|
remote node before attempting to execute anything.
|
||||||
|
.PP
|
||||||
|
Note that \fI--prefix\fR can be set on a per-context basis, allowing
|
||||||
|
for different values for different nodes.
|
||||||
|
.PP
|
||||||
|
The \fI--prefix\fR option is not sufficient if the installation paths
|
||||||
|
on the remote node are different than the local node (e.g., if "/lib"
|
||||||
|
is used on the local node, but "/lib64" is used on the remote node),
|
||||||
|
or if the installation paths are something other than a subdirectory
|
||||||
|
under a common prefix.
|
||||||
|
.PP
|
||||||
|
Note that executing \fImpirun\fR via an absolute pathname is
|
||||||
|
equivalent to specifying \fI--prefix\fR without the last subdirectory
|
||||||
|
in the absolute pathname to \fImpirun\fR. For example:
|
||||||
|
|
||||||
|
\fBshell$\fP /usr/local/bin/mpirun ...
|
||||||
|
|
||||||
|
is equivalent to
|
||||||
|
|
||||||
|
\fBshell$\fP mpirun --prefix /usr/local
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS Exported Environment Variables
|
||||||
|
.
|
||||||
|
All environment variables that are named in the form OMPI_* will automatically
|
||||||
|
be exported to new processes on the local and remote nodes.
|
||||||
|
The \fI\-x\fP option to \fImpirun\fP can be used to export specific environment
|
||||||
|
variables to the new processes. While the syntax of the \fI\-x\fP
|
||||||
|
option allows the definition of new variables, note that the parser
|
||||||
|
for this option is currently not very sophisticated - it does not even
|
||||||
|
understand quoted values. Users are advised to set variables in the
|
||||||
|
environment and use \fI\-x\fP to export them; not to define them.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS MCA (Modular Component Architecture)
|
||||||
|
.
|
||||||
|
The \fI-mca\fP switch allows the passing of parameters to various MCA modules.
|
||||||
|
.\" Open MPI's MCA modules are described in detail in ompimca(7).
|
||||||
|
MCA modules have direct impact on MPI programs because they allow tunable
|
||||||
|
parameters to be set at run time (such as which BTL communication device driver
|
||||||
|
to use, what parameters to pass to that BTL, etc.).
|
||||||
|
.PP
|
||||||
|
The \fI-mca\fP switch takes two arguments: \fI<key>\fP and \fI<value>\fP.
|
||||||
|
The \fI<key>\fP argument generally specifies which MCA module will receive the value.
|
||||||
|
For example, the \fI<key>\fP "btl" is used to select which BTL to be used for
|
||||||
|
transporting MPI messages. The \fI<value>\fP argument is the value that is
|
||||||
|
passed.
|
||||||
|
For example:
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
mpirun -mca btl tcp,self -np 1 foo
|
||||||
|
Tells Open MPI to use the "tcp" and "self" BTLs, and to run a single copy of
|
||||||
|
"foo" on an allocated node.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -mca btl self -np 1 foo
|
||||||
|
Tells Open MPI to use the "self" BTL, and to run a single copy of "foo" on an
|
||||||
|
allocated node.
|
||||||
|
.\" And so on. Open MPI's BTL MCA modules are described in ompimca_btl(7).
|
||||||
|
.PP
|
||||||
|
The \fI-mca\fP switch can be used multiple times to specify different
|
||||||
|
\fI<key>\fP and/or \fI<value>\fP arguments. If the same \fI<key>\fP is
|
||||||
|
specified more than once, the \fI<value>\fPs are concatenated with a comma
|
||||||
|
(",") separating them.
|
||||||
|
.PP
|
||||||
|
.B Note:
|
||||||
|
The \fI-mca\fP switch is simply a shortcut for setting environment variables.
|
||||||
|
The same effect may be accomplished by setting corresponding environment
|
||||||
|
variables before running \fImpirun\fP.
|
||||||
|
The form of the environment variables that Open MPI sets is:
|
||||||
|
|
||||||
|
OMPI_MCA_<key>=<value>
|
||||||
|
.PP
|
||||||
|
Note that the \fI-mca\fP switch overrides any previously set environment
|
||||||
|
variables. Also note that unknown \fI<key>\fP arguments are still set as
|
||||||
|
environment variables -- they are not checked (by \fImpirun\fP) for correctness.
|
||||||
|
Illegal or incorrect \fI<value>\fP arguments may or may not be reported -- it
|
||||||
|
depends on the specific MCA module.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Examples Section
|
||||||
|
.\" **************************
|
||||||
|
.SH EXAMPLES
|
||||||
|
Be sure to also see the examples in the "Location Nomenclature" section, above.
|
||||||
|
.
|
||||||
|
.TP 4
|
||||||
|
mpirun -np 1 prog1
|
||||||
|
Load and execute prog1 on one node. Search the user's $PATH for the
|
||||||
|
executable file on each node.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 8 --byslot prog1
|
||||||
|
Run 8 copies of prog1 wherever Open MPI wants to run them.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.TP
|
||||||
|
mpirun -np 4 -mca btl ib,tcp,self prog1
|
||||||
|
Run 4 copies of prog1 using the "ib", "tcp", and "self" BTLs for the transport
|
||||||
|
of MPI messages.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Diagnostics Section
|
||||||
|
.\" **************************
|
||||||
|
.
|
||||||
|
.\" .SH DIAGNOSTICS
|
||||||
|
.\".TP 4
|
||||||
|
.\"Error Msg:
|
||||||
|
.\"Description
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" Return Value Section
|
||||||
|
.\" **************************
|
||||||
|
.
|
||||||
|
.SH RETURN VALUE
|
||||||
|
.
|
||||||
|
\fImpirun\fP returns 0 if all ranks started by \fImpirun\fP exit after calling
|
||||||
|
MPI_FINALIZE. A non-zero value is returned if an internal error occurred in
|
||||||
|
mpirun, or one or more ranks exited before calling MPI_FINALIZE. If an
|
||||||
|
internal error occurred in mpirun, the corresponding error code is returned.
|
||||||
|
In the event that one or more ranks exit before calling MPI_FINALIZE, the
|
||||||
|
return value of the rank of the process that \fImpirun\fP first notices died
|
||||||
|
before calling MPI_FINALIZE will be returned. Note that, in general, this will
|
||||||
|
be the first rank that died but is not guaranteed to be so.
|
||||||
|
.PP
|
||||||
|
However, note that if the \fI-nw\fP switch is used, the return value from
|
||||||
|
mpirun does not indicate the exit status of the ranks.
|
||||||
|
.
|
||||||
|
.\" **************************
|
||||||
|
.\" See Also Section
|
||||||
|
.\" **************************
|
||||||
|
.
|
||||||
|
.\" .SH SEE ALSO
|
||||||
|
.\" orted(1)
|
332
orte/tools/ortekill/ortekill.c
Обычный файл
332
orte/tools/ortekill/ortekill.c
Обычный файл
@ -0,0 +1,332 @@
|
|||||||
|
/* -*- C -*-
|
||||||
|
*
|
||||||
|
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#ifdef HAVE_UNISTD_H
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_SYS_PARAM_H
|
||||||
|
#include <sys/param.h>
|
||||||
|
#endif
|
||||||
|
#include <errno.h>
|
||||||
|
#include <signal.h>
|
||||||
|
#include <ctype.h>
|
||||||
|
#ifdef HAVE_SYS_TYPES_H
|
||||||
|
#include <sys/types.h>
|
||||||
|
#endif /* HAVE_SYS_TYPES_H */
|
||||||
|
#ifdef HAVE_SYS_WAIT_H
|
||||||
|
#include <sys/wait.h>
|
||||||
|
#endif /* HAVE_SYS_WAIT_H */
|
||||||
|
#ifdef HAVE_LIBGEN_H
|
||||||
|
#include <libgen.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "opal/event/event.h"
|
||||||
|
#include "opal/install_dirs.h"
|
||||||
|
#include "opal/mca/base/base.h"
|
||||||
|
#include "opal/threads/condition.h"
|
||||||
|
#include "opal/util/argv.h"
|
||||||
|
#include "opal/util/basename.h"
|
||||||
|
#include "opal/util/cmd_line.h"
|
||||||
|
#include "opal/util/opal_environ.h"
|
||||||
|
#include "opal/util/output.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
|
#include "opal/util/trace.h"
|
||||||
|
#include "opal/version.h"
|
||||||
|
|
||||||
|
#include "orte/orte_constants.h"
|
||||||
|
|
||||||
|
#include "orte/class/orte_pointer_array.h"
|
||||||
|
#include "orte/util/proc_info.h"
|
||||||
|
#include "orte/util/sys_info.h"
|
||||||
|
#include "orte/util/universe_setup_file_io.h"
|
||||||
|
#include "orte/util/pre_condition_transports.h"
|
||||||
|
|
||||||
|
#include "orte/mca/ns/ns.h"
|
||||||
|
#include "orte/mca/gpr/gpr.h"
|
||||||
|
#include "orte/mca/pls/pls.h"
|
||||||
|
#include "orte/mca/rmaps/rmaps_types.h"
|
||||||
|
#include "orte/mca/rmgr/rmgr.h"
|
||||||
|
#include "orte/mca/schema/schema.h"
|
||||||
|
#include "orte/mca/smr/smr.h"
|
||||||
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
|
|
||||||
|
#include "orte/runtime/runtime.h"
|
||||||
|
#include "orte/runtime/orte_wait.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Globals
|
||||||
|
*/
|
||||||
|
static orte_jobid_t jobid = ORTE_JOBID_INVALID; /* initialised to the invalid jobid; not referenced anywhere else in ortekill.c as shown */
|
||||||
|
static char *orterun_basename = NULL; /* executable name: set from opal_basename(argv[0]) in main(), passed to opal_show_help(), freed before main() returns */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* setup globals for catching orterun command line options
|
||||||
|
*/
|
||||||
|
/* Destination variables for the command line options described by
 * cmd_line_init[]; the single instance is orterun_globals.  Fields noted
 * as "not bound" have no entry in cmd_line_init[] that stores into them. */
struct globals_t {
|
||||||
|
bool help; /* bound to the 'h' / "help" entry */
|
||||||
|
bool version; /* bound to the 'V' / "version" entry */
|
||||||
|
bool verbose; /* bound to the 'v' / "verbose" entry */
|
||||||
|
bool quiet; /* bound to the 'q' / "quiet" entry */
|
||||||
|
bool exit; /* not bound in cmd_line_init[] */
|
||||||
|
bool no_wait_for_job_completion; /* bound to the "nw" entry */
|
||||||
|
bool by_node; /* bound to the "bynode" entry */
|
||||||
|
bool by_slot; /* bound to the "byslot" entry */
|
||||||
|
bool per_node; /* bound to the "pernode" entry */
|
||||||
|
bool no_oversubscribe; /* bound to the "nooversubscribe" entry */
|
||||||
|
bool debugger; /* bound to both the "tv" and "debug" entries */
|
||||||
|
bool no_local_schedule; /* bound to the "nolocal" entry */
|
||||||
|
bool displaymapatlaunch; /* bound to the "display-map-at-launch" entry */
|
||||||
|
int num_procs; /* bound to the 'c' / "np" and "n" entries */
|
||||||
|
int exit_status; /* not bound in cmd_line_init[] */
|
||||||
|
char *hostfile; /* not bound: the "hostfile"/"machinefile" entries have a NULL destination */
|
||||||
|
char *env_val; /* not bound in cmd_line_init[] */
|
||||||
|
char *appfile; /* bound to the "app" entry */
|
||||||
|
char *wdir; /* bound to the "wdir" entry */
|
||||||
|
char *path; /* bound to the "path" entry */
|
||||||
|
opal_mutex_t lock; /* not referenced elsewhere in ortekill.c as shown */
|
||||||
|
opal_condition_t cond; /* not referenced elsewhere in ortekill.c as shown */
|
||||||
|
} orterun_globals;
|
||||||
|
static bool globals_init = false; /* not referenced elsewhere in ortekill.c as shown */
|
||||||
|
|
||||||
|
|
||||||
|
opal_cmd_line_init_t cmd_line_init[] = {
|
||||||
|
/* Various "obvious" options */
|
||||||
|
{ NULL, NULL, NULL, 'h', NULL, "help", 0,
|
||||||
|
&orterun_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"This help message" },
|
||||||
|
{ NULL, NULL, NULL, 'V', NULL, "version", 0,
|
||||||
|
&orterun_globals.version, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Print version and exit" },
|
||||||
|
{ NULL, NULL, NULL, 'v', NULL, "verbose", 0,
|
||||||
|
&orterun_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Be verbose" },
|
||||||
|
{ NULL, NULL, NULL, 'q', NULL, "quiet", 0,
|
||||||
|
&orterun_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Suppress helpful messages" },
|
||||||
|
|
||||||
|
/* Use an appfile */
|
||||||
|
{ NULL, NULL, NULL, '\0', NULL, "app", 1,
|
||||||
|
&orterun_globals.appfile, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Provide an appfile; ignore all other command line options" },
|
||||||
|
|
||||||
|
/* Number of processes; -c, -n, --n, -np, and --np are all
|
||||||
|
synonyms */
|
||||||
|
{ NULL, NULL, NULL, 'c', "np", "np", 1,
|
||||||
|
&orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Number of processes to run" },
|
||||||
|
{ NULL, NULL, NULL, '\0', "n", "n", 1,
|
||||||
|
&orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Number of processes to run" },
|
||||||
|
|
||||||
|
/* Set a hostfile */
|
||||||
|
{ "rds", "hostfile", "path", '\0', "hostfile", "hostfile", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Provide a hostfile" },
|
||||||
|
{ "rds", "hostfile", "path", '\0', "machinefile", "machinefile", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Provide a hostfile" },
|
||||||
|
|
||||||
|
/* Don't wait for the process to finish before exiting */
|
||||||
|
{ NULL, NULL, NULL, '\0', "nw", "nw", 0,
|
||||||
|
&orterun_globals.no_wait_for_job_completion, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Launch the processes and do not wait for their completion (i.e., let orterun complete as soon a successful launch occurs)" },
|
||||||
|
|
||||||
|
/* Export environment variables; potentially used multiple times,
|
||||||
|
so it does not make sense to set into a variable */
|
||||||
|
{ NULL, NULL, NULL, 'x', NULL, NULL, 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_NULL,
|
||||||
|
"Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" },
|
||||||
|
|
||||||
|
/* Specific mapping (C, cX, N, nX) */
|
||||||
|
#if 0
|
||||||
|
/* JJH --map is not currently implemented so don't advertise it until it is */
|
||||||
|
{ NULL, NULL, NULL, '\0', NULL, "map", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Mapping of processes to nodes / CPUs" },
|
||||||
|
#endif
|
||||||
|
{ NULL, NULL, NULL, '\0', "bynode", "bynode", 0,
|
||||||
|
&orterun_globals.by_node, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Whether to allocate/map processes round-robin by node" },
|
||||||
|
{ NULL, NULL, NULL, '\0', "byslot", "byslot", 0,
|
||||||
|
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Whether to allocate/map processes round-robin by slot (the default)" },
|
||||||
|
{ NULL, NULL, NULL, '\0', "pernode", "pernode", 0,
|
||||||
|
&orterun_globals.per_node, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"If no number of process is specified, this will cause one process per available node to be executed" },
|
||||||
|
{ NULL, NULL, NULL, '\0', "nooversubscribe", "nooversubscribe", 0,
|
||||||
|
&orterun_globals.no_oversubscribe, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Nodes are not to be oversubscribed, even if the system supports such operation"},
|
||||||
|
{ NULL, NULL, NULL, '\0', "display-map-at-launch", "display-map-at-launch", 0,
|
||||||
|
&orterun_globals.displaymapatlaunch, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Display the process map just before launch"},
|
||||||
|
|
||||||
|
/* mpiexec-like arguments */
|
||||||
|
{ NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
|
||||||
|
&orterun_globals.wdir, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Set the working directory of the started processes" },
|
||||||
|
{ NULL, NULL, NULL, '\0', "path", "path", 1,
|
||||||
|
&orterun_globals.path, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"PATH to be used to look for executables to start processes" },
|
||||||
|
/* These arguments can be specified multiple times */
|
||||||
|
#if 0
|
||||||
|
/* JMS: Removed because it's not really implemented */
|
||||||
|
{ NULL, NULL, NULL, '\0', "arch", "arch", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Architecture to start processes on" },
|
||||||
|
#endif
|
||||||
|
{ NULL, NULL, NULL, 'H', "host", "host", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"List of hosts to invoke processes on" },
|
||||||
|
|
||||||
|
/* OSC mpiexec-like arguments */
|
||||||
|
{ NULL, NULL, NULL, '\0', "nolocal", "nolocal", 0,
|
||||||
|
&orterun_globals.no_local_schedule, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Do not run any MPI applications on the local node" },
|
||||||
|
|
||||||
|
/* User-level debugger arguments */
|
||||||
|
{ NULL, NULL, NULL, '\0', "tv", "tv", 0,
|
||||||
|
&orterun_globals.debugger, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Deprecated backwards compatibility flag; synonym for \"--debug\"" },
|
||||||
|
{ NULL, NULL, NULL, '\0', "debug", "debug", 0,
|
||||||
|
&orterun_globals.debugger, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter" },
|
||||||
|
{ "orte", "base", "user_debugger", '\0', "debugger", "debugger", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Sequence of debuggers to search for when \"--debug\" is used" },
|
||||||
|
|
||||||
|
/* OpenRTE arguments */
|
||||||
|
{ "orte", "debug", NULL, 'd', NULL, "debug-devel", 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Enable debugging of OpenRTE" },
|
||||||
|
|
||||||
|
{ "orte", "debug", "daemons", '\0', NULL, "debug-daemons", 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Enable debugging of any OpenRTE daemons used by this application" },
|
||||||
|
|
||||||
|
{ "orte", "debug", "daemons_file", '\0', NULL, "debug-daemons-file", 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Enable debugging of any OpenRTE daemons used by this application, storing output in files" },
|
||||||
|
|
||||||
|
{ "orte", "no_daemonize", NULL, '\0', NULL, "no-daemonize", 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Do not detach OpenRTE daemons used by this application" },
|
||||||
|
|
||||||
|
{ "universe", NULL, NULL, '\0', NULL, "universe", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Set the universe name as username@hostname:universe_name for this application" },
|
||||||
|
|
||||||
|
{ NULL, NULL, NULL, '\0', NULL, "tmpdir", 1,
|
||||||
|
&orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Set the root for the session directory tree for orterun ONLY" },
|
||||||
|
|
||||||
|
{ NULL, NULL, NULL, '\0', NULL, "prefix", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Prefix where Open MPI is installed on remote nodes" },
|
||||||
|
{ NULL, NULL, NULL, '\0', NULL, "noprefix", 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Disable automatic --prefix behavior" },
|
||||||
|
|
||||||
|
/* End of list */
|
||||||
|
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||||
|
};
|
||||||
|
|
||||||
|
#if !defined(__WINDOWS__)
|
||||||
|
extern char** environ;
|
||||||
|
#endif /* !defined(__WINDOWS__) */
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
int rc;
|
||||||
|
int id, iparam;
|
||||||
|
|
||||||
|
/* Setup MCA params */
|
||||||
|
|
||||||
|
mca_base_param_init();
|
||||||
|
orte_register_params(false);
|
||||||
|
|
||||||
|
/* find our basename (the name of the executable) so that we can
|
||||||
|
use it in pretty-print error messages */
|
||||||
|
orterun_basename = opal_basename(argv[0]);
|
||||||
|
|
||||||
|
/* Intialize our Open RTE environment */
|
||||||
|
/* Set the flag telling orte_init that I am NOT a
|
||||||
|
* singleton, but am "infrastructure" - prevents setting
|
||||||
|
* up incorrect infrastructure that only a singleton would
|
||||||
|
* require
|
||||||
|
*/
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_init(true))) {
|
||||||
|
opal_show_help("help-orterun.txt", "orterun:init-failure", true,
|
||||||
|
"orte_init()", rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* check for daemon flags and push them into the environment
|
||||||
|
* since this isn't being automatically done
|
||||||
|
*/
|
||||||
|
id = mca_base_param_reg_int_name("orte_debug", "daemons",
|
||||||
|
"Whether to debug the ORTE daemons or not",
|
||||||
|
false, false, (int)false, &iparam);
|
||||||
|
if (iparam) {
|
||||||
|
char *tmp = mca_base_param_environ_variable("orte", "debug", "daemons");
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||||
|
opal_show_help("help-orterun.txt", "orterun:environ", false,
|
||||||
|
orterun_basename, tmp, "1", rc);
|
||||||
|
free(tmp);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
free(tmp);
|
||||||
|
}
|
||||||
|
id = mca_base_param_reg_int_name("orte", "debug",
|
||||||
|
"Top-level ORTE debug switch",
|
||||||
|
false, false, 0, &iparam);
|
||||||
|
if (iparam) {
|
||||||
|
char *tmp = mca_base_param_environ_variable("orte", NULL, "debug");
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||||
|
opal_show_help("help-orterun.txt", "orterun:environ", false,
|
||||||
|
orterun_basename, tmp, "1", rc);
|
||||||
|
free(tmp);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
free(tmp);
|
||||||
|
}
|
||||||
|
id = mca_base_param_reg_int_name("orte_debug", "daemons_file",
|
||||||
|
"Whether want stdout/stderr of daemons to go to a file or not",
|
||||||
|
false, false, 0, &iparam);
|
||||||
|
if (iparam) {
|
||||||
|
char *tmp = mca_base_param_environ_variable("orte", "debug",
|
||||||
|
"daemons_file");
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||||
|
opal_show_help("help-orterun.txt", "orterun:environ", false,
|
||||||
|
orterun_basename, tmp, "1", rc);
|
||||||
|
free(tmp);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
free(tmp);
|
||||||
|
}
|
||||||
|
|
||||||
|
orte_finalize();
|
||||||
|
free(orterun_basename);
|
||||||
|
return rc;
|
||||||
|
}
|
@ -113,7 +113,6 @@ struct globals_t {
|
|||||||
bool no_oversubscribe;
|
bool no_oversubscribe;
|
||||||
bool debugger;
|
bool debugger;
|
||||||
bool no_local_schedule;
|
bool no_local_schedule;
|
||||||
bool displaymapatlaunch;
|
|
||||||
bool reuse_daemons;
|
bool reuse_daemons;
|
||||||
int num_procs;
|
int num_procs;
|
||||||
int exit_status;
|
int exit_status;
|
||||||
@ -201,7 +200,7 @@ opal_cmd_line_init_t cmd_line_init[] = {
|
|||||||
&orterun_globals.no_oversubscribe, OPAL_CMD_LINE_TYPE_BOOL,
|
&orterun_globals.no_oversubscribe, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Nodes are not to be oversubscribed, even if the system supports such operation"},
|
"Nodes are not to be oversubscribed, even if the system supports such operation"},
|
||||||
{ NULL, NULL, NULL, '\0', "display-map-at-launch", "display-map-at-launch", 0,
|
{ NULL, NULL, NULL, '\0', "display-map-at-launch", "display-map-at-launch", 0,
|
||||||
&orterun_globals.displaymapatlaunch, OPAL_CMD_LINE_TYPE_BOOL,
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Display the process map just before launch"},
|
"Display the process map just before launch"},
|
||||||
|
|
||||||
/* mpiexec-like arguments */
|
/* mpiexec-like arguments */
|
||||||
@ -419,6 +418,20 @@ int orterun(int argc, char *argv[])
|
|||||||
free(tmp);
|
free(tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
id = mca_base_param_reg_int_name("rmaps_base", "display_map",
|
||||||
|
"Whether to display the process map after it is computed",
|
||||||
|
false, false, (int)false, &iparam);
|
||||||
|
if (iparam) {
|
||||||
|
char *tmp = mca_base_param_environ_variable("rmaps", "base", "display_map");
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
|
||||||
|
opal_show_help("help-orterun.txt", "orterun:environ", false,
|
||||||
|
orterun_basename, tmp, "1", rc);
|
||||||
|
free(tmp);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
free(tmp);
|
||||||
|
}
|
||||||
|
|
||||||
/* pre-condition any network transports that require it */
|
/* pre-condition any network transports that require it */
|
||||||
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(apps, num_apps))) {
|
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(apps, num_apps))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -432,14 +445,6 @@ int orterun(int argc, char *argv[])
|
|||||||
/* construct the list of attributes */
|
/* construct the list of attributes */
|
||||||
OBJ_CONSTRUCT(&attributes, opal_list_t);
|
OBJ_CONSTRUCT(&attributes, opal_list_t);
|
||||||
|
|
||||||
if (orterun_globals.displaymapatlaunch) {
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmgr.add_attribute(&attributes, ORTE_RMAPS_DISPLAY_AFTER_MAP,
|
|
||||||
ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE))) {
|
|
||||||
opal_show_help("help-orterun.txt", "orterun:attr-failed", false,
|
|
||||||
orterun_basename, NULL, NULL, rc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** setup callbacks for abort signals */
|
/** setup callbacks for abort signals */
|
||||||
opal_signal_set(&term_handler, SIGTERM,
|
opal_signal_set(&term_handler, SIGTERM,
|
||||||
abort_signal_callback, &term_handler);
|
abort_signal_callback, &term_handler);
|
||||||
@ -858,7 +863,6 @@ static int init_globals(void)
|
|||||||
orterun_globals.no_oversubscribe = false;
|
orterun_globals.no_oversubscribe = false;
|
||||||
orterun_globals.debugger = false;
|
orterun_globals.debugger = false;
|
||||||
orterun_globals.no_local_schedule = false;
|
orterun_globals.no_local_schedule = false;
|
||||||
orterun_globals.displaymapatlaunch = false;
|
|
||||||
orterun_globals.num_procs = 0;
|
orterun_globals.num_procs = 0;
|
||||||
orterun_globals.exit_status = 0;
|
orterun_globals.exit_status = 0;
|
||||||
if( NULL != orterun_globals.hostfile )
|
if( NULL != orterun_globals.hostfile )
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user