/*
 * Source listing: openmpi/orte/tools/orterun/orterun.c
 * (1892 lines, 69 KiB, C)
 */
/* -*- C -*-
*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <errno.h>
#include <signal.h>
#include <ctype.h>
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif /* HAVE_SYS_WAIT_H */
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/event/event.h"
#include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/base.h"
#include "opal/threads/condition.h"
#include "opal/util/argv.h"
#include "opal/util/basename.h"
#include "opal/util/cmd_line.h"
#include "opal/util/opal_environ.h"
#include "opal/util/opal_getcwd.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "opal/util/trace.h"
#if OPAL_ENABLE_FT == 1
#include "opal/runtime/opal_cr.h"
#endif
#include "opal/version.h"
#include "opal/runtime/opal.h"
#include "opal/util/os_path.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/dss/dss.h"
#include "orte/util/proc_info.h"
#include "orte/util/sys_info.h"
#include "orte/util/pre_condition_transports.h"
#include "orte/util/session_dir.h"
#include "orte/util/name_fns.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_data_server.h"
/* ensure I can behave like a daemon */
#include "orte/orted/orted.h"
#include "orterun.h"
#include "totalview.h"
/*
* Globals
*/
static struct opal_event term_handler;
static struct opal_event int_handler;
#ifndef __WINDOWS__
Add ability to trap and propagate SIGUSR1/2 to remote processes. There are a number of small changes that hit a bunch of files: 1. Changed the RMGR and PLS APIs to add "signal_job" and "signal_proc" entry points. Only the "signal_job" entries are implemented - none of the components have implementations for "signal_proc" at this time. Thus, you can signal all of the procs in a job, but cannot currently signal only one specific proc. 2. Implemented those new API functions in all components except xgrid (Brian will do so very soon). Only the rsh/ssh and fork modules have been tested, however, and only under OS-X. 3. Added signal traps and callback functions for SIGUSR1/2 to orterun/mpirun that catch those signals and call the appropriate commands to propagate them out to all processes in the job. 4. Added a new test directory under the orte branch to (eventually) hold unit and system level tests for just the run-time. Since our test branch of the repository is under restricted access, people working on the RTE were continually developing their own system-level tests - thus making it hard to help diagnose problems. I have moved the more commonly-used functions here, and added one specifically for testing the SIGUSR1/2 functionality. I will be contacting people directly to seek help with testing the changes on more environments. Other than compile issues, you should see absolutely no change in behavior on any of your systems - this additional functionality is transparent to anyone who does not issue a SIGUSR1/2 to mpirun. Ralph This commit was SVN r10258.
2006-06-08 22:27:17 +04:00
static struct opal_event sigusr1_handler;
static struct opal_event sigusr2_handler;
#endif /* __WINDOWS__ */
static orte_job_t *jdata;
static char *orterun_basename = NULL;
static int num_aborted = 0;
static int num_killed = 0;
static int num_failed_start = 0;
static char **global_mca_env = NULL;
static bool have_zero_np = false;
static orte_std_cntr_t total_num_apps = 0;
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
static opal_event_t *orterun_event, *orteds_exit_event;
static char *ompi_server=NULL;
/*
* Globals
*/
struct globals_t orterun_globals;
bool globals_init = false;
/*
 * Table of command line options understood by orterun.
 *
 * Field order, as inferred from the entries below (confirm against the
 * opal_cmd_line_init_t declaration in opal/util/cmd_line.h):
 *   three strings that appear to name an MCA parameter (or NULLs),
 *   short (single character) name, single-dash name, long name,
 *   number of parameters the option takes, destination variable
 *   (or NULL), value type, help text.
 *
 * The table is terminated by an all-NULL entry.
 */
opal_cmd_line_init_t cmd_line_init[] = {
    /* Various "obvious" options */
    { NULL, NULL, NULL, 'h', NULL, "help", 0,
      &orterun_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
      "This help message" },
    { NULL, NULL, NULL, 'V', NULL, "version", 0,
      &orterun_globals.version, OPAL_CMD_LINE_TYPE_BOOL,
      "Print version and exit" },
    { NULL, NULL, NULL, 'v', NULL, "verbose", 0,
      &orterun_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
      "Be verbose" },
    { NULL, NULL, NULL, 'q', NULL, "quiet", 0,
      &orterun_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
      "Suppress helpful messages" },

    /* Preload the binary on the remote machine */
    { NULL, NULL, NULL, 's', NULL, "preload-binary", 0,
      &orterun_globals.preload_binary, OPAL_CMD_LINE_TYPE_BOOL,
      "Preload the binary on the remote machine before starting the remote process." },

    /* Preload files on the remote machine */
    { NULL, NULL, NULL, '\0', NULL, "preload-files", 1,
      &orterun_globals.preload_files, OPAL_CMD_LINE_TYPE_STRING,
      "Preload the comma separated list of files to the remote machines current working directory before starting the remote process." },

    /* Where to Preload files on the remote machine */
    { NULL, NULL, NULL, '\0', NULL, "preload-files-dest-dir", 1,
      &orterun_globals.preload_files_dest_dir, OPAL_CMD_LINE_TYPE_STRING,
      "The destination directory to use in conjunction with --preload-files. By default the absolute and relative paths provided by --preload-files are used." },

    /* Use an appfile */
    { NULL, NULL, NULL, '\0', NULL, "app", 1,
      &orterun_globals.appfile, OPAL_CMD_LINE_TYPE_STRING,
      "Provide an appfile; ignore all other command line options" },

    /* Number of processes; -c, -n, --n, -np, and --np are all
       synonyms */
    { NULL, NULL, NULL, 'c', "np", "np", 1,
      &orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT,
      "Number of processes to run" },
    { NULL, NULL, NULL, '\0', "n", "n", 1,
      &orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT,
      "Number of processes to run" },

    /* Set a hostfile */
    { NULL, NULL, NULL, '\0', "hostfile", "hostfile", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Provide a hostfile" },
    { NULL, NULL, NULL, '\0', "machinefile", "machinefile", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Provide a hostfile" },

    /* uri of Open MPI server, or at least where to get it */
    { NULL, NULL, NULL, '\0', "ompi-server", "ompi-server", 1,
      &orterun_globals.ompi_server, OPAL_CMD_LINE_TYPE_STRING,
      "Specify the URI of the Open MPI server, or the name of the file that contains that info" },

    { "carto", "file", "path", '\0', "cf", "cartofile", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Provide a cartography file" },

    /* Don't wait for the process to finish before exiting */
#if 0
    { NULL, NULL, NULL, '\0', "nw", "nw", 0,
      &orterun_globals.no_wait_for_job_completion, OPAL_CMD_LINE_TYPE_BOOL,
      "Launch the processes and do not wait for their completion (i.e., let orterun complete as soon a successful launch occurs)" },
#endif

    /* Export environment variables; potentially used multiple times,
       so it does not make sense to set into a variable */
    { NULL, NULL, NULL, 'x', NULL, NULL, 1,
      NULL, OPAL_CMD_LINE_TYPE_NULL,
      "Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" },

    /* Specific mapping (C, cX, N, nX) */
#if 0
    /* JJH --map is not currently implemented so don't advertise it until it is */
    { NULL, NULL, NULL, '\0', NULL, "map", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Mapping of processes to nodes / CPUs" },
#endif

    /* Slot allocation / process mapping policy (introduced in SVN r5894).
     *
     * Both policies honor the "slots" and "max-slots" hostfile tokens:
     * nodes are filled up to "slots" first and are only oversubscribed
     * (up to "max-slots") if there are still processes left to place.
     *
     *  -bynode: place one process per node in round-robin fashion until
     *           all slots are taken, then keep going until all
     *           max-slots are taken.
     *  -byslot: (the default) greedily take all available slots on a
     *           node before moving on to the next node; if processes
     *           remain, oversubscribe all nodes round-robin until each
     *           node's max-slots is hit.
     *
     * Example with hostfile "eddie slots=2 max-slots=4" and
     * "vogon slots=4 max-slots=8", running with -np 6:
     *   -bynode: eddie gets MCW ranks 0, 2;  vogon gets 1, 3, 4, 5
     *   -byslot: eddie gets MCW ranks 0, 1;  vogon gets 2, 3, 4, 5
     */
    { NULL, NULL, NULL, '\0', "bynode", "bynode", 0,
      &orterun_globals.by_node, OPAL_CMD_LINE_TYPE_BOOL,
      "Whether to allocate/map processes round-robin by node" },
    { NULL, NULL, NULL, '\0', "byslot", "byslot", 0,
      &orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
      "Whether to allocate/map processes round-robin by slot (the default)" },
    { "rmaps", "base", "pernode", '\0', "pernode", "pernode", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Launch one process per available node on the specified number of nodes [no -np => use all allocated nodes]" },
    { "rmaps", "base", "n_pernode", '\0', "npernode", "npernode", 1,
      NULL, OPAL_CMD_LINE_TYPE_INT,
      "Launch n processes per node on all allocated nodes" },
    { "rmaps", "base", "no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Nodes are not to be oversubscribed, even if the system supports such operation"},
    { "rmaps", "base", "display_map", '\0', "display-map", "display-map", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Display the process map just before launch"},

    /* mpiexec-like arguments */
    { NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
      &orterun_globals.wdir, OPAL_CMD_LINE_TYPE_STRING,
      "Set the working directory of the started processes" },
    { NULL, NULL, NULL, '\0', "wd", "wd", 1,
      &orterun_globals.wdir, OPAL_CMD_LINE_TYPE_STRING,
      "Synonym for --wdir" },
    { NULL, NULL, NULL, '\0', "path", "path", 1,
      &orterun_globals.path, OPAL_CMD_LINE_TYPE_STRING,
      "PATH to be used to look for executables to start processes" },

    /* These arguments can be specified multiple times */
#if 0
    /* JMS: Removed because it's not really implemented */
    { NULL, NULL, NULL, '\0', "arch", "arch", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Architecture to start processes on" },
#endif
    { NULL, NULL, NULL, 'H', "host", "host", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "List of hosts to invoke processes on" },

    /* OSC mpiexec-like arguments */
    { "rmaps", "base", "no_schedule_local", '\0', "nolocal", "nolocal", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Do not run any MPI applications on the local node" },

    /* User-level debugger arguments */
    { NULL, NULL, NULL, '\0', "tv", "tv", 0,
      &orterun_globals.debugger, OPAL_CMD_LINE_TYPE_BOOL,
      "Deprecated backwards compatibility flag; synonym for \"--debug\"" },
    { NULL, NULL, NULL, '\0', "debug", "debug", 0,
      &orterun_globals.debugger, OPAL_CMD_LINE_TYPE_BOOL,
      "Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter" },
    { "orte", "base", "user_debugger", '\0', "debugger", "debugger", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Sequence of debuggers to search for when \"--debug\" is used" },

    /* OpenRTE arguments */
    { "orte", "debug", NULL, 'd', NULL, "debug-devel", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Enable debugging of OpenRTE" },
    /* NOTE(review): takes zero parameters but is typed INT while its
     * sibling flags are BOOL -- confirm whether this is intentional */
    { "orte", "debug", "daemons", '\0', NULL, "debug-daemons", 0,
      NULL, OPAL_CMD_LINE_TYPE_INT,
      "Enable debugging of any OpenRTE daemons used by this application" },
    { "orte", "debug", "daemons_file", '\0', NULL, "debug-daemons-file", 0,
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Enable debugging of any OpenRTE daemons used by this application, storing output in files" },
    { "universe", NULL, NULL, '\0', NULL, "universe", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Set the universe name as username@hostname:universe_name for this application" },
    { NULL, NULL, NULL, '\0', NULL, "tmpdir", 1,
      &orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING,
      "Set the root for the session directory tree for orterun ONLY" },
    { NULL, NULL, NULL, '\0', NULL, "do-not-launch", 0,
      &orterun_globals.do_not_launch, OPAL_CMD_LINE_TYPE_BOOL,
      "Perform all necessary operations to prepare to launch the application, but do not actually launch it" },
    { NULL, NULL, NULL, '\0', NULL, "prefix", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Prefix where Open MPI is installed on remote nodes" },
    /* NOTE(review): takes zero parameters but is typed STRING --
     * looks like it should be a BOOL flag; confirm before changing */
    { NULL, NULL, NULL, '\0', NULL, "noprefix", 0,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "Disable automatic --prefix behavior" },

    /* End of list */
    { NULL, NULL, NULL, '\0', NULL, NULL, 0,
      NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
};
/*
 * Local functions
 *
 * Forward declarations for the file-local helpers; the definitions
 * appear later in this file.
 */
/* Callbacks sharing the (int, short, void *) signature; presumably
 * registered with the event / trigger machinery elsewhere in this
 * file -- TODO confirm against the registration code */
static void job_completed(int trigpipe, short event, void *arg);
static void terminated(int trigpipe, short event, void *arg);
static void abort_signal_callback(int fd, short flags, void *arg);
static void abort_exit_callback(int fd, short event, void *arg);
static void signal_forward_callback(int fd, short event, void *arg);
/* Command line / appfile processing helpers; init_globals(),
 * parse_globals() and parse_locals() are called from orterun() */
static int create_app(int argc, char* argv[], orte_app_context_t **app,
bool *made_app, char ***app_env);
static int init_globals(void);
static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line);
static int parse_locals(int argc, char* argv[]);
static int parse_appfile(char *filename, char ***env);
/* NOTE(review): by its name, reports on aborted processes -- body not
 * visible here, confirm */
static void dump_aborted_procs(void);
int orterun(int argc, char *argv[])
{
int rc;
opal_cmd_line_t cmd_line;
/* find our basename (the name of the executable) so that we can
use it in pretty-print error messages */
orterun_basename = opal_basename(argv[0]);
/* Setup and parse the command line */
init_globals();
opal_cmd_line_create(&cmd_line, cmd_line_init);
mca_base_cmd_line_setup(&cmd_line);
if (ORTE_SUCCESS != (rc = opal_cmd_line_parse(&cmd_line, true,
argc, argv)) ) {
return rc;
}
/* Need to initialize OPAL so that install_dirs are filled in */
/*
* NOTE: (JJH)
* We need to allow 'mca_base_cmd_line_process_args()' to process command
* line arguments *before* calling opal_init_util() since the command
* line could contain MCA parameters that affect the way opal_init_util()
* functions. AMCA parameters are one such option normally received on the
* command line that affect the way opal_init_util() behaves.
* It is "safe" to call mca_base_cmd_line_process_args() before
* opal_init_util() since mca_base_cmd_line_process_args() does *not*
* depend upon opal_init_util() functionality.
*/
opal_init_util();
/* flag that I am the HNP */
orte_process_info.hnp = true;
/* Setup MCA params */
/* Check for some "global" command line params */
parse_globals(argc, argv, &cmd_line);
OBJ_DESTRUCT(&cmd_line);
/* create a new job object to hold the info for this one - the
* jobid field will be filled in by the PLM when the job is
* launched
*/
jdata = OBJ_NEW(orte_job_t);
if (NULL == jdata) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* Parse each app, adding it to the job object */
parse_locals(argc, argv);
if (0 == jdata->num_apps) {
/* This should never happen -- this case should be caught in
create_app(), but let's just double check... */
opal_show_help("help-orterun.txt", "orterun:nothing-to-do",
true, orterun_basename);
exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
}
/* save the environment for launch purposes */
orte_launch_environ = opal_argv_copy(environ);
#if OPAL_ENABLE_FT == 1
/* Disable OPAL CR notifications for this tool */
opal_cr_set_enabled(false);
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
"1",
true, &environ);
#endif
/* Initialize our Open RTE environment
* Set the flag telling orte_init that I am NOT a
* singleton, but am "infrastructure" - prevents setting
* up incorrect infrastructure that only a singleton would
* require
*/
if (ORTE_SUCCESS != (rc = orte_init(ORTE_NON_TOOL))) {
/*
 * VCS annotation (SVN r14711, 2007-05-21 22:31:28 +04:00):
 *
 * Commit the orted-failed-to-start code. This correctly causes the
 * system to detect the failure of an orted to start and allows the
 * system to terminate all procs/orteds that *did* start.
 *
 * The primary change that underlies all this is in the OOB.
 * Specifically, the problem in the code until now has been that the OOB
 * attempts to resolve an address when we call the "send" to an unknown
 * recipient. The OOB would then wait forever if that recipient never
 * actually started (and hence, never reported back its OOB contact
 * info). In the case of an orted that failed to start, we would
 * correctly detect that the orted hadn't started, but then we would
 * attempt to order all orteds (including the one that failed to start)
 * to die. This would cause the OOB to "hang" the system.
 *
 * Unfortunately, revising how the OOB resolves addresses introduced a
 * number of additional problems. Specifically, and most troublesome,
 * was the fact that comm_spawn involved the immediate transmission of
 * the rendezvous point from parent-to-child after the child was
 * spawned. The current code used the OOB address resolution as a
 * "barrier" - basically, the parent would attempt to send the info to
 * the child, and then "hold" there until the child's contact info had
 * arrived (meaning the child had started) and the send could be
 * completed. Note that this also caused comm_spawn to "hang" the entire
 * system if the child never started... The app-failed-to-start helped
 * improve that behavior - this code provides additional relief.
 *
 * With this change, the OOB will return an ADDRESSEE_UNKNOWN error if
 * you attempt to send to a recipient whose contact info isn't already
 * in the OOB's hash tables. To resolve comm_spawn issues, we also now
 * force the cross-sharing of connection info between parent and child
 * jobs during spawn.
 *
 * Finally, to aid in setting triggers to the right values, we introduce
 * the "arith" API for the GPR. This function allows you to atomically
 * change the value in a registry location (either divide, multiply,
 * add, or subtract) by the provided operand. It is equivalent to first
 * fetching the value using a "get", then modifying it, and then putting
 * the result back into the registry via a "put".
 *
 * This commit was SVN r14711.
 */
ORTE_ERROR_LOG(rc);
return rc;
}
/*
 * VCS annotation (SVN r14711, 2007-05-21 22:31:28 +04:00):
 *
 * Commit the orted-failed-to-start code. This correctly causes the
 * system to detect the failure of an orted to start and allows the
 * system to terminate all procs/orteds that *did* start.
 *
 * The primary change that underlies all this is in the OOB.
 * Specifically, the problem in the code until now has been that the OOB
 * attempts to resolve an address when we call the "send" to an unknown
 * recipient. The OOB would then wait forever if that recipient never
 * actually started (and hence, never reported back its OOB contact
 * info). In the case of an orted that failed to start, we would
 * correctly detect that the orted hadn't started, but then we would
 * attempt to order all orteds (including the one that failed to start)
 * to die. This would cause the OOB to "hang" the system.
 *
 * Unfortunately, revising how the OOB resolves addresses introduced a
 * number of additional problems. Specifically, and most troublesome,
 * was the fact that comm_spawn involved the immediate transmission of
 * the rendezvous point from parent-to-child after the child was
 * spawned. The current code used the OOB address resolution as a
 * "barrier" - basically, the parent would attempt to send the info to
 * the child, and then "hold" there until the child's contact info had
 * arrived (meaning the child had started) and the send could be
 * completed. Note that this also caused comm_spawn to "hang" the entire
 * system if the child never started... The app-failed-to-start helped
 * improve that behavior - this code provides additional relief.
 *
 * With this change, the OOB will return an ADDRESSEE_UNKNOWN error if
 * you attempt to send to a recipient whose contact info isn't already
 * in the OOB's hash tables. To resolve comm_spawn issues, we also now
 * force the cross-sharing of connection info between parent and child
 * jobs during spawn.
 *
 * Finally, to aid in setting triggers to the right values, we introduce
 * the "arith" API for the GPR. This function allows you to atomically
 * change the value in a registry location (either divide, multiply,
 * add, or subtract) by the provided operand. It is equivalent to first
 * fetching the value using a "get", then modifying it, and then putting
 * the result back into the registry via a "put".
 *
 * This commit was SVN r14711.
 */
/* we are an hnp, so update the contact info field for later use */
orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
/* we are also officially a daemon, so better update that field too */
orte_process_info.my_daemon_uri = orte_rml.get_contact_info();
/* If we have a prefix, then modify the PATH and
LD_LIBRARY_PATH environment variables in our copy. This
will ensure that any locally-spawned children will
have our executables and libraries in their path
For now, default to the prefix_dir provided in the first app_context.
Since there always MUST be at least one app_context, we are safe in
doing this.
*/
if (NULL != ((orte_app_context_t*)jdata->apps->addr[0])->prefix_dir) {
char *oldenv, *newenv, *lib_base, *bin_base;
lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir);
/* Reset PATH */
newenv = opal_os_path( false, ((orte_app_context_t*)jdata->apps->addr[0])->prefix_dir, bin_base, NULL );
oldenv = getenv("PATH");
if (NULL != oldenv) {
char *temp;
asprintf(&temp, "%s:%s", newenv, oldenv );
free( newenv );
newenv = temp;
}
opal_setenv("PATH", newenv, true, &orte_launch_environ);
if (orte_debug_flag) {
opal_output(0, "%s: reset PATH: %s", orterun_basename, newenv);
}
free(newenv);
free(bin_base);
/* Reset LD_LIBRARY_PATH */
newenv = opal_os_path( false, ((orte_app_context_t*)jdata->apps->addr[0])->prefix_dir, lib_base, NULL );
oldenv = getenv("LD_LIBRARY_PATH");
if (NULL != oldenv) {
char* temp;
asprintf(&temp, "%s:%s", newenv, oldenv);
free(newenv);
newenv = temp;
}
opal_setenv("LD_LIBRARY_PATH", newenv, true, &orte_launch_environ);
if (orte_debug_flag) {
opal_output(0, "%s: reset LD_LIBRARY_PATH: %s",
orterun_basename, newenv);
}
free(newenv);
free(lib_base);
}
/* We actually do *not* want orterun to voluntarily yield() the
processor more than necessary. Orterun already blocks when
it is doing nothing, so it doesn't use any more CPU cycles than
it should; but when it *is* doing something, we do not want it
to be unnecessarily delayed because it voluntarily yielded the
processor in the middle of its work.
For example: when a message arrives at orterun, we want the
OS to wake us up in a timely fashion (which most OS's
seem good about doing) and then we want orterun to process
the message as fast as possible. If orterun yields and lets
aggressive MPI applications get the processor back, it may be a
long time before the OS schedules orterun to run again
(particularly if there is no IO event to wake it up). Hence,
routed OOB messages (for example) may be significantly delayed
before being delivered to MPI processes, which can be
problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
require OOB messages for wireup, etc.). */
opal_progress_set_yield_when_idle(false);
/* pre-condition any network transports that require it */
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(jdata))) {
ORTE_ERROR_LOG(rc);
opal_show_help("help-orterun.txt", "orterun:precondition", false,
orterun_basename, NULL, NULL, rc);
return rc;
}
/* setup to listen for commands sent specifically to me, even though I would probably
* be the one sending them! Unfortunately, since I am a participating daemon,
* there are times I need to send a command to "all daemons", and that means *I* have
* to receive it too
*/
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* setup the data server */
if (ORTE_SUCCESS != (rc = orte_data_server_init())) {
ORTE_ERROR_LOG(rc);
return rc;
}
Add ability to trap and propagate SIGUSR1/2 to remote processes. There are a number of small changes that hit a bunch of files: 1. Changed the RMGR and PLS APIs to add "signal_job" and "signal_proc" entry points. Only the "signal_job" entries are implemented - none of the components have implementations for "signal_proc" at this time. Thus, you can signal all of the procs in a job, but cannot currently signal only one specific proc. 2. Implemented those new API functions in all components except xgrid (Brian will do so very soon). Only the rsh/ssh and fork modules have been tested, however, and only under OS-X. 3. Added signal traps and callback functions for SIGUSR1/2 to orterun/mpirun that catch those signals and call the appropriate commands to propagate them out to all processes in the job. 4. Added a new test directory under the orte branch to (eventually) hold unit and system level tests for just the run-time. Since our test branch of the repository is under restricted access, people working on the RTE were continually developing their own system-level tests - thus making it hard to help diagnose problems. I have moved the more commonly-used functions here, and added one specifically for testing the SIGUSR1/2 functionality. I will be contacting people directly to seek help with testing the changes on more environments. Other than compile issues, you should see absolutely no change in behavior on any of your systems - this additional functionality is transparent to anyone who does not issue a SIGUSR1/2 to mpirun. Ralph This commit was SVN r10258.
2006-06-08 22:27:17 +04:00
/** setup callbacks for abort signals */
opal_signal_set(&term_handler, SIGTERM,
abort_signal_callback, &term_handler);
opal_signal_add(&term_handler, NULL);
opal_signal_set(&int_handler, SIGINT,
abort_signal_callback, &int_handler);
opal_signal_add(&int_handler, NULL);
#ifndef __WINDOWS__
/** setup callbacks for signals we should forward */
Add ability to trap and propagate SIGUSR1/2 to remote processes. There are a number of small changes that hit a bunch of files: 1. Changed the RMGR and PLS APIs to add "signal_job" and "signal_proc" entry points. Only the "signal_job" entries are implemented - none of the components have implementations for "signal_proc" at this time. Thus, you can signal all of the procs in a job, but cannot currently signal only one specific proc. 2. Implemented those new API functions in all components except xgrid (Brian will do so very soon). Only the rsh/ssh and fork modules have been tested, however, and only under OS-X. 3. Added signal traps and callback functions for SIGUSR1/2 to orterun/mpirun that catch those signals and call the appropriate commands to propagate them out to all processes in the job. 4. Added a new test directory under the orte branch to (eventually) hold unit and system level tests for just the run-time. Since our test branch of the repository is under restricted access, people working on the RTE were continually developing their own system-level tests - thus making it hard to help diagnose problems. I have moved the more commonly-used functions here, and added one specifically for testing the SIGUSR1/2 functionality. I will be contacting people directly to seek help with testing the changes on more environments. Other than compile issues, you should see absolutely no change in behavior on any of your systems - this additional functionality is transparent to anyone who does not issue a SIGUSR1/2 to mpirun. Ralph This commit was SVN r10258.
2006-06-08 22:27:17 +04:00
opal_signal_set(&sigusr1_handler, SIGUSR1,
signal_forward_callback, &sigusr1_handler);
Add ability to trap and propagate SIGUSR1/2 to remote processes. There are a number of small changes that hit a bunch of files: 1. Changed the RMGR and PLS APIs to add "signal_job" and "signal_proc" entry points. Only the "signal_job" entries are implemented - none of the components have implementations for "signal_proc" at this time. Thus, you can signal all of the procs in a job, but cannot currently signal only one specific proc. 2. Implemented those new API functions in all components except xgrid (Brian will do so very soon). Only the rsh/ssh and fork modules have been tested, however, and only under OS-X. 3. Added signal traps and callback functions for SIGUSR1/2 to orterun/mpirun that catch those signals and call the appropriate commands to propagate them out to all processes in the job. 4. Added a new test directory under the orte branch to (eventually) hold unit and system level tests for just the run-time. Since our test branch of the repository is under restricted access, people working on the RTE were continually developing their own system-level tests - thus making it hard to help diagnose problems. I have moved the more commonly-used functions here, and added one specifically for testing the SIGUSR1/2 functionality. I will be contacting people directly to seek help with testing the changes on more environments. Other than compile issues, you should see absolutely no change in behavior on any of your systems - this additional functionality is transparent to anyone who does not issue a SIGUSR1/2 to mpirun. Ralph This commit was SVN r10258.
2006-06-08 22:27:17 +04:00
opal_signal_add(&sigusr1_handler, NULL);
opal_signal_set(&sigusr2_handler, SIGUSR2,
signal_forward_callback, &sigusr2_handler);
Add ability to trap and propagate SIGUSR1/2 to remote processes. There are a number of small changes that hit a bunch of files: 1. Changed the RMGR and PLS APIs to add "signal_job" and "signal_proc" entry points. Only the "signal_job" entries are implemented - none of the components have implementations for "signal_proc" at this time. Thus, you can signal all of the procs in a job, but cannot currently signal only one specific proc. 2. Implemented those new API functions in all components except xgrid (Brian will do so very soon). Only the rsh/ssh and fork modules have been tested, however, and only under OS-X. 3. Added signal traps and callback functions for SIGUSR1/2 to orterun/mpirun that catch those signals and call the appropriate commands to propagate them out to all processes in the job. 4. Added a new test directory under the orte branch to (eventually) hold unit and system level tests for just the run-time. Since our test branch of the repository is under restricted access, people working on the RTE were continually developing their own system-level tests - thus making it hard to help diagnose problems. I have moved the more commonly-used functions here, and added one specifically for testing the SIGUSR1/2 functionality. I will be contacting people directly to seek help with testing the changes on more environments. Other than compile issues, you should see absolutely no change in behavior on any of your systems - this additional functionality is transparent to anyone who does not issue a SIGUSR1/2 to mpirun. Ralph This commit was SVN r10258.
2006-06-08 22:27:17 +04:00
opal_signal_add(&sigusr2_handler, NULL);
#endif /* __WINDOWS__ */
orte_totalview_init_before_spawn();
/* setup an event we can wait for to tell
* us to terminate - both normal and abnormal
* termination will call us here. Use the
* same exit fd as the daemon does so that orted_comm
* can cause either of us to exit since we share that code
*/
if (ORTE_SUCCESS != (rc = orte_wait_event(&orterun_event, &orte_exit, job_completed))) {
opal_show_help("help-orterun.txt", "orterun:event-def-failed", true,
orterun_basename, ORTE_ERROR_NAME(rc));
orte_exit_status = ORTE_ERROR_DEFAULT_EXIT_CODE;
goto DONE;
}
/* Spawn the job */
rc = orte_plm.spawn(jdata);
/* now wait until the termination event fires */
opal_event_dispatch();
/* we only reach this point by jumping there due
* to an error - so just cleanup and leave
*/
DONE:
/* whack any lingering session directory files from our jobs */
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
/* cleanup our data server */
orte_data_server_finalize();
orte_finalize();
free(orterun_basename);
return orte_exit_status;
}
/*
 * Event callback run when the job has finished, normally or otherwise.
 *
 * Reports abnormal terminations, converts orte_exit_status into the
 * value orterun will eventually exit with, and then asks the orteds to
 * shut down.  On the normal path this function does not return to its
 * caller: either terminated() exits the process, or the cleanup at the
 * bottom does.
 *
 * trigpipe - fd of the trigger pipe; closed here when non-negative
 * event    - unused
 * arg      - unused
 */
static void job_completed(int trigpipe, short event, void *arg)
{
    int rc;
    orte_job_state_t exit_state;

    /* one-shot trigger: close the pipe so we cannot be fired again */
    if (trigpipe >= 0) {
        close(trigpipe);
    }

    /* snapshot the job state before doing any reporting */
    exit_state = jdata->state;

    if (ORTE_JOB_STATE_TERMINATED != exit_state) {
        /* something went wrong - tell the user what we know */
        dump_aborted_procs();

        /* print a summary line for every category that holds more
         * than one process (hence the plural suffix is always "es")
         */
        if (num_failed_start > 1) {
            printf("%d total process%s failed to start\n",
                   num_failed_start, "es");
        }
        if (num_aborted > 1) {
            printf("%d total process%s aborted\n",
                   num_aborted, "es");
        }
        if (num_killed > 1) {
            printf("%d total process%s killed (some possibly by %s during cleanup)\n",
                   num_killed, "es", orterun_basename);
        }
    }

    /* Propagate the exit code.  A normal exit yields the process' own
     * status.  A failure-to-start is left untouched so it is not
     * mistaken for a signal.  Anything else is treated as death by
     * signal and mapped to "signo + 128", so that "prog" and
     * "orterun prog" hand the same status to the shell.
     */
    if (WIFEXITED(orte_exit_status)) {
        orte_exit_status = WEXITSTATUS(orte_exit_status);
    } else if (ORTE_JOB_STATE_FAILED_TO_START != exit_state) {
        orte_exit_status = WTERMSIG(orte_exit_status) + 128;
    }

    /* The job is done: arm an event that fires once the orteds are
     * gone, then tell the orteds they may finalize and exit.
     */
    rc = orte_wait_event(&orteds_exit_event, &orteds_exit, terminated);
    if (ORTE_SUCCESS != rc) {
        opal_show_help("help-orterun.txt", "orterun:event-def-failed", true,
                       orterun_basename, ORTE_ERROR_NAME(rc));
    } else {
        rc = orte_plm.terminate_orteds();
        if (ORTE_SUCCESS != rc) {
            opal_event_t *fallback_timer;

            /* The terminate order did not fully go out, so the exit
             * event armed above will never fire.  Remove it and fall
             * back to a timeout, giving the daemons that can still
             * respond a chance to do so.
             */
            opal_event_del(orteds_exit_event);
            ORTE_DETECT_TIMEOUT(&fallback_timer, orte_process_info.num_procs,
                                orte_timeout_usec_per_proc,
                                orte_max_timeout, terminated);
        }

        /* now wait to hear that it has been done */
        opal_event_dispatch();
    }

    /* Reached when the daemons could not be ordered to terminate (or
     * the dispatch loop returned): all that is left is to clean up and
     * exit ourselves.
     */
    orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);  /* lingering session dirs */
    orte_data_server_finalize();
    orte_finalize();
    free(orterun_basename);
    exit(rc);
}
/*
 * Final shutdown callback: runs once the orteds have exited, or once
 * the wait for them has timed out, and terminates orterun.
 *
 * Removes the signal handlers, reports any daemons that did not
 * confirm termination, cleans up and calls exit(orte_exit_status).
 * This function never returns.
 *
 * trigpipe - fd of the trigger pipe; closed here when non-negative
 *            (callers without a pipe pass -1)
 * event    - unused
 * arg      - unused
 */
static void terminated(int trigpipe, short event, void *arg)
{
    orte_job_t *daemons;
    orte_proc_t **procs;
    orte_vpid_t i;

    /* close the trigger pipe so it cannot be called again */
    if (0 <= trigpipe) {
        close(trigpipe);
    }

    /* Remove the TERM and INT signal handlers */
    opal_signal_del(&term_handler);
    opal_signal_del(&int_handler);
#ifndef __WINDOWS__
    /** Remove the USR signal handlers */
    opal_signal_del(&sigusr1_handler);
    opal_signal_del(&sigusr2_handler);
#endif  /* __WINDOWS__ */

    /* get the daemon job object */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (NULL == daemons) {
        /* Nothing more we can do - tell the user something is really
         * messed up.  We must NOT inspect the daemon job below (it is
         * NULL), so fall straight through to the cleanup and exit.
         */
        opal_show_help("help-orterun.txt", "orterun:no-orted-object-exit",
                       true, orterun_basename);
    } else if (daemons->num_terminated != daemons->num_procs) {
        /* Some daemons failed to respond.  Alert the user, list the
         * nodes that did not report, and warn that manual cleanup may
         * still be needed.
         */
        opal_show_help("help-orterun.txt", "orterun:unclean-exit",
                       true, orterun_basename);
        procs = (orte_proc_t**)daemons->procs->addr;
        /* start at 1: entry 0 is presumably our own, which has
         * already been marked as terminated
         */
        for (i = 1; i < daemons->num_procs; i++) {
            if (ORTE_PROC_STATE_TERMINATED != procs[i]->state) {
                /* print out node name */
                orte_node_t *node = procs[i]->node;
                if (NULL != node && NULL != node->name) {
                    fprintf(stderr, "\t%s\n", node->name);
                }
            }
        }
    } else {
        /* we cleaned up! let the user know */
        if (!orterun_globals.quiet && orte_abnormal_term_ordered) {
            fprintf(stderr, "%s: clean termination accomplished\n\n", orterun_basename);
        }
    }

    /* now clean ourselves up and exit */
    /* whack any lingering session directory files from our jobs */
    orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

    /* cleanup our data server */
    orte_data_server_finalize();

    orte_finalize();
    free(orterun_basename);
    exit(orte_exit_status);
}
/*
 * Explain an abnormal termination to the user.
 *
 * Walks the global job array looking for the first job whose state is
 * not one of the "healthy" ones, tallies how many of its processes
 * failed to start / aborted / were killed by a signal (into
 * num_failed_start, num_aborted and num_killed), and prints a help
 * message describing the process recorded as the job's aborted_proc.
 */
static void dump_aborted_procs(void)
{
    orte_std_cntr_t p, j;
    orte_proc_t *culprit, **members;
    orte_app_context_t **contexts;
    orte_job_t **all_jobs, *job;
    char *node_name;
    unsigned long vpid;

    /* Slot 0 holds the daemons, which will clearly be "running", so
     * the search starts at slot 1.
     */
    all_jobs = (orte_job_t**)orte_job_data->addr;
    for (j = 1; j < orte_job_data->size; j++) {
        job = all_jobs[j];
        if (NULL == job) {
            /* the array is left-justified: the first NULL ends the search */
            return;
        }

        /* jobs in any of these states are not to blame - keep looking */
        if (ORTE_JOB_STATE_UNDEF == job->state ||
            ORTE_JOB_STATE_INIT == job->state ||
            ORTE_JOB_STATE_LAUNCHED == job->state ||
            ORTE_JOB_STATE_RUNNING == job->state ||
            ORTE_JOB_STATE_TERMINATED == job->state ||
            ORTE_JOB_STATE_ABORT_ORDERED == job->state) {
            continue;
        }

        /* this is a guilty party */
        culprit = job->aborted_proc;
        members = (orte_proc_t**)job->procs->addr;
        contexts = (orte_app_context_t**)job->apps->addr;

        /* Tally the failed/aborted/killed processes.  The proc array is
         * left-justified as well, so stop at the first empty slot.
         */
        for (p = 0; p < job->procs->size && NULL != members[p]; p++) {
            if (ORTE_PROC_STATE_FAILED_TO_START == members[p]->state) {
                ++num_failed_start;
            } else if (ORTE_PROC_STATE_ABORTED == members[p]->state) {
                ++num_aborted;
            } else if (ORTE_PROC_STATE_ABORTED_BY_SIG == members[p]->state) {
                ++num_killed;
            }
        }

        if (ORTE_JOB_STATE_FAILED_TO_START == job->state) {
            if (NULL == culprit) {
                opal_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status-no-node", true,
                               orterun_basename);
                return;
            }

            node_name = culprit->node->name;
            vpid = (unsigned long)culprit->name.vpid;

            /* pick the message matching the recorded launch error */
            if (ORTE_ERR_SYS_LIMITS_PIPES == culprit->exit_code) {
                opal_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
                               orterun_basename, node_name, vpid);
            } else if (ORTE_ERR_PIPE_SETUP_FAILURE == culprit->exit_code) {
                opal_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
                               orterun_basename, node_name, vpid);
            } else if (ORTE_ERR_SYS_LIMITS_CHILDREN == culprit->exit_code) {
                opal_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
                               orterun_basename, node_name, vpid);
            } else if (ORTE_ERR_FAILED_GET_TERM_ATTRS == culprit->exit_code) {
                opal_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
                               orterun_basename, node_name, vpid);
            } else if (ORTE_ERR_WDIR_NOT_FOUND == culprit->exit_code) {
                opal_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
                               orterun_basename, contexts[culprit->app_idx]->cwd,
                               node_name, vpid);
            } else if (ORTE_ERR_EXE_NOT_FOUND == culprit->exit_code) {
                opal_show_help("help-orterun.txt", "orterun:exe-not-found", true,
                               orterun_basename, contexts[culprit->app_idx]->app,
                               node_name, vpid);
            } else if (ORTE_ERR_EXE_NOT_ACCESSIBLE == culprit->exit_code) {
                opal_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
                               orterun_basename, contexts[culprit->app_idx]->app,
                               node_name, vpid);
            } else if (ORTE_ERR_PIPE_READ_FAILURE == culprit->exit_code) {
                opal_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
                               orterun_basename, node_name, vpid);
            } else if (0 != culprit->exit_code) {
                /* some other error code: report it by name */
                opal_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
                               orterun_basename, ORTE_ERROR_NAME(culprit->exit_code),
                               node_name, vpid);
            } else {
                /* no status was recorded at all */
                opal_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
                               orterun_basename, node_name);
            }
        } else if (ORTE_JOB_STATE_ABORTED == job->state) {
            if (NULL == culprit) {
                opal_show_help("help-orterun.txt", "orterun:proc-aborted-unknown", true,
                               orterun_basename);
            } else {
                node_name = culprit->node->name;
                vpid = (unsigned long)culprit->name.vpid;
                opal_show_help("help-orterun.txt", "orterun:proc-ordered-abort", true,
                               orterun_basename, vpid, (unsigned long)culprit->pid,
                               node_name, orterun_basename);
            }
        } else if (ORTE_JOB_STATE_ABORTED_BY_SIG == job->state) {
            /* the job was brought down by a signal */
            if (NULL == culprit) {
                opal_show_help("help-orterun.txt", "orterun:proc-aborted-signal-unknown", true,
                               orterun_basename);
            } else {
                int signo = WTERMSIG(culprit->exit_code);
#ifdef HAVE_STRSIGNAL
                char *signame = strsignal(signo);
#endif
                node_name = culprit->node->name;
                vpid = (unsigned long)culprit->name.vpid;
#ifdef HAVE_STRSIGNAL
                if (NULL != signame) {
                    /* we have a description for this signal */
                    opal_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", true,
                                   orterun_basename, vpid, (unsigned long)culprit->pid,
                                   node_name, signo, signame);
                } else
#endif
                {
                    opal_show_help("help-orterun.txt", "orterun:proc-aborted", true,
                                   orterun_basename, vpid, (unsigned long)culprit->pid,
                                   node_name, signo);
                }
            }
        }

        /* only the first guilty job is reported */
        return;
    }

    /* Every slot was occupied and none was to blame: we could not find
     * the job that aborted - report that fact and give up.
     */
    opal_show_help("help-orterun.txt", "orterun:proc-aborted-unknown", true, orterun_basename);
}
/*
 * Timer callback used as the time limit on an ordered abort (it is the
 * callback abort_exit_callback hands to ORTE_DETECT_TIMEOUT).
 *
 * fd, ign, arg - unused
 */
static void timeout_callback(int fd, short ign, void *arg)
{
    /* Just call terminated() so we don't loop back into trying to kill
     * things.  There is no trigger pipe on this path, so pass -1:
     * terminated() only closes a non-negative fd.
     */
    terminated(-1, 0, NULL);
}
/*
 * Deferred part of the abort sequence, scheduled by
 * abort_signal_callback().
 *
 * Orders every job to terminate and arms a timeout in case the job
 * cannot be made to go away.  If no job was ever set up, it simply
 * cleans up and exits with status 1.
 *
 * fd, ign, arg - unused
 */
static void abort_exit_callback(int fd, short ign, void *arg)
{
    int term_rc;
    opal_event_t *kill_timer;

    /* an abort is already under way - leave it alone */
    if (orte_abort_in_progress) {
        return;
    }

    if (!orterun_globals.quiet) {
        fprintf(stderr, "%s: killing job...\n\n", orterun_basename);
    }

    /* NOTE: jdata cannot be NULL here, because the signals are not
     * trapped until after jdata has been OBJ_NEW'd.
     */
    if (ORTE_JOBID_INVALID == jdata->jobid) {
        /* The user hit ctrl-c before the job was set up in the system,
         * so there is nothing to terminate: clean up and leave.
         */
        orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

        /* jdata is not in the global array yet, so finalize will not
         * release it for us
         */
        OBJ_RELEASE(jdata);

        orte_finalize();
        free(orterun_basename);
        exit(1);
    }

    /* Limit how long we wait for the job to die, in case it cannot be
     * made to go away.  The timeout is not pointed straight at
     * job_completed: that function expects to be triggered through
     * orte_wakeup, so calling it from a timer could race, and the
     * timer could not hand it the orte_exit pipe fd to close.
     */
    ORTE_DETECT_TIMEOUT(&kill_timer, jdata->num_procs,
                        orte_timeout_usec_per_proc,
                        orte_max_timeout,
                        timeout_callback);

    /* Terminate the job - this also wakes up orterun so it can report
     * to the user and kill all the orteds.
     */
    term_rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD);
    if (ORTE_SUCCESS != term_rc) {
        /* the terminate order failed, so wake ourselves up explicitly
         * in order to exit
         */
        orte_wakeup(term_rc);
    }
}
/*
 * Handler for the abort signals (registered for SIGTERM and SIGINT).
 *
 * Marks the run as abnormally terminated and defers the actual
 * teardown to abort_exit_callback().
 *
 * fd, flags, arg - unused
 */
static void abort_signal_callback(int fd, short flags, void *arg)
{
    opal_event_t *deferred_abort;

    /* Act only on the first request: if termination has already been
     * ordered, or an abort is in progress, repeating it would only
     * invite race conditions.
     */
    if (!orte_abnormal_term_ordered && !orte_abort_in_progress) {
        /* set the global abnormal exit flag so we know not to use the
         * standard xcast for terminating orteds
         */
        orte_abnormal_term_ordered = true;

        /* We are inside an event handler, and the completion procedure
         * deletes the very signal handler that is running right now, so
         * it must not be called directly from here.  Instead, leave
         * this handler and let a timer event run abort_exit_callback()
         * afterwards.
         */
        ORTE_DETECT_TIMEOUT(&deferred_abort, 0, 0, 1, abort_exit_callback);
    }
}
Add ability to trap and propagate SIGUSR1/2 to remote processes. There are a number of small changes that hit a bunch of files: 1. Changed the RMGR and PLS APIs to add "signal_job" and "signal_proc" entry points. Only the "signal_job" entries are implemented - none of the components have implementations for "signal_proc" at this time. Thus, you can signal all of the procs in a job, but cannot currently signal only one specific proc. 2. Implemented those new API functions in all components except xgrid (Brian will do so very soon). Only the rsh/ssh and fork modules have been tested, however, and only under OS-X. 3. Added signal traps and callback functions for SIGUSR1/2 to orterun/mpirun that catch those signals and call the appropriate commands to propagate them out to all processes in the job. 4. Added a new test directory under the orte branch to (eventually) hold unit and system level tests for just the run-time. Since our test branch of the repository is under restricted access, people working on the RTE were continually developing their own system-level tests - thus making it hard to help diagnose problems. I have moved the more commonly-used functions here, and added one specifically for testing the SIGUSR1/2 functionality. I will be contacting people directly to seek help with testing the changes on more environments. Other than compile issues, you should see absolutely no change in behavior on any of your systems - this additional functionality is transparent to anyone who does not issue a SIGUSR1/2 to mpirun. Ralph This commit was SVN r10258.
2006-06-08 22:27:17 +04:00
/**
* Pass user signals to the remote application processes
*/
static void signal_forward_callback(int fd, short event, void *arg)
Add ability to trap and propagate SIGUSR1/2 to remote processes. There are a number of small changes that hit a bunch of files: 1. Changed the RMGR and PLS APIs to add "signal_job" and "signal_proc" entry points. Only the "signal_job" entries are implemented - none of the components have implementations for "signal_proc" at this time. Thus, you can signal all of the procs in a job, but cannot currently signal only one specific proc. 2. Implemented those new API functions in all components except xgrid (Brian will do so very soon). Only the rsh/ssh and fork modules have been tested, however, and only under OS-X. 3. Added signal traps and callback functions for SIGUSR1/2 to orterun/mpirun that catch those signals and call the appropriate commands to propagate them out to all processes in the job. 4. Added a new test directory under the orte branch to (eventually) hold unit and system level tests for just the run-time. Since our test branch of the repository is under restricted access, people working on the RTE were continually developing their own system-level tests - thus making it hard to help diagnose problems. I have moved the more commonly-used functions here, and added one specifically for testing the SIGUSR1/2 functionality. I will be contacting people directly to seek help with testing the changes on more environments. Other than compile issues, you should see absolutely no change in behavior on any of your systems - this additional functionality is transparent to anyone who does not issue a SIGUSR1/2 to mpirun. Ralph This commit was SVN r10258.
2006-06-08 22:27:17 +04:00
{
struct opal_event *signal = (struct opal_event*)arg;
int signum, ret;
Add ability to trap and propagate SIGUSR1/2 to remote processes. There are a number of small changes that hit a bunch of files: 1. Changed the RMGR and PLS APIs to add "signal_job" and "signal_proc" entry points. Only the "signal_job" entries are implemented - none of the components have implementations for "signal_proc" at this time. Thus, you can signal all of the procs in a job, but cannot currently signal only one specific proc. 2. Implemented those new API functions in all components except xgrid (Brian will do so very soon). Only the rsh/ssh and fork modules have been tested, however, and only under OS-X. 3. Added signal traps and callback functions for SIGUSR1/2 to orterun/mpirun that catch those signals and call the appropriate commands to propagate them out to all processes in the job. 4. Added a new test directory under the orte branch to (eventually) hold unit and system level tests for just the run-time. Since our test branch of the repository is under restricted access, people working on the RTE were continually developing their own system-level tests - thus making it hard to help diagnose problems. I have moved the more commonly-used functions here, and added one specifically for testing the SIGUSR1/2 functionality. I will be contacting people directly to seek help with testing the changes on more environments. Other than compile issues, you should see absolutely no change in behavior on any of your systems - this additional functionality is transparent to anyone who does not issue a SIGUSR1/2 to mpirun. Ralph This commit was SVN r10258.
2006-06-08 22:27:17 +04:00
signum = OPAL_EVENT_SIGNAL(signal);
if (!orterun_globals.quiet){
fprintf(stderr, "%s: Forwarding signal %d to job\n",
orterun_basename, signum);
}
Add ability to trap and propagate SIGUSR1/2 to remote processes. There are a number of small changes that hit a bunch of files: 1. Changed the RMGR and PLS APIs to add "signal_job" and "signal_proc" entry points. Only the "signal_job" entries are implemented - none of the components have implementations for "signal_proc" at this time. Thus, you can signal all of the procs in a job, but cannot currently signal only one specific proc. 2. Implemented those new API functions in all components except xgrid (Brian will do so very soon). Only the rsh/ssh and fork modules have been tested, however, and only under OS-X. 3. Added signal traps and callback functions for SIGUSR1/2 to orterun/mpirun that catch those signals and call the appropriate commands to propagate them out to all processes in the job. 4. Added a new test directory under the orte branch to (eventually) hold unit and system level tests for just the run-time. Since our test branch of the repository is under restricted access, people working on the RTE were continually developing their own system-level tests - thus making it hard to help diagnose problems. I have moved the more commonly-used functions here, and added one specifically for testing the SIGUSR1/2 functionality. I will be contacting people directly to seek help with testing the changes on more environments. Other than compile issues, you should see absolutely no change in behavior on any of your systems - this additional functionality is transparent to anyone who does not issue a SIGUSR1/2 to mpirun. Ralph This commit was SVN r10258.
2006-06-08 22:27:17 +04:00
/** send the signal out to the processes, including any descendants */
if (ORTE_SUCCESS != (ret = orte_plm.signal_job(jdata->jobid, signum))) {
fprintf(stderr, "Signal %d could not be sent to the job (returned %d)",
signum, ret);
Add ability to trap and propagate SIGUSR1/2 to remote processes. There are a number of small changes that hit a bunch of files: 1. Changed the RMGR and PLS APIs to add "signal_job" and "signal_proc" entry points. Only the "signal_job" entries are implemented - none of the components have implementations for "signal_proc" at this time. Thus, you can signal all of the procs in a job, but cannot currently signal only one specific proc. 2. Implemented those new API functions in all components except xgrid (Brian will do so very soon). Only the rsh/ssh and fork modules have been tested, however, and only under OS-X. 3. Added signal traps and callback functions for SIGUSR1/2 to orterun/mpirun that catch those signals and call the appropriate commands to propagate them out to all processes in the job. 4. Added a new test directory under the orte branch to (eventually) hold unit and system level tests for just the run-time. Since our test branch of the repository is under restricted access, people working on the RTE were continually developing their own system-level tests - thus making it hard to help diagnose problems. I have moved the more commonly-used functions here, and added one specifically for testing the SIGUSR1/2 functionality. I will be contacting people directly to seek help with testing the changes on more environments. Other than compile issues, you should see absolutely no change in behavior on any of your systems - this additional functionality is transparent to anyone who does not issue a SIGUSR1/2 to mpirun. Ralph This commit was SVN r10258.
2006-06-08 22:27:17 +04:00
}
}
/*
 * Put orterun_globals into its default state.
 *
 * On the first call only (tracked by globals_init) the lock is
 * constructed and the string pointers are cleared. On every call the
 * option flags are reset and the hostfile, env_val, appfile, wdir and
 * path strings are freed and cleared.
 *
 * Always returns ORTE_SUCCESS.
 */
static int init_globals(void)
{
    if (!globals_init) {
        /* One-time construction; the pointers are cleared here so the
         * free() calls below are well defined on the first pass. */
        OBJ_CONSTRUCT(&orterun_globals.lock, opal_mutex_t);
        orterun_globals.hostfile    = NULL;
        orterun_globals.env_val     = NULL;
        orterun_globals.appfile     = NULL;
        orterun_globals.wdir        = NULL;
        orterun_globals.path        = NULL;
        orterun_globals.ompi_server = NULL;
    }

    /* Everything from here on is reset on each invocation. */

    /* Boolean switches and counters */
    orterun_globals.help           = false;
    orterun_globals.version        = false;
    orterun_globals.verbose        = false;
    orterun_globals.quiet          = false;
    orterun_globals.by_node        = false;
    orterun_globals.by_slot        = false;
    orterun_globals.debugger       = false;
    orterun_globals.preload_binary = false;
    orterun_globals.num_procs      = 0;

    /* Strings left from a previous pass; free(NULL) is a no-op, so no
     * guard is needed. */
    free(orterun_globals.hostfile);
    orterun_globals.hostfile = NULL;

    free(orterun_globals.env_val);
    orterun_globals.env_val = NULL;

    free(orterun_globals.appfile);
    orterun_globals.appfile = NULL;

    free(orterun_globals.wdir);
    orterun_globals.wdir = NULL;

    free(orterun_globals.path);
    orterun_globals.path = NULL;

    /* These two are only cleared, not freed. */
    orterun_globals.preload_files          = NULL;
    orterun_globals.preload_files_dest_dir = NULL;

    globals_init = true;
    return ORTE_SUCCESS;
}
/*
 * Act on the global options already stored in orterun_globals.
 *
 *  - version: prints the version banner unless help is also being
 *    shown; exits if it was the only argument.
 *  - help (or no arguments at all): prints the usage text and exits.
 *  - debugger: hands the command line to orte_run_debugger().
 *  - by_node / by_slot: sets the rmaps "base_schedule_policy" MCA
 *    parameter; with neither given, by_slot simply becomes the default.
 *
 * Returns ORTE_SUCCESS when it returns at all.
 */
static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
{
    int param_id;
    bool want_help;

    want_help = (1 == argc) || orterun_globals.help;

    /* The version check comes ahead of the help check so that
       --version --help works as one might expect. */
    if (orterun_globals.version && !want_help) {
        char *project_name = (0 == strcmp(orterun_basename, "mpirun"))
                                 ? "Open MPI" : "OpenRTE";

        opal_show_help("help-orterun.txt", "orterun:version", false,
                       orterun_basename, project_name, OPAL_VERSION,
                       PACKAGE_BUGREPORT);

        /* Nothing else on the command line: we are finished */
        if (2 == argc) {
            exit(0);
        }
    }

    /* Usage requested, or no arguments given */
    if (want_help) {
        char *project_name = (0 == strcmp(orterun_basename, "mpirun"))
                                 ? "Open MPI" : "OpenRTE";
        char *usage = opal_cmd_line_get_usage_msg(cmd_line);

        opal_show_help("help-orterun.txt", "orterun:usage", false,
                       orterun_basename, project_name, OPAL_VERSION,
                       orterun_basename, usage,
                       PACKAGE_BUGREPORT);
        free(usage);

        /* A help request is the only thing we act on */
        exit(0);
    }

    /* User-level debugger requested? */
    if (orterun_globals.debugger) {
        orte_run_debugger(orterun_basename, cmd_line, argc, argv);
    }

    /* Neither -bynode nor -byslot given: fall back to by-slot without
     * touching the MCA parameter, since it really should be
     * initialized in rmaps_base_open. */
    if (!orterun_globals.by_node && !orterun_globals.by_slot) {
        orterun_globals.by_slot = true;
        return ORTE_SUCCESS;
    }

    /* Allocate and map by node or by slot: shortcut for setting an
       MCA param.  by_node wins if both were given. */
    {
        char *policy = NULL;
        bool by_node = orterun_globals.by_node;

        param_id = mca_base_param_reg_string_name("rmaps", "base_schedule_policy",
                                                  "Scheduling policy for RMAPS. [slot | node]",
                                                  false, false, "slot", &policy);
        orterun_globals.by_slot = !by_node;
        mca_base_param_set_string(param_id, by_node ? "node" : "slot");
        free(policy);
    }

    return ORTE_SUCCESS;
}
static int parse_locals(int argc, char* argv[])
{
int i, rc, app_num;
int temp_argc;
char **temp_argv, **env;
orte_app_context_t *app;
bool made_app;
orte_std_cntr_t j, size1;
/* if the ompi-server was given, then set it up here */
if (NULL != orterun_globals.ompi_server) {
/* someone could have passed us a file instead of a uri, so
* we need to first check to see what we have - if it starts
* with "file", then we know it is a file. Otherwise, we assume
* it is a uri as provided by the ompi-server's output
* of an ORTE-standard string. Note that this is NOT a standard
* uri as it starts with the process name!
*/
if (0 == strncmp(orterun_globals.ompi_server, "file", strlen("file"))) {
char input[1024], *filename;
FILE *fp;
/* it is a file - get the filename */
filename = strchr(orterun_globals.ompi_server, ':');
if (NULL == filename) {
/* filename is not correctly formatted */
opal_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true,
orterun_basename, orterun_globals.ompi_server);
exit(1);
}
++filename; /* space past the : */
if (0 >= strlen(filename)) {
/* they forgot to give us the name! */
opal_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true,
orterun_basename, orterun_globals.ompi_server);
exit(1);
}
/* open the file and extract the uri */
fp = fopen(filename, "r");
if (NULL == fp) { /* can't find or read file! */
opal_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true,
orterun_basename, orterun_globals.ompi_server);
exit(1);
}
if (NULL == fgets(input, 1024, fp)) {
/* something malformed about file */
fclose(fp);
opal_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true,
orterun_basename, orterun_globals.ompi_server,
orterun_basename);
exit(1);
}
fclose(fp);
input[strlen(input)-1] = '\0'; /* remove newline */
ompi_server = strdup(input);
} else {
ompi_server = strdup(orterun_globals.ompi_server);
}
}
/* Make the apps */
temp_argc = 0;
temp_argv = NULL;
opal_argv_append(&temp_argc, &temp_argv, argv[0]);
/* NOTE: This bogus env variable is necessary in the calls to
create_app(), below. See comment immediately before the
create_app() function for an explanation. */
env = NULL;
for (app_num = 0, i = 1; i < argc; ++i) {
if (0 == strcmp(argv[i], ":")) {
/* Make an app with this argv */
if (opal_argv_count(temp_argv) > 1) {
if (NULL != env) {
opal_argv_free(env);
env = NULL;
}
app = NULL;
rc = create_app(temp_argc, temp_argv, &app, &made_app, &env);
/** keep track of the number of apps - point this app_context to that index */
if (ORTE_SUCCESS != rc) {
/* Assume that the error message has already been
printed; no need to cleanup -- we can just
exit */
exit(1);
}
if (made_app) {
app->idx = app_num;
++app_num;
opal_pointer_array_add(jdata->apps, app);
++jdata->num_apps;
}
/* Reset the temps */
temp_argc = 0;
temp_argv = NULL;
opal_argv_append(&temp_argc, &temp_argv, argv[0]);
}
} else {
opal_argv_append(&temp_argc, &temp_argv, argv[i]);
}
}
if (opal_argv_count(temp_argv) > 1) {
app = NULL;
rc = create_app(temp_argc, temp_argv, &app, &made_app, &env);
if (ORTE_SUCCESS != rc) {
/* Assume that the error message has already been printed;
no need to cleanup -- we can just exit */
exit(1);
}
if (made_app) {
app->idx = app_num;
++app_num;
opal_pointer_array_add(jdata->apps, app);
++jdata->num_apps;
}
}
if (NULL != env) {
opal_argv_free(env);
}
opal_argv_free(temp_argv);
/* Once we've created all the apps, add the global MCA params to
each app's environment (checking for duplicates, of
course -- yay opal_environ_merge()). */
if (NULL != global_mca_env) {
size1 = (size_t)opal_pointer_array_get_size(jdata->apps);
/* Iterate through all the apps */
for (j = 0; j < size1; ++j) {
app = (orte_app_context_t *)
opal_pointer_array_get_item(jdata->apps, j);
if (NULL != app) {
/* Use handy utility function */
env = opal_environ_merge(global_mca_env, app->env);
opal_argv_free(app->env);
app->env = env;
}
}
}
/* Now take a subset of the MCA params and set them as MCA
overrides here in orterun (so that when we orte_init() later,
all the components see these MCA params). Here's how we decide
which subset of the MCA params we set here in orterun:
1. If any global MCA params were set, use those
2. If no global MCA params were set and there was only one app,
then use its app MCA params
3. Otherwise, don't set any
*/
env = NULL;
if (NULL != global_mca_env) {
env = global_mca_env;
} else {
if (opal_pointer_array_get_size(jdata->apps) >= 1) {
/* Remember that pointer_array's can be padded with NULL
entries; so only use the app's env if there is exactly
1 non-NULL entry */
app = (orte_app_context_t *)
opal_pointer_array_get_item(jdata->apps, 0);
if (NULL != app) {
env = app->env;
for (j = 1; j < opal_pointer_array_get_size(jdata->apps); ++j) {
if (NULL != opal_pointer_array_get_item(jdata->apps, j)) {
env = NULL;
break;
}
}
}
}
}
if (NULL != env) {
size1 = opal_argv_count(env);
for (j = 0; j < size1; ++j) {
putenv(env[j]);
}
}
/* All done */
return ORTE_SUCCESS;
}
/*
* This function takes a "char ***app_env" parameter to handle the
* specific case:
*
* orterun --mca foo bar -app appfile
*
* That is, we'll need to keep foo=bar, but the presence of the app
* file will cause an invocation of parse_appfile(), which will cause
* one or more recursive calls back to create_app(). Since the
* foo=bar value applies globally to all apps in the appfile, we need
* to pass in the "base" environment (that contains the foo=bar value)
* when we parse each line in the appfile.
*
* This is really just a special case -- when we have a simple case like:
*
* orterun --mca foo bar -np 4 hostname
*
* Then the upper-level function (parse_locals()) calls create_app()
* with a NULL value for app_env, meaning that there is no "base"
* environment that the app needs to be created from.
*/
static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
bool *made_app, char ***app_env)
{
opal_cmd_line_t cmd_line;
char cwd[OMPI_PATH_MAX];
int i, j, count, rc;
char *param, *value, *value2;
orte_app_context_t *app = NULL;
#if 0 /* Used only in the C/N notion case, remove to silence compiler warnings */
orte_std_cntr_t l, len;
#endif
bool map_data = false, save_arg, cmd_line_made = false;
int new_argc = 0;
char **new_argv = NULL;
*made_app = false;
/* Pre-process the command line:
- convert C, cX, N, nX arguments to "-rawmap <id> <arg>" so
that the parser can pick it up nicely.
- convert -host to -rawmap <id> <arg>
- convert -arch to -rawmap <id> <arg>
Converting these to the same argument type will a) simplify the
logic down below, and b) allow us to preserve the ordering of
these arguments as the user specified them on the command
line. */
for (i = 0; i < argc; ++i) {
map_data = false;
save_arg = true;
/* JJH To fix in the future
* Currently C/N notation is not supported so don't execute this check
* Bug: Make this context sensitive since it will not behave properly
* with the following argument set:
* $ orterun -np 2 -host c2,c3,c12 hostname
* Since it will see the hosts c2, c3, and c12 as C options instead
* of hostnames.
*/
if(false) { ; } /* Wrapper to preserve logic continuation while the below
is commented out */
#if 0
if (0 == strcmp(argv[i], "C") ||
0 == strcmp(argv[i], "N")) {
map_data = true;
}
/* Heuristic: if the string fits "[cn][0-9]+" or "[cn][0-9],",
then accept it as mapping data */
else if ('c' == argv[i][0] || 'n' == argv[i][0]) {
len = strlen(argv[i]);
if (len > 1) {
for (l = 1; l < len; ++l) {
if (',' == argv[i][l]) {
map_data = true;
break;
} else if (!isdigit(argv[i][l])) {
break;
}
}
if (l >= len) {
map_data = true;
}
}
}
#endif
#if 0
/* JMS commented out because we don't handle this in any
mapper */
/* Save -arch args */
else if (0 == strcmp("-arch", argv[i])) {
char str[2] = { '0' + ORTE_APP_CONTEXT_MAP_ARCH, '\0' };
opal_argv_append(&new_argc, &new_argv, "-rawmap");
opal_argv_append(&new_argc, &new_argv, str);
save_arg = false;
}
#endif
/* Save -hostfile args since they can be spec'd
* on a per-app_context basis
*/
else if (0 == strcmp("--hostfile",argv[i]) ||
0 == strcmp("-hostfile", argv[i]) ||
0 == strcmp("--machinefile", argv[i]) ||
0 == strcmp("-machinefile", argv[i])) {
opal_argv_append(&new_argc, &new_argv, "-rawhosts");
save_arg = false;
}
/* Save -host args */
else if (0 == strcmp("--host",argv[i]) ||
0 == strcmp("-host", argv[i]) ||
0 == strcmp("-H", argv[i])) {
char str[2] = { '0' + ORTE_APP_CONTEXT_MAP_HOSTNAME, '\0' };
opal_argv_append(&new_argc, &new_argv, "-rawmap");
opal_argv_append(&new_argc, &new_argv, str);
save_arg = false;
}
These changes were mostly captured in a prior RFC (except for #2 below) and are aimed specifically at improving startup performance and setting up the remaining modifications described in that RFC. The commit has been tested for C/R and Cray operations, and on Odin (SLURM, rsh) and RoadRunner (TM). I tried to update all environments, but obviously could not test them. I know that Windows needs some work, and have highlighted what is know to be needed in the odls process component. This represents a lot of work by Brian, Tim P, Josh, and myself, with much advice from Jeff and others. For posterity, I have appended a copy of the email describing the work that was done: As we have repeatedly noted, the modex operation in MPI_Init is the single greatest consumer of time during startup. To-date, we have executed that operation as an ORTE stage gate that held the process until a startup message containing all required modex (and OOB contact info - see #3 below) info could be sent to it. Each process would send its data to the HNP's registry, which assembled and sent the message when all processes had reported in. In addition, ORTE had taken responsibility for monitoring process status as it progressed through a series of "stage gates". The process reported its status at each gate, and ORTE would then send a "release" message once all procs had reported in. The incoming changes revamp these procedures in three ways: 1. eliminating the ORTE stage gate system and cleanly delineating responsibility between the OMPI and ORTE layers for MPI init/finalize. The modex stage gate (STG1) has been replaced by a collective operation in the modex itself that performs an allgather on the required modex info. The allgather is implemented using the orte_grpcomm framework since the BTL's are not active at that point. 
At the moment, the grpcomm framework only has a "basic" component analogous to OMPI's "basic" coll framework - I would recommend that the MPI team create additional, more advanced components to improve performance of this step. The other stage gates have been replaced by orte_grpcomm barrier functions. We tried to use MPI barriers instead (since the BTL's are active at that point), but - as we discussed on the telecon - these are not currently true barriers so the job would hang when we fell through while messages were still in process. Note that the grpcomm barrier doesn't actually resolve that problem, but Brian has pointed out that we are unlikely to ever see it violated. Again, you might want to spend a little time on an advanced barrier algorithm as the one in "basic" is very simplistic. Summarizing this change: ORTE no longer tracks process state nor has direct responsibility for synchronizing jobs. This is now done via collective operations within the MPI layer, albeit using ORTE collective communication services. I -strongly- urge the MPI team to implement advanced collective algorithms to improve the performance of this critical procedure. 2. reducing the volume of data exchanged during modex. Data in the modex consisted of the process name, the name of the node where that process is located (expressed as a string), plus a string representation of all contact info. The nodename was required in order for the modex to determine if the process was local or not - in addition, some people like to have it to print pretty error messages when a connection failed. The size of this data has been reduced in three ways: (a) reducing the size of the process name itself. The process name consisted of two 32-bit fields for the jobid and vpid. This is far larger than any current system, or system likely to exist in the near future, can support. 
Accordingly, the default size of these fields has been reduced to 16-bits, which means you can have 32k procs in each of 32k jobs. Since the daemons must have a vpid, and we require one daemon/node, this also restricts the default configuration to 32k nodes. To support any future "mega-clusters", a configuration option --enable-jumbo-apps has been added. This option increases the jobid and vpid field sizes to 32-bits. Someday, if necessary, someone can add yet another option to increase them to 64-bits, I suppose. (b) replacing the string nodename with an integer nodeid. Since we have one daemon/node, the nodeid corresponds to the local daemon's vpid. This replaces an often lengthy string with only 2 (or at most 4) bytes, a substantial reduction. (c) when the mca param requesting that nodenames be sent to support pretty error messages, a second mca param is now used to request FQDN - otherwise, the domain name is stripped (by default) from the message to save space. If someone wants to combine those into a single param somehow (perhaps with an argument?), they are welcome to do so - I didn't want to alter what people are already using. While these may seem like small savings, they actually amount to a significant impact when aggregated across the entire modex operation. Since every proc must receive the modex data regardless of the collective used to send it, just reducing the size of the process name removes nearly 400MBytes of communication from a 32k proc job (admittedly, much of this comm may occur in parallel). So it does add up pretty quickly. 3. routing RML messages to reduce connections. The default messaging system remains point-to-point - i.e., each proc opens a socket to every proc it communicates with and sends its messages directly. A new option uses the orteds as routers - i.e., each proc only opens a single socket to its local orted. 
All messages are sent from the proc to the orted, which forwards the message to the orted on the node where the intended recipient proc is located - that orted then forwards the message to its local proc (the recipient). This greatly reduces the connection storm we have encountered during startup. It also has the benefit of removing the sharing of every proc's OOB contact with every other proc. The orted routing tables are populated during launch since every orted gets a map of where every proc is being placed. Each proc, therefore, only needs to know the contact info for its local daemon, which is passed in via the environment when the proc is fork/exec'd by the daemon. This alone removes ~50 bytes/process of communication that was in the current STG1 startup message - so for our 32k proc job, this saves us roughly 32k*50 = 1.6MBytes sent to 32k procs = 51GBytes of messaging. Note that you can use the new routing method by specifying -mca routed tree - if you so desire. This mode will become the default at some point in the future. There are a few minor additional changes in the commit that I'll just note in passing: * propagation of command line mca params to the orteds - fixes ticket #1073. See note there for details. * requiring of "finalize" prior to "exit" for MPI procs - fixes ticket #1144. See note there for details. * cleanup of some stale header files This commit was SVN r16364.
2007-10-05 23:48:23 +04:00
/* save any mca command line args so they can be passed
* separately to the daemons.
* Only do so here if we are going to parse an appfile later.
* Use Case:
* $ cat launch.appfile
* -np 1 -mca aaa bbb ./my-app -mca ccc ddd
* -np 1 -mca aaa bbb ./my-app -mca eee fff
* $ mpirun -np 2 -mca foo bar --app launch.appfile
* Only pick up '-mca foo bar' on this pass.
These changes were mostly captured in a prior RFC (except for #2 below) and are aimed specifically at improving startup performance and setting up the remaining modifications described in that RFC. The commit has been tested for C/R and Cray operations, and on Odin (SLURM, rsh) and RoadRunner (TM). I tried to update all environments, but obviously could not test them. I know that Windows needs some work, and have highlighted what is know to be needed in the odls process component. This represents a lot of work by Brian, Tim P, Josh, and myself, with much advice from Jeff and others. For posterity, I have appended a copy of the email describing the work that was done: As we have repeatedly noted, the modex operation in MPI_Init is the single greatest consumer of time during startup. To-date, we have executed that operation as an ORTE stage gate that held the process until a startup message containing all required modex (and OOB contact info - see #3 below) info could be sent to it. Each process would send its data to the HNP's registry, which assembled and sent the message when all processes had reported in. In addition, ORTE had taken responsibility for monitoring process status as it progressed through a series of "stage gates". The process reported its status at each gate, and ORTE would then send a "release" message once all procs had reported in. The incoming changes revamp these procedures in three ways: 1. eliminating the ORTE stage gate system and cleanly delineating responsibility between the OMPI and ORTE layers for MPI init/finalize. The modex stage gate (STG1) has been replaced by a collective operation in the modex itself that performs an allgather on the required modex info. The allgather is implemented using the orte_grpcomm framework since the BTL's are not active at that point. 
At the moment, the grpcomm framework only has a "basic" component analogous to OMPI's "basic" coll framework - I would recommend that the MPI team create additional, more advanced components to improve performance of this step. The other stage gates have been replaced by orte_grpcomm barrier functions. We tried to use MPI barriers instead (since the BTL's are active at that point), but - as we discussed on the telecon - these are not currently true barriers so the job would hang when we fell through while messages were still in process. Note that the grpcomm barrier doesn't actually resolve that problem, but Brian has pointed out that we are unlikely to ever see it violated. Again, you might want to spend a little time on an advanced barrier algorithm as the one in "basic" is very simplistic. Summarizing this change: ORTE no longer tracks process state nor has direct responsibility for synchronizing jobs. This is now done via collective operations within the MPI layer, albeit using ORTE collective communication services. I -strongly- urge the MPI team to implement advanced collective algorithms to improve the performance of this critical procedure. 2. reducing the volume of data exchanged during modex. Data in the modex consisted of the process name, the name of the node where that process is located (expressed as a string), plus a string representation of all contact info. The nodename was required in order for the modex to determine if the process was local or not - in addition, some people like to have it to print pretty error messages when a connection failed. The size of this data has been reduced in three ways: (a) reducing the size of the process name itself. The process name consisted of two 32-bit fields for the jobid and vpid. This is far larger than any current system, or system likely to exist in the near future, can support. 
Accordingly, the default size of these fields has been reduced to 16-bits, which means you can have 32k procs in each of 32k jobs. Since the daemons must have a vpid, and we require one daemon/node, this also restricts the default configuration to 32k nodes. To support any future "mega-clusters", a configuration option --enable-jumbo-apps has been added. This option increases the jobid and vpid field sizes to 32-bits. Someday, if necessary, someone can add yet another option to increase them to 64-bits, I suppose. (b) replacing the string nodename with an integer nodeid. Since we have one daemon/node, the nodeid corresponds to the local daemon's vpid. This replaces an often lengthy string with only 2 (or at most 4) bytes, a substantial reduction. (c) when the mca param requesting that nodenames be sent to support pretty error messages, a second mca param is now used to request FQDN - otherwise, the domain name is stripped (by default) from the message to save space. If someone wants to combine those into a single param somehow (perhaps with an argument?), they are welcome to do so - I didn't want to alter what people are already using. While these may seem like small savings, they actually amount to a significant impact when aggregated across the entire modex operation. Since every proc must receive the modex data regardless of the collective used to send it, just reducing the size of the process name removes nearly 400MBytes of communication from a 32k proc job (admittedly, much of this comm may occur in parallel). So it does add up pretty quickly. 3. routing RML messages to reduce connections. The default messaging system remains point-to-point - i.e., each proc opens a socket to every proc it communicates with and sends its messages directly. A new option uses the orteds as routers - i.e., each proc only opens a single socket to its local orted. 
All messages are sent from the proc to the orted, which forwards the message to the orted on the node where the intended recipient proc is located - that orted then forwards the message to its local proc (the recipient). This greatly reduces the connection storm we have encountered during startup. It also has the benefit of removing the sharing of every proc's OOB contact with every other proc. The orted routing tables are populated during launch since every orted gets a map of where every proc is being placed. Each proc, therefore, only needs to know the contact info for its local daemon, which is passed in via the environment when the proc is fork/exec'd by the daemon. This alone removes ~50 bytes/process of communication that was in the current STG1 startup message - so for our 32k proc job, this saves us roughly 32k*50 = 1.6MBytes sent to 32k procs = 51GBytes of messaging. Note that you can use the new routing method by specifying -mca routed tree - if you so desire. This mode will become the default at some point in the future. There are a few minor additional changes in the commit that I'll just note in passing: * propagation of command line mca params to the orteds - fixes ticket #1073. See note there for details. * requiring of "finalize" prior to "exit" for MPI procs - fixes ticket #1144. See note there for details. * cleanup of some stale header files This commit was SVN r16364.
2007-10-05 23:48:23 +04:00
*/
if (NULL != orterun_globals.appfile) {
if (0 == strcmp("-mca", argv[i]) ||
0 == strcmp("--mca", argv[i]) ) {
opal_argv_append_nosize(&orted_cmd_line, argv[i]);
opal_argv_append_nosize(&orted_cmd_line, argv[i+1]);
opal_argv_append_nosize(&orted_cmd_line, argv[i+2]);
}
These changes were mostly captured in a prior RFC (except for #2 below) and are aimed specifically at improving startup performance and setting up the remaining modifications described in that RFC. The commit has been tested for C/R and Cray operations, and on Odin (SLURM, rsh) and RoadRunner (TM). I tried to update all environments, but obviously could not test them. I know that Windows needs some work, and have highlighted what is know to be needed in the odls process component. This represents a lot of work by Brian, Tim P, Josh, and myself, with much advice from Jeff and others. For posterity, I have appended a copy of the email describing the work that was done: As we have repeatedly noted, the modex operation in MPI_Init is the single greatest consumer of time during startup. To-date, we have executed that operation as an ORTE stage gate that held the process until a startup message containing all required modex (and OOB contact info - see #3 below) info could be sent to it. Each process would send its data to the HNP's registry, which assembled and sent the message when all processes had reported in. In addition, ORTE had taken responsibility for monitoring process status as it progressed through a series of "stage gates". The process reported its status at each gate, and ORTE would then send a "release" message once all procs had reported in. The incoming changes revamp these procedures in three ways: 1. eliminating the ORTE stage gate system and cleanly delineating responsibility between the OMPI and ORTE layers for MPI init/finalize. The modex stage gate (STG1) has been replaced by a collective operation in the modex itself that performs an allgather on the required modex info. The allgather is implemented using the orte_grpcomm framework since the BTL's are not active at that point. 
At the moment, the grpcomm framework only has a "basic" component analogous to OMPI's "basic" coll framework - I would recommend that the MPI team create additional, more advanced components to improve performance of this step. The other stage gates have been replaced by orte_grpcomm barrier functions. We tried to use MPI barriers instead (since the BTL's are active at that point), but - as we discussed on the telecon - these are not currently true barriers so the job would hang when we fell through while messages were still in process. Note that the grpcomm barrier doesn't actually resolve that problem, but Brian has pointed out that we are unlikely to ever see it violated. Again, you might want to spend a little time on an advanced barrier algorithm as the one in "basic" is very simplistic. Summarizing this change: ORTE no longer tracks process state nor has direct responsibility for synchronizing jobs. This is now done via collective operations within the MPI layer, albeit using ORTE collective communication services. I -strongly- urge the MPI team to implement advanced collective algorithms to improve the performance of this critical procedure. 2. reducing the volume of data exchanged during modex. Data in the modex consisted of the process name, the name of the node where that process is located (expressed as a string), plus a string representation of all contact info. The nodename was required in order for the modex to determine if the process was local or not - in addition, some people like to have it to print pretty error messages when a connection failed. The size of this data has been reduced in three ways: (a) reducing the size of the process name itself. The process name consisted of two 32-bit fields for the jobid and vpid. This is far larger than any current system, or system likely to exist in the near future, can support. 
Accordingly, the default size of these fields has been reduced to 16-bits, which means you can have 32k procs in each of 32k jobs. Since the daemons must have a vpid, and we require one daemon/node, this also restricts the default configuration to 32k nodes. To support any future "mega-clusters", a configuration option --enable-jumbo-apps has been added. This option increases the jobid and vpid field sizes to 32-bits. Someday, if necessary, someone can add yet another option to increase them to 64-bits, I suppose. (b) replacing the string nodename with an integer nodeid. Since we have one daemon/node, the nodeid corresponds to the local daemon's vpid. This replaces an often lengthy string with only 2 (or at most 4) bytes, a substantial reduction. (c) when the mca param requesting that nodenames be sent to support pretty error messages, a second mca param is now used to request FQDN - otherwise, the domain name is stripped (by default) from the message to save space. If someone wants to combine those into a single param somehow (perhaps with an argument?), they are welcome to do so - I didn't want to alter what people are already using. While these may seem like small savings, they actually amount to a significant impact when aggregated across the entire modex operation. Since every proc must receive the modex data regardless of the collective used to send it, just reducing the size of the process name removes nearly 400MBytes of communication from a 32k proc job (admittedly, much of this comm may occur in parallel). So it does add up pretty quickly. 3. routing RML messages to reduce connections. The default messaging system remains point-to-point - i.e., each proc opens a socket to every proc it communicates with and sends its messages directly. A new option uses the orteds as routers - i.e., each proc only opens a single socket to its local orted. 
All messages are sent from the proc to the orted, which forwards the message to the orted on the node where the intended recipient proc is located - that orted then forwards the message to its local proc (the recipient). This greatly reduces the connection storm we have encountered during startup. It also has the benefit of removing the sharing of every proc's OOB contact with every other proc. The orted routing tables are populated during launch since every orted gets a map of where every proc is being placed. Each proc, therefore, only needs to know the contact info for its local daemon, which is passed in via the environment when the proc is fork/exec'd by the daemon. This alone removes ~50 bytes/process of communication that was in the current STG1 startup message - so for our 32k proc job, this saves us roughly 32k*50 = 1.6MBytes sent to 32k procs = 51GBytes of messaging. Note that you can use the new routing method by specifying -mca routed tree - if you so desire. This mode will become the default at some point in the future. There are a few minor additional changes in the commit that I'll just note in passing: * propagation of command line mca params to the orteds - fixes ticket #1073. See note there for details. * requiring of "finalize" prior to "exit" for MPI procs - fixes ticket #1144. See note there for details. * cleanup of some stale header files This commit was SVN r16364.
2007-10-05 23:48:23 +04:00
}
/* If this token was C/N map data, save it */
if (map_data) {
char str[2] = { '0' + ORTE_APP_CONTEXT_MAP_CN, '\0' };
opal_argv_append(&new_argc, &new_argv, "-rawmap");
opal_argv_append(&new_argc, &new_argv, str);
}
if (save_arg) {
opal_argv_append(&new_argc, &new_argv, argv[i]);
}
}
/* Parse application command line options. Add the -rawmap option
separately so that the user doesn't see it in the --help
message. Ditto for the -rawhosts option */
init_globals();
opal_cmd_line_create(&cmd_line, cmd_line_init);
mca_base_cmd_line_setup(&cmd_line);
cmd_line_made = true;
opal_cmd_line_make_opt3(&cmd_line, '\0', NULL, "rawmap", 2,
"Hidden / internal parameter -- users should not use this!");
opal_cmd_line_make_opt3(&cmd_line, '\0', NULL, "rawhosts", 1, /* only one arg */
"Hidden / internal parameter -- users should not use this!");
rc = opal_cmd_line_parse(&cmd_line, true, new_argc, new_argv);
opal_argv_free(new_argv);
new_argv = NULL;
if (ORTE_SUCCESS != rc) {
goto cleanup;
}
mca_base_cmd_line_process_args(&cmd_line, app_env, &global_mca_env);
/* Is there an appfile in here? */
if (NULL != orterun_globals.appfile) {
OBJ_DESTRUCT(&cmd_line);
return parse_appfile(strdup(orterun_globals.appfile), app_env);
}
/* Setup application context */
app = OBJ_NEW(orte_app_context_t);
opal_cmd_line_get_tail(&cmd_line, &count, &app->argv);
/* See if we have anything left */
if (0 == count) {
opal_show_help("help-orterun.txt", "orterun:executable-not-specified",
true, orterun_basename, orterun_basename);
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
/*
* Determine the application name, and location in the argv so we do not
* accidentally pick up the application's arguments while trying to get
* our own. Example:
* mpirun -np 2 -mca foo bar ./my-app -mca bip bop
* We want to pick up '-mca foo bar' but not '-mca bip bop'
*/
for (i = 0; i < (argc - count); ++i) {
/* save any mca command line args so they can be passed
* separately to the daemons
*/
if (0 == strcmp("-mca", argv[i]) ||
0 == strcmp("--mca", argv[i]) ) {
opal_argv_append_nosize(&orted_cmd_line, argv[i]);
opal_argv_append_nosize(&orted_cmd_line, argv[i+1]);
opal_argv_append_nosize(&orted_cmd_line, argv[i+2]);
i += 2;
}
}
/* Grab all OMPI_* environment variables */
app->env = opal_argv_copy(*app_env);
for (i = 0; NULL != environ[i]; ++i) {
if (0 == strncmp("OMPI_", environ[i], 5)) {
opal_argv_append_nosize(&app->env, environ[i]);
}
}
/* add the ompi-server, if provided */
if (NULL != ompi_server) {
bool found_serv = false;
asprintf(&param, "OMPI_MCA_dpm_orte_server=%s", ompi_server);
/* this shouldn't exist, but if it does... */
for (i=0; i < opal_argv_count(app->env); i++) {
if (0 == strcmp(param, app->env[i])) {
free(app->env[i]);
app->env[i] = strdup(param);
found_serv = true;
break;
}
}
if (!found_serv) {
opal_argv_append_nosize(&app->env, param); /* add it */
}
free(param);
}
/* Did the user request to export any environment variables? */
if (opal_cmd_line_is_taken(&cmd_line, "x")) {
j = opal_cmd_line_get_ninsts(&cmd_line, "x");
for (i = 0; i < j; ++i) {
param = opal_cmd_line_get_param(&cmd_line, "x", i, 0);
if (NULL != strchr(param, '=')) {
opal_argv_append_nosize(&app->env, param);
} else {
value = getenv(param);
if (NULL != value) {
if (NULL != strchr(value, '=')) {
opal_argv_append_nosize(&app->env, value);
} else {
asprintf(&value2, "%s=%s", param, value);
opal_argv_append_nosize(&app->env, value2);
free(value2);
}
} else {
opal_output(0, "Warning: could not find environment variable \"%s\"\n", param);
}
}
}
}
/* Did the user request a specific path? */
if (NULL != orterun_globals.path) {
asprintf(&value, "PATH=%s", orterun_globals.path);
opal_argv_append_nosize(&app->env, value);
free(value);
}
/* Did the user request a specific wdir? */
if (NULL != orterun_globals.wdir) {
app->cwd = strdup(orterun_globals.wdir);
app->user_specified_cwd = true;
} else {
if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
opal_show_help("help-orterun.txt", "orterun:init-failure",
true, "get the cwd", rc);
goto cleanup;
}
app->cwd = strdup(cwd);
app->user_specified_cwd = false;
}
/* Check to see if the user explicitly wanted to disable automatic
--prefix behavior */
if (opal_cmd_line_is_taken(&cmd_line, "noprefix")) {
want_prefix_by_default = false;
}
/* Did the user specify a specific prefix for this app_context_t
or provide an absolute path name to argv[0]? */
if (opal_cmd_line_is_taken(&cmd_line, "prefix") ||
'/' == argv[0][0] || want_prefix_by_default) {
size_t param_len;
/* The --prefix option takes precedence over /path/to/orterun */
if (opal_cmd_line_is_taken(&cmd_line, "prefix")) {
param = opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0);
}
/* /path/to/orterun */
else if ('/' == argv[0][0]) {
char* tmp_basename = NULL;
/* If they specified an absolute path, strip off the
"/bin/<exec_name>" and leave just the prefix */
param = opal_dirname(argv[0]);
/* Quick sanity check to ensure we got
something/bin/<exec_name> and that the installation
tree is at least more or less what we expect it to
be */
tmp_basename = opal_basename(param);
if (0 == strcmp("bin", tmp_basename)) {
char* tmp = param;
param = opal_dirname(tmp);
free(tmp);
} else {
free(param);
param = NULL;
}
free(tmp_basename);
}
/* --enable-orterun-prefix-default was given to orterun */
else {
param = strdup(opal_install_dirs.prefix);
}
if (NULL != param) {
/* "Parse" the param, aka remove superfluous path_sep. */
param_len = strlen(param);
while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) {
param[param_len-1] = '\0';
param_len--;
if (0 == param_len) {
opal_show_help("help-orterun.txt", "orterun:empty-prefix",
true, orterun_basename, orterun_basename);
return ORTE_ERR_FATAL;
}
}
app->prefix_dir = strdup(param);
}
}
/* Did the user specify a hostfile? This would have been converted
* to -rawhosts above
*/
if (opal_cmd_line_is_taken(&cmd_line, "rawhosts")) {
value = opal_cmd_line_get_param(&cmd_line, "rawhosts", 0, 0);
app->hostfile = strdup(value);
}
/* Did the user request any mappings? They were all converted to
--rawmap items, above. */
if (opal_cmd_line_is_taken(&cmd_line, "rawmap")) {
j = opal_cmd_line_get_ninsts(&cmd_line, "rawmap");
app->map_data = (orte_app_context_map_t**)malloc(sizeof(orte_app_context_map_t*) * j);
if (NULL == app->map_data) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
app->num_map = j;
for (i = 0; i < j; ++i) {
app->map_data[i] = NULL;
}
for (i = 0; i < j; ++i) {
value = opal_cmd_line_get_param(&cmd_line, "rawmap", i, 0);
value2 = opal_cmd_line_get_param(&cmd_line, "rawmap", i, 1);
app->map_data[i] = OBJ_NEW(orte_app_context_map_t);
if (NULL == app->map_data[i]) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
app->map_data[i]->map_type = value[0] - '0';
app->map_data[i]->map_data = strdup(value2);
/* map_data = true;
* JJH - This activates the C/N mapping stuff,
* or at least allows us to pass the 'num_procs' check below.
* since it is not implemented yet, leave commented. */
}
}
/* Get the numprocs */
app->num_procs = (orte_std_cntr_t)orterun_globals.num_procs;
/* If the user didn't specify the number of processes to run, then we
default to launching an app process using every slot. We can't do
anything about that here - we leave it to the RMAPS framework's
components to note this and deal with it later.
HOWEVER, we ONLY support this mode of operation if the number of
app_contexts is equal to ONE. If the user provides multiple applications,
we simply must have more information - in this case, generate an
error.
*/
if (app->num_procs == 0) {
have_zero_np = true; /** flag that we have a zero_np situation */
}
if (0 < total_num_apps && have_zero_np) {
/** we have more than one app and a zero_np - that's no good.
* note that we have to do this as a two step logic check since
* the user may fail to specify num_procs for the first app, but
* then give us another application.
*/
opal_show_help("help-orterun.txt", "orterun:multi-apps-and-zero-np",
true, orterun_basename, NULL);
return ORTE_ERR_FATAL;
}
total_num_apps++;
/* Preserve if we are to preload the binary */
app->preload_binary = orterun_globals.preload_binary;
if( NULL != orterun_globals.preload_files)
app->preload_files = strdup(orterun_globals.preload_files);
else
app->preload_files = NULL;
if( NULL != orterun_globals.preload_files_dest_dir)
app->preload_files_dest_dir = strdup(orterun_globals.preload_files_dest_dir);
else
app->preload_files_dest_dir = NULL;
/* Do not try to find argv[0] here -- the starter is responsible
for that because it may not be relevant to try to find it on
the node where orterun is executing. So just strdup() argv[0]
into app. */
app->app = strdup(app->argv[0]);
if (NULL == app->app) {
opal_show_help("help-orterun.txt", "orterun:call-failed",
true, orterun_basename, "library", "strdup returned NULL", errno);
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
*app_ptr = app;
app = NULL;
*made_app = true;
/* All done */
cleanup:
if (NULL != app) {
OBJ_RELEASE(app);
}
if (NULL != new_argv) {
opal_argv_free(new_argv);
}
if (cmd_line_made) {
OBJ_DESTRUCT(&cmd_line);
}
return rc;
}
/**
 * Read an application file and register one app context per usable line.
 *
 * Every line of the file is treated as an orterun-style command line.
 * Text following a '#' or a "//" is discarded as a comment, and lines that
 * contain nothing but whitespace are ignored.  Each remaining line is
 * split on spaces, handed to create_app(), and -- when create_app()
 * reports that it built an app -- appended to jdata->apps.
 *
 * @param filename  Path of the appfile to read.  It is released with
 *                  free() on the success path only; the error returns
 *                  leave it untouched.
 * @param env       Base environment.  A private copy is made for every
 *                  line so that per-app changes made by create_app() do
 *                  not accumulate in the caller's environment.
 *
 * @return ORTE_SUCCESS once the whole file has been processed,
 *         ORTE_ERR_NOT_FOUND if the file cannot be opened, or
 *         ORTE_ERR_OUT_OF_RESOURCE if the environment cannot be copied.
 *         If create_app() fails the process exits with status 1.
 */
static int parse_appfile(char *filename, char ***env)
{
    size_t i, len;
    FILE *fp;
    char line[BUFSIZ];
    int rc, argc, app_num;
    char **argv;
    orte_app_context_t *app;
    bool blank, made_app;
    char bogus[] = "bogus ";
    /* Number of characters in the fake argv[0] prefix (without the NUL);
       the user's text starts at this offset within line[] */
    const size_t prefix_len = sizeof(bogus) - 1;
    char **tmp_env;

    /*
     * Make sure to clear out this variable so we don't do anything odd in
     * app_create()
     */
    if (NULL != orterun_globals.appfile) {
        free(orterun_globals.appfile);
        orterun_globals.appfile = NULL;
    }

    /* Try to open the file */
    fp = fopen(filename, "r");
    if (NULL == fp) {
        opal_show_help("help-orterun.txt", "orterun:appfile-not-found", true,
                       filename);
        return ORTE_ERR_NOT_FOUND;
    }

    /* Read in line by line */
    line[sizeof(line) - 1] = '\0';
    app_num = 0;
    do {
        /* We need a bogus argv[0] (because when argv comes in from
           the command line, argv[0] is "orterun", so the parsing
           logic ignores it).  So place one at the front of the buffer
           and read the file's text in right behind it, rather than
           making an argv and then pre-pending a new argv[0] (which
           would be rather inefficient). */
        memcpy(line, bogus, sizeof(bogus));
        if (NULL == fgets(line + prefix_len,
                          sizeof(line) - sizeof(bogus) - 1, fp)) {
            break;
        }

        /* Remove a trailing newline */
        len = strlen(line);
        if (len > 0 && '\n' == line[len - 1]) {
            line[--len] = '\0';
        }

        /* Remove comments: cut the line at the first '#' or "//" */
        for (i = 0; i < len; ++i) {
            if ('#' == line[i]) {
                line[i] = '\0';
                break;
            } else if (i + 1 < len && '/' == line[i] && '/' == line[i + 1]) {
                line[i] = '\0';
                break;
            }
        }

        /* Is this a blank line?  Start right after the bogus prefix so
           that the first character supplied by the user is examined
           too; otherwise a one-character line would be mistaken for an
           empty one. */
        len = strlen(line);
        for (blank = true, i = prefix_len; i < len; ++i) {
            /* <ctype.h> functions require an unsigned char value */
            if (!isspace((unsigned char) line[i])) {
                blank = false;
                break;
            }
        }
        if (blank) {
            continue;
        }

        /* We got a line with *something* on it.  So process it */
        argv = opal_argv_split(line, ' ');
        argc = opal_argv_count(argv);
        if (argc > 0) {
            /* Create a temporary env to use in the recursive call --
               that is: don't disturb the original env so that we can
               have a consistent global env.  This allows for the
               case:

                   orterun --mca foo bar --appfile file

               where the "file" contains multiple apps.  In this case,
               each app in "file" will get *only* foo=bar as the base
               environment from which its specific environment is
               constructed. */
            if (NULL != *env) {
                tmp_env = opal_argv_copy(*env);
                if (NULL == tmp_env) {
                    /* Nothing has been handed to create_app() yet, so
                       the split argv and the open file still belong to
                       us -- release them before bailing out. */
                    opal_argv_free(argv);
                    fclose(fp);
                    return ORTE_ERR_OUT_OF_RESOURCE;
                }
            } else {
                tmp_env = NULL;
            }

            /* NOTE(review): argv is not released after this call; whether
               create_app() keeps references into it is not visible from
               this function -- confirm before adding a free here. */
            rc = create_app(argc, argv, &app, &made_app, &tmp_env);
            if (ORTE_SUCCESS != rc) {
                /* Assume that the error message has already been
                   printed; no need to cleanup -- we can just exit */
                exit(1);
            }
            if (NULL != tmp_env) {
                opal_argv_free(tmp_env);
            }
            if (made_app) {
                app->idx = app_num;
                ++app_num;
                opal_pointer_array_add(jdata->apps, app);
                ++jdata->num_apps;
            }
        }
    } while (!feof(fp));
    fclose(fp);

    /* All done */
    free(filename);
    return ORTE_SUCCESS;
}