2005-03-14 20:57:21 +00:00
/* -*- C -*-
*
2007-03-16 23:11:45 +00:00
* Copyright ( c ) 2004 - 2007 The Trustees of Indiana University and Indiana
2005-11-05 19:57:48 +00:00
* University Research and Technology
* Corporation . All rights reserved .
2008-02-28 05:32:23 +00:00
* Copyright ( c ) 2004 - 2008 The University of Tennessee and The University
2005-11-05 19:57:48 +00:00
* of Tennessee Research Foundation . All rights
* reserved .
2005-09-20 17:09:11 +00:00
* Copyright ( c ) 2004 - 2005 High Performance Computing Center Stuttgart ,
2005-03-14 20:57:21 +00:00
* University of Stuttgart . All rights reserved .
2005-03-24 12:43:37 +00:00
* Copyright ( c ) 2004 - 2005 The Regents of the University of California .
* All rights reserved .
2007-01-08 20:25:26 +00:00
* Copyright ( c ) 2006 - 2007 Cisco Systems , Inc . All rights reserved .
2009-01-30 18:50:10 +00:00
* Copyright ( c ) 2007 - 2009 Sun Microsystems , Inc . All rights reserved .
2007-06-05 03:03:59 +00:00
* Copyright ( c ) 2007 Los Alamos National Security , LLC . All rights
* reserved .
2005-03-14 20:57:21 +00:00
* $ COPYRIGHT $
2005-09-20 17:09:11 +00:00
*
2005-03-14 20:57:21 +00:00
* Additional copyrights may follow
2005-09-20 17:09:11 +00:00
*
2005-03-14 20:57:21 +00:00
* $ HEADER $
*/
# include "orte_config.h"
2008-02-28 01:57:57 +00:00
# include "orte/constants.h"
2007-07-19 19:00:06 +00:00
2009-03-13 02:10:32 +00:00
# ifdef HAVE_STRING_H
# include <string.h>
# endif
2005-03-14 20:57:21 +00:00
# include <stdio.h>
# ifdef HAVE_UNISTD_H
# include <unistd.h>
# endif
# ifdef HAVE_SYS_PARAM_H
# include <sys/param.h>
# endif
# include <errno.h>
# include <signal.h>
# include <ctype.h>
2005-12-17 22:05:10 +00:00
# ifdef HAVE_SYS_TYPES_H
2005-04-01 00:30:37 +00:00
# include <sys/types.h>
2005-12-17 22:05:10 +00:00
# endif /* HAVE_SYS_TYPES_H */
# ifdef HAVE_SYS_WAIT_H
2005-04-01 00:30:37 +00:00
# include <sys/wait.h>
2005-12-17 22:05:10 +00:00
# endif /* HAVE_SYS_WAIT_H */
2007-01-25 14:17:44 +00:00
# ifdef HAVE_SYS_TIME_H
# include <sys/time.h>
2007-04-01 16:16:54 +00:00
# endif /* HAVE_SYS_TIME_H */
2005-03-14 20:57:21 +00:00
2005-07-03 23:09:55 +00:00
# include "opal/event/event.h"
2007-04-21 00:15:05 +00:00
# include "opal/mca/installdirs/installdirs.h"
2005-09-19 17:20:01 +00:00
# include "opal/mca/base/base.h"
2009-08-11 02:51:27 +00:00
# include "opal/mca/paffinity/base/base.h"
2005-07-04 00:13:44 +00:00
# include "opal/util/argv.h"
2009-02-14 02:26:12 +00:00
# include "opal/util/output.h"
2005-09-19 17:20:01 +00:00
# include "opal/util/basename.h"
2005-07-04 00:13:44 +00:00
# include "opal/util/cmd_line.h"
2005-09-19 17:20:01 +00:00
# include "opal/util/opal_environ.h"
2008-02-28 01:57:57 +00:00
# include "opal/util/opal_getcwd.h"
2008-06-09 14:53:58 +00:00
# include "orte/util/show_help.h"
2008-03-06 21:36:32 +00:00
# include "opal/sys/atomic.h"
2007-03-16 23:11:45 +00:00
# if OPAL_ENABLE_FT == 1
# include "opal/runtime/opal_cr.h"
# endif
2006-06-09 17:21:23 +00:00
# include "opal/version.h"
2007-04-21 00:15:05 +00:00
# include "opal/runtime/opal.h"
2007-07-19 19:00:06 +00:00
# include "opal/util/os_path.h"
2009-01-25 12:39:24 +00:00
# include "opal/util/path.h"
2008-02-28 05:32:23 +00:00
# include "opal/class/opal_pointer_array.h"
2008-02-28 01:57:57 +00:00
# include "opal/dss/dss.h"
2008-02-28 05:32:23 +00:00
2005-09-19 17:20:01 +00:00
# include "orte/util/proc_info.h"
2006-09-14 15:27:17 +00:00
# include "orte/util/pre_condition_transports.h"
2008-02-28 01:57:57 +00:00
# include "orte/util/session_dir.h"
2008-12-10 17:10:39 +00:00
# include "orte/util/hnp_contact.h"
2005-03-14 20:57:21 +00:00
2008-06-09 13:08:54 +00:00
# include "orte/mca/odls/odls.h"
2008-02-28 01:57:57 +00:00
# include "orte/mca/plm/plm.h"
2007-07-12 19:53:18 +00:00
# include "orte/mca/rml/rml.h"
2009-02-14 02:26:12 +00:00
# include "orte/mca/rml/rml_types.h"
2008-04-16 14:27:42 +00:00
# include "orte/mca/rml/base/rml_contact.h"
2005-09-19 17:20:01 +00:00
# include "orte/mca/errmgr/errmgr.h"
2009-05-11 14:11:44 +00:00
# include "orte/mca/grpcomm/grpcomm.h"
2005-03-14 20:57:21 +00:00
2005-09-19 17:20:01 +00:00
# include "orte/runtime/runtime.h"
2008-02-28 01:57:57 +00:00
# include "orte/runtime/orte_globals.h"
2005-09-19 17:20:01 +00:00
# include "orte/runtime/orte_wait.h"
2008-02-28 01:57:57 +00:00
# include "orte/runtime/orte_data_server.h"
Afraid this has a couple of things mixed into the commit. Couldn't be helped - had missed one commit prior to running out the door on vacation.
Fix race conditions in abnormal terminations. We had done a first-cut at this in a prior commit. However, the window remained partially open due to the fact that the HNP has multiple paths leading to orte_finalize. Most of our frameworks don't care if they are finalized more than once, but one of them does, which meant we segfaulted if orte_finalize got called more than once. Besides, we really shouldn't be doing that anyway.
So we now introduce a set of atomic locks that prevent us from multiply calling abort, attempting to call orte_finalize, etc. My initial tests indicate this is working cleanly, but since it is a race condition issue, more testing will have to be done before we know for sure that this problem has been licked.
Also, some updates relevant to the tool comm library snuck in here. Since those also touched the orted code (as did the prior changes), I didn't want to attempt to separate them out - besides, they are coming in soon anyway. More on them later as that functionality approaches completion.
This commit was SVN r17843.
2008-03-17 17:58:59 +00:00
# include "orte/runtime/orte_locks.h"
2005-03-14 20:57:21 +00:00
2007-07-12 19:53:18 +00:00
/* ensure I can behave like a daemon */
# include "orte/orted/orted.h"
2008-06-18 15:28:46 +00:00
# include "debuggers.h"
2005-08-31 16:15:59 +00:00
# include "orterun.h"
2005-03-14 20:57:21 +00:00
/*
* Globals
*
* Every object in this group is declared static, so it is private to
* this source file. Objects without an explicit initializer start out
* zeroed (NULL for pointers), as C requires for static storage.
*/
2005-07-03 23:09:55 +00:00
/* Event objects; NOTE(review): names suggest they back the SIGTERM and
* SIGINT handlers - confirm against the code that registers them. */
static struct opal_event term_handler ;
static struct opal_event int_handler ;
2006-07-11 05:24:08 +00:00
# ifndef __WINDOWS__
2006-06-08 18:27:17 +00:00
/* The next four event objects are compiled only when __WINDOWS__ is
* not defined. */
static struct opal_event sigusr1_handler ;
static struct opal_event sigusr2_handler ;
2009-01-30 18:50:10 +00:00
static struct opal_event sigtstp_handler ;
static struct opal_event sigcont_handler ;
2006-07-11 05:24:08 +00:00
# endif /* __WINDOWS__ */
2008-02-28 01:57:57 +00:00
/* Job object pointer; no initializer, so it starts out NULL. */
static orte_job_t * jdata ;
2005-04-12 16:01:30 +00:00
static char * orterun_basename = NULL ;
2005-04-15 21:52:58 +00:00
/* Integer tallies, all starting at zero. NOTE(review): presumably counts
* of processes that aborted, were killed, or failed to start - confirm
* where they are updated. */
static int num_aborted = 0 ;
static int num_killed = 0 ;
2008-02-28 01:57:57 +00:00
static int num_failed_start = 0 ;
2005-08-08 16:42:28 +00:00
/* NULL-initialized pointer to an array of strings. */
static char * * global_mca_env = NULL ;
2006-07-10 21:25:33 +00:00
static bool have_zero_np = false ;
2006-08-15 19:54:10 +00:00
static orte_std_cntr_t total_num_apps = 0 ;
2006-09-15 02:52:08 +00:00
/* Initial value is the ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT macro cast
* to bool. */
static bool want_prefix_by_default = ( bool ) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT ;
2008-02-28 01:57:57 +00:00
/* Two event pointers declared without initializers (start out NULL). */
static opal_event_t * orterun_event , * orteds_exit_event ;
static char * ompi_server = NULL ;
2008-06-02 21:46:34 +00:00
static opal_event_t * abort_exit_event = NULL ;
2008-06-09 13:08:54 +00:00
static bool forcibly_die = false ;
2008-06-10 17:53:28 +00:00
static opal_event_t * timeout_ev = NULL ;
2008-12-09 23:49:02 +00:00
static bool profile_is_set = false ;
2009-02-25 03:10:21 +00:00
static bool signals_set = false ;
2008-06-10 17:53:28 +00:00
2005-03-14 20:57:21 +00:00
/*
2007-07-10 12:53:48 +00:00
* Globals
*
* orterun_globals is the one object here that is not static, so it has
* external linkage. The cmd_line_init table takes the address of several
* of its fields (help, version, verbose, ...).
2005-03-14 20:57:21 +00:00
*/
2008-03-06 19:35:57 +00:00
struct orterun_globals_t orterun_globals ;
/* File-private flag, initially false. NOTE(review): presumably records
* whether orterun_globals has been initialized - confirm at its use. */
static bool globals_init = false ;
2005-03-14 20:57:21 +00:00
2008-03-06 19:35:57 +00:00
static opal_cmd_line_init_t cmd_line_init [ ] = {
2005-03-14 20:57:21 +00:00
/* Various "obvious" options */
2005-09-04 20:54:19 +00:00
{ NULL , NULL , NULL , ' h ' , NULL , " help " , 0 ,
2005-07-04 00:13:44 +00:00
& orterun_globals . help , OPAL_CMD_LINE_TYPE_BOOL ,
2005-03-14 20:57:21 +00:00
" This help message " } ,
2006-06-09 17:21:23 +00:00
{ NULL , NULL , NULL , ' V ' , NULL , " version " , 0 ,
& orterun_globals . version , OPAL_CMD_LINE_TYPE_BOOL ,
" Print version and exit " } ,
2005-03-14 20:57:21 +00:00
{ NULL , NULL , NULL , ' v ' , NULL , " verbose " , 0 ,
2005-07-04 00:13:44 +00:00
& orterun_globals . verbose , OPAL_CMD_LINE_TYPE_BOOL ,
2005-03-14 20:57:21 +00:00
" Be verbose " } ,
2006-06-26 18:21:45 +00:00
{ NULL , NULL , NULL , ' q ' , NULL , " quiet " , 0 ,
& orterun_globals . quiet , OPAL_CMD_LINE_TYPE_BOOL ,
" Suppress helpful messages " } ,
2008-12-24 15:27:46 +00:00
{ NULL , NULL , NULL , ' \0 ' , " report-pid " , " report-pid " , 1 ,
& orterun_globals . report_pid , OPAL_CMD_LINE_TYPE_STRING ,
" Printout pid on stdout [-], stderr [+], or a file [anything else] " } ,
{ NULL , NULL , NULL , ' \0 ' , " report-uri " , " report-uri " , 1 ,
& orterun_globals . report_uri , OPAL_CMD_LINE_TYPE_STRING ,
" Printout URI on stdout [-], stderr [+], or a file [anything else] " } ,
2008-06-24 17:50:56 +00:00
/* hetero apps */
{ " orte " , " hetero " , " apps " , ' \0 ' , NULL , " hetero " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Indicates that multiple app_contexts are being provided that are a mix of 32/64 bit binaries " } ,
2008-05-29 14:11:31 +00:00
/* select XML output */
2008-08-14 18:59:01 +00:00
{ " orte " , " xml " , " output " , ' \0 ' , " xml " , " xml " , 0 ,
2008-06-04 20:53:12 +00:00
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
2008-05-29 14:11:31 +00:00
" Provide all output in XML format " } ,
2009-09-02 18:03:10 +00:00
{ " orte " , " xml " , " file " , ' \0 ' , " xml-file " , " xml-file " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Provide all output in XML format to the specified file " } ,
Roll in the revamped IOF subsystem. Per the devel mailing list email, this is a complete rewrite of the iof framework designed to simplify the code for maintainability, and to support features we had planned to do, but were too difficult to implement in the old code. Specifically, the new code:
1. completely and cleanly separates responsibilities between the HNP, orted, and tool components.
2. removes all wireup messaging during launch and shutdown.
3. maintains flow control for stdin to avoid large-scale consumption of memory by orteds when large input files are forwarded. This is done using an xon/xoff protocol.
4. enables specification of stdin recipients on the mpirun cmd line. Allowed options include rank, "all", or "none". Default is rank 0.
5. creates a new MPI_Info key "ompi_stdin_target" that supports the above options for child jobs. Default is "none".
6. adds a new tool "orte-iof" that can connect to a running mpirun and display the output. Cmd line options allow selection of any combination of stdout, stderr, and stddiag. Default is stdout.
7. adds a new mpirun and orte-iof cmd line option "tag-output" that will tag each line of output with process name and stream ident. For example, "[1,0]<stdout>this is output"
This is not intended for the 1.3 release as it is a major change requiring considerable soak time.
This commit was SVN r19767.
2008-10-18 00:00:49 +00:00
/* tag output */
{ " orte " , " tag " , " output " , ' \0 ' , " tag-output " , " tag-output " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Tag all output with [job,rank] " } ,
2009-01-30 22:47:30 +00:00
{ " orte " , " timestamp " , " output " , ' \0 ' , " timestamp-output " , " timestamp-output " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Timestamp all application process output " } ,
{ " orte " , " output " , " filename " , ' \0 ' , " output-filename " , " output-filename " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Redirect output from application processes into filename.rank " } ,
{ " orte " , " xterm " , NULL , ' \0 ' , " xterm " , " xterm " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Create a new xterm window and display output from the specified ranks there " } ,
Roll in the revamped IOF subsystem. Per the devel mailing list email, this is a complete rewrite of the iof framework designed to simplify the code for maintainability, and to support features we had planned to do, but were too difficult to implement in the old code. Specifically, the new code:
1. completely and cleanly separates responsibilities between the HNP, orted, and tool components.
2. removes all wireup messaging during launch and shutdown.
3. maintains flow control for stdin to avoid large-scale consumption of memory by orteds when large input files are forwarded. This is done using an xon/xoff protocol.
4. enables specification of stdin recipients on the mpirun cmd line. Allowed options include rank, "all", or "none". Default is rank 0.
5. creates a new MPI_Info key "ompi_stdin_target" that supports the above options for child jobs. Default is "none".
6. adds a new tool "orte-iof" that can connect to a running mpirun and display the output. Cmd line options allow selection of any combination of stdout, stderr, and stddiag. Default is stdout.
7. adds a new mpirun and orte-iof cmd line option "tag-output" that will tag each line of output with process name and stream ident. For example, "[1,0]<stdout>this is output"
This is not intended for the 1.3 release as it is a major change requiring considerable soak time.
This commit was SVN r19767.
2008-10-18 00:00:49 +00:00
/* select stdin option */
{ NULL , NULL , NULL , ' \0 ' , " stdin " , " stdin " , 1 ,
& orterun_globals . stdin_target , OPAL_CMD_LINE_TYPE_STRING ,
" Specify procs to receive stdin [rank, all, none] (default: 0, indicating rank 0) " } ,
Per the July technical meeting:
Standardize the handling of the orte launch agent option across PLMs. This has been a consistent complaint I have received - each PLM would register its own MCA param to get input on the launch agent for remote nodes (in fact, one or two didn't, but most did). This would then get handled in various and contradictory ways.
Some PLMs would accept only a one-word input. Others accepted multi-word args such as "valgrind orted", but then some would error by putting any prefix specified on the cmd line in front of the incorrect argument.
For example, while using the rsh launcher, if you specified "valgrind orted" as your launch agent and had "--prefix foo" on your cmd line, you would attempt to execute "ssh foo/valgrind orted" - which obviously wouldn't work.
This was all -very- confusing to users, who had to know which PLM was being used so they could even set the right mca param in the first place! And since we don't warn about non-recognized or non-used mca params, half of the time they would wind up not doing what they thought they were telling us to do.
To solve this problem, we did the following:
1. removed all mca params from the individual plms for the launch agent
2. added a new mca param "orte_launch_agent" for this purpose. To further simplify for users, this comes with a new cmd line option "--launch-agent" that can take a multi-word string argument. The value of the param defaults to "orted".
3. added a PLM base function that processes the orte_launch_agent value and adds the contents to a provided argv array. This can subsequently be harvested at-will to handle multi-word values
4. modified the PLMs to use this new function. All the PLMs except for the rsh PLM required very minor change - just called the function and moved on. The rsh PLM required much larger changes as - because of the rsh/ssh cmd line limitations - we had to correctly prepend any provided prefix to the correct argv entry.
5. added a new opal_argv_join_range function that allows the caller to "join" argv entries between two specified indices
Please let me know of any problems. I tried to make this as clean as possible, but cannot compile all PLMs to ensure all is correct.
This commit was SVN r19097.
2008-07-30 18:26:24 +00:00
/* Specify the launch agent to be used */
2008-08-14 18:59:01 +00:00
{ " orte " , " launch " , " agent " , ' \0 ' , " launch-agent " , " launch-agent " , 1 ,
Per the July technical meeting:
Standardize the handling of the orte launch agent option across PLMs. This has been a consistent complaint I have received - each PLM would register its own MCA param to get input on the launch agent for remote nodes (in fact, one or two didn't, but most did). This would then get handled in various and contradictory ways.
Some PLMs would accept only a one-word input. Others accepted multi-word args such as "valgrind orted", but then some would error by putting any prefix specified on the cmd line in front of the incorrect argument.
For example, while using the rsh launcher, if you specified "valgrind orted" as your launch agent and had "--prefix foo" on your cmd line, you would attempt to execute "ssh foo/valgrind orted" - which obviously wouldn't work.
This was all -very- confusing to users, who had to know which PLM was being used so they could even set the right mca param in the first place! And since we don't warn about non-recognized or non-used mca params, half of the time they would wind up not doing what they thought they were telling us to do.
To solve this problem, we did the following:
1. removed all mca params from the individual plms for the launch agent
2. added a new mca param "orte_launch_agent" for this purpose. To further simplify for users, this comes with a new cmd line option "--launch-agent" that can take a multi-word string argument. The value of the param defaults to "orted".
3. added a PLM base function that processes the orte_launch_agent value and adds the contents to a provided argv array. This can subsequently be harvested at-will to handle multi-word values
4. modified the PLMs to use this new function. All the PLMs except for the rsh PLM required very minor change - just called the function and moved on. The rsh PLM required much larger changes as - because of the rsh/ssh cmd line limitations - we had to correctly prepend any provided prefix to the correct argv entry.
5. added a new opal_argv_join_range function that allows the caller to "join" argv entries between two specified indices
Please let me know of any problems. I tried to make this as clean as possible, but cannot compile all PLMs to ensure all is correct.
This commit was SVN r19097.
2008-07-30 18:26:24 +00:00
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Command used to start processes on remote nodes (default: orted) " } ,
2007-03-16 23:11:45 +00:00
/* Preload the binary on the remote machine */
{ NULL , NULL , NULL , ' s ' , NULL , " preload-binary " , 0 ,
& orterun_globals . preload_binary , OPAL_CMD_LINE_TYPE_BOOL ,
" Preload the binary on the remote machine before starting the remote process. " } ,
/* Preload files on the remote machine */
{ NULL , NULL , NULL , ' \0 ' , NULL , " preload-files " , 1 ,
& orterun_globals . preload_files , OPAL_CMD_LINE_TYPE_STRING ,
" Preload the comma separated list of files to the remote machines current working directory before starting the remote process. " } ,
/* Where to Preload files on the remote machine */
{ NULL , NULL , NULL , ' \0 ' , NULL , " preload-files-dest-dir " , 1 ,
& orterun_globals . preload_files_dest_dir , OPAL_CMD_LINE_TYPE_STRING ,
" The destination directory to use in conjunction with --preload-files. By default the absolute and relative paths provided by --preload-files are used. " } ,
2005-03-14 20:57:21 +00:00
/* Use an appfile */
{ NULL , NULL , NULL , ' \0 ' , NULL , " app " , 1 ,
2005-07-04 00:13:44 +00:00
& orterun_globals . appfile , OPAL_CMD_LINE_TYPE_STRING ,
2005-03-14 20:57:21 +00:00
" Provide an appfile; ignore all other command line options " } ,
/* Number of processes; -c, -n, --n, -np, and --np are all
synonyms */
{ NULL , NULL , NULL , ' c ' , " np " , " np " , 1 ,
2006-09-25 19:41:54 +00:00
& orterun_globals . num_procs , OPAL_CMD_LINE_TYPE_INT ,
2005-03-14 20:57:21 +00:00
" Number of processes to run " } ,
{ NULL , NULL , NULL , ' \0 ' , " n " , " n " , 1 ,
2006-09-25 19:41:54 +00:00
& orterun_globals . num_procs , OPAL_CMD_LINE_TYPE_INT ,
2005-03-14 20:57:21 +00:00
" Number of processes to run " } ,
2006-07-10 21:25:33 +00:00
2005-03-14 20:57:21 +00:00
/* Set a hostfile */
2008-02-28 01:57:57 +00:00
{ NULL , NULL , NULL , ' \0 ' , " hostfile " , " hostfile " , 1 ,
2005-07-04 00:13:44 +00:00
NULL , OPAL_CMD_LINE_TYPE_STRING ,
2005-03-18 23:40:08 +00:00
" Provide a hostfile " } ,
2008-02-28 01:57:57 +00:00
{ NULL , NULL , NULL , ' \0 ' , " machinefile " , " machinefile " , 1 ,
2005-07-04 00:13:44 +00:00
NULL , OPAL_CMD_LINE_TYPE_STRING ,
2005-03-14 20:57:21 +00:00
" Provide a hostfile " } ,
2008-04-17 13:50:59 +00:00
{ " orte " , " default " , " hostfile " , ' \0 ' , " default-hostfile " , " default-hostfile " , 1 ,
2008-03-05 04:54:57 +00:00
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Provide a default hostfile " } ,
2008-04-17 13:50:59 +00:00
{ " opal " , " if " , " do_not_resolve " , ' \0 ' , " do-not-resolve " , " do-not-resolve " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Do not attempt to resolve interfaces " } ,
2008-03-05 04:54:57 +00:00
2008-02-28 01:57:57 +00:00
/* uri of Open MPI server, or at least where to get it */
{ NULL , NULL , NULL , ' \0 ' , " ompi-server " , " ompi-server " , 1 ,
& orterun_globals . ompi_server , OPAL_CMD_LINE_TYPE_STRING ,
2008-04-04 19:17:28 +00:00
" Specify the URI of the Open MPI server, or the name of the file (specified as file:filename) that contains that info " } ,
Per the July technical meeting:
During the discussion of MPI-2 functionality, it was pointed out by Aurelien that there was an inherent race condition between startup of ompi-server and mpirun. Specifically, if someone started ompi-server to run in the background as part of a script, and then immediately executed mpirun, it was possible that an MPI proc could attempt to contact the server (or that mpirun could try to read the server's contact file before the server is running and ready).
At that time, we discussed creating a new tool "ompi-wait-server" that would wait for the server to be running, and/or probe to see if it is running and return true/false. However, rather than create yet another tool, it seemed just as effective to add the functionality to mpirun.
Thus, this commit creates two new mpirun cmd line flags (hey, you can never have too many!):
--wait-for-server : instructs mpirun to ping the server to see if it responds. This causes mpirun to execute an rml.ping to the server's URI with an appropriate timeout interval - if the ping isn't successful, mpirun attempts it again.
--server-wait-time xx : sets the ping timeout interval to xx seconds. Note that mpirun will attempt to ping the server twice with this timeout, so we actually wait for twice this time. Default is 10 seconds, which should be plenty of time.
This has only lightly been tested. It works if the server is present, and outputs a nice error message if it cannot be contacted. I have not tested the race condition case.
This commit was SVN r19152.
2008-08-04 20:29:50 +00:00
{ NULL , NULL , NULL , ' \0 ' , " wait-for-server " , " wait-for-server " , 0 ,
& orterun_globals . wait_for_server , OPAL_CMD_LINE_TYPE_BOOL ,
" If ompi-server is not already running, wait until it is detected (default: false) " } ,
{ NULL , NULL , NULL , ' \0 ' , " server-wait-time " , " server-wait-time " , 1 ,
& orterun_globals . server_wait_timeout , OPAL_CMD_LINE_TYPE_INT ,
" Time in seconds to wait for ompi-server (default: 10 sec) " } ,
2008-02-28 01:57:57 +00:00
2008-01-23 09:20:34 +00:00
{ " carto " , " file " , " path " , ' \0 ' , " cf " , " cartofile " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Provide a cartography file " } ,
2008-02-28 01:57:57 +00:00
2009-08-13 16:08:43 +00:00
{ " orte " , " rankfile " , NULL , ' \0 ' , " rf " , " rankfile " , 1 ,
2008-07-07 13:46:22 +00:00
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Provide a rankfile file " } ,
2005-03-14 20:57:21 +00:00
/* Export environment variables; potentially used multiple times,
so it does not make sense to set into a variable */
{ NULL , NULL , NULL , ' x ' , NULL , NULL , 1 ,
2005-07-04 00:13:44 +00:00
NULL , OPAL_CMD_LINE_TYPE_NULL ,
2005-03-14 20:57:21 +00:00
" Export an environment variable, optionally specifying a value (e.g., \" -x foo \" exports the environment variable foo and takes its value from the current environment; \" -x foo=bar \" exports the environment variable name foo and sets its value to \" bar \" in the started processes) " } ,
2008-05-29 14:11:31 +00:00
/* Mapping options */
(copied from a mail that has a lengthy description of this commit)
I spoke with Tim about this the other day -- he gave me the green
light to go ahead with this, but it turned into a bigger job than I
thought it would be. I revamped how the default RAS scheduling and
round_robin RMAPS mapping occurs. The previous algorithms were pretty
brain dead, and ignored the "slots" and "max_slots" tokens in
hostfiles. I considered this a big enough problem to fix it for the
beta (because there is currently no way to control where processes are
launched on SMPs).
There's still some more bells and whistles that I'd like to implement,
but there's no hurry, and they can go on the trunk at any time. My
patches below are for what I considered "essential", and do the
following:
- honor the "slots" and "max-slots" tokens in the hostfile (and all
their synonyms), meaning that we allocate/map until we fill slots,
and if there are still more processes to allocate/map, we keep going
until we fill max-slots (i.e., only oversubscribe a node if we have
to).
- offer two different algorithms, currently supported by two new
options to orterun. Remember that there are two parts here -- slot
allocation and process mapping. Slot allocation controls how many
processes we'll be running on a node. After that decision has been
made, process mapping effectively controls where the ranks of
MPI_COMM_WORLD (MCW) are placed. Some of the examples given below
don't make sense unless you remember that there is a difference
between the two (which makes total sense, but you have to think
about it in terms of both things):
1. "-bynode": allocates/maps one process per node in a round-robin
fashion until all slots on the node are taken. If we still have more
processes after all slots are taken, then keep going until all
max-slots are taken. Examples:
- The hostfile:
eddie slots=2 max-slots=4
vogon slots=4 max-slots=8
- orterun -bynode -np 6 -hostfile hostfile a.out
eddie: MCW ranks 0, 2
vogon: MCW ranks 1, 3, 4, 5
- orterun -bynode -np 8 -hostfile hostfile a.out
eddie: MCW ranks 0, 2, 4
vogon: MCW ranks 1, 3, 5, 6, 7
-> the algorithm oversubscribes all nodes "equally" (until each
node's max_slots is hit, of course)
- orterun -bynode -np 12 -hostfile hostfile a.out
eddie: MCW ranks 0, 2, 4, 6
vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11
2. "-byslot" (this is the default if you don't specify -bynode):
greedily takes all available slots on a node for a job before moving
on to the next node. If we still have processes to allocate/schedule,
then oversubscribe all nodes equally (i.e., go round robin on all
nodes until each node's max_slots is hit). Examples:
- The hostfile
eddie slots=2 max-slots=4
vogon slots=4 max-slots=8
- orterun -np 6 -hostfile hostfile a.out
eddie: MCW ranks 0, 1
vogon: MCW ranks 2, 3, 4, 5
- orterun -np 8 -hostfile hostfile a.out
eddie: MCW ranks 0, 1, 2
vogon: MCW ranks 3, 4, 5, 6, 7
-> the algorithm oversubscribes all nodes "equally" (until max_slots
is hit)
- orterun -np 12 -hostfile hostfile a.out
eddie: MCW ranks 0, 1, 2, 3
vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11
The above examples are fairly contrived, and it's not clear from them
that you can get different allocation answers in all cases (the
mapping differences are obvious). Consider the following allocation
example:
- The hostfile
eddie count=4
vogon count=4
earth count=4
deep-thought count=4
- orterun -np 8 -hostfile hostfile a.out
eddie: 4 slots will be allocated
vogon: 4 slots will be allocated
earth: no slots allocated
deep-thought: no slots allocated
- orterun -bynode -np 8 -hostfile hostfile a.out
eddie: 2 slots will be allocated
vogon: 2 slots will be allocated
earth: 2 slots will be allocated
deep-thought: 2 slots will be allocated
This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
{ NULL , NULL , NULL , ' \0 ' , " bynode " , " bynode " , 0 ,
2005-07-04 00:13:44 +00:00
& orterun_globals . by_node , OPAL_CMD_LINE_TYPE_BOOL ,
2009-08-11 02:51:27 +00:00
" Whether to assign processes round-robin by node " } ,
(copied from a mail that has a lengthy description of this commit)
I spoke with Tim about this the other day -- he gave me the green
light to go ahead with this, but it turned into a bigger job than I
thought it would be. I revamped how the default RAS scheduling and
round_robin RMAPS mapping occurs. The previous algorithms were pretty
brain dead, and ignored the "slots" and "max_slots" tokens in
hostfiles. I considered this a big enough problem to fix it for the
beta (because there is currently no way to control where processes are
launched on SMPs).
There's still some more bells and whistles that I'd like to implement,
but there's no hurry, and they can go on the trunk at any time. My
patches below are for what I considered "essential", and do the
following:
- honor the "slots" and "max-slots" tokens in the hostfile (and all
their synonyms), meaning that we allocate/map until we fill slots,
and if there are still more processes to allocate/map, we keep going
until we fill max-slots (i.e., only oversubscribe a node if we have
to).
- offer two different algorithms, currently supported by two new
options to orterun. Remember that there are two parts here -- slot
allocation and process mapping. Slot allocation controls how many
processes we'll be running on a node. After that decision has been
made, process mapping effectively controls where the ranks of
MPI_COMM_WORLD (MCW) are placed. Some of the examples given below
don't make sense unless you remember that there is a difference
between the two (which makes total sense, but you have to think
about it in terms of both things):
1. "-bynode": allocates/maps one process per node in a round-robin
fashion until all slots on the node are taken. If we still have more
processes after all slots are taken, then keep going until all
max-slots are taken. Examples:
- The hostfile:
eddie slots=2 max-slots=4
vogon slots=4 max-slots=8
- orterun -bynode -np 6 -hostfile hostfile a.out
eddie: MCW ranks 0, 2
vogon: MCW ranks 1, 3, 4, 5
- orterun -bynode -np 8 -hostfile hostfile a.out
eddie: MCW ranks 0, 2, 4
vogon: MCW ranks 1, 3, 5, 6, 7
-> the algorithm oversubscribes all nodes "equally" (until each
node's max_slots is hit, of course)
- orterun -bynode -np 12 -hostfile hostfile a.out
eddie: MCW ranks 0, 2, 4, 6
vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11
2. "-byslot" (this is the default if you don't specify -bynode):
greedily takes all available slots on a node for a job before moving
on to the next node. If we still have processes to allocate/schedule,
then oversubscribe all nodes equally (i.e., go round robin on all
nodes until each node's max_slots is hit). Examples:
- The hostfile
eddie slots=2 max-slots=4
vogon slots=4 max-slots=8
- orterun -np 6 -hostfile hostfile a.out
eddie: MCW ranks 0, 1
vogon: MCW ranks 2, 3, 4, 5
- orterun -np 8 -hostfile hostfile a.out
eddie: MCW ranks 0, 1, 2
vogon: MCW ranks 3, 4, 5, 6, 7
-> the algorithm oversubscribes all nodes "equally" (until max_slots
is hit)
- orterun -np 12 -hostfile hostfile a.out
eddie: MCW ranks 0, 1, 2, 3
vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11
The above examples are fairly contrived, and it's not clear from them
that you can get different allocation answers in all cases (the
mapping differences are obvious). Consider the following allocation
example:
- The hostfile
eddie count=4
vogon count=4
earth count=4
deep-thought count=4
- orterun -np 8 -hostfile hostfile a.out
eddie: 4 slots will be allocated
vogon: 4 slots will be allocated
earth: no slots allocated
deep-thought: no slots allocated
- orterun -bynode -np 8 -hostfile hostfile a.out
eddie: 2 slots will be allocated
vogon: 2 slots will be allocated
earth: 2 slots will be allocated
deep-thought: 2 slots will be allocated
This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
{ NULL , NULL , NULL , ' \0 ' , " byslot " , " byslot " , 0 ,
2005-07-04 00:13:44 +00:00
& orterun_globals . by_slot , OPAL_CMD_LINE_TYPE_BOOL ,
2009-08-11 02:51:27 +00:00
" Whether to assign processes round-robin by slot (the default) " } ,
2009-09-22 18:44:53 +00:00
{ NULL , NULL , NULL , ' \0 ' , " bycore " , " bycore " , 0 ,
& orterun_globals . by_slot , OPAL_CMD_LINE_TYPE_BOOL ,
" Alias for byslot " } ,
2009-08-11 02:51:27 +00:00
{ NULL , NULL , NULL , ' \0 ' , " bysocket " , " bysocket " , 0 ,
& orterun_globals . by_socket , OPAL_CMD_LINE_TYPE_BOOL ,
" Whether to assign processes round-robin by socket " } ,
{ NULL , NULL , NULL , ' \0 ' , " byboard " , " byboard " , 0 ,
& orterun_globals . by_slot , OPAL_CMD_LINE_TYPE_BOOL ,
" Whether to assign processes round-robin by board (equivalent to bynode if only 1 board/node) " } ,
2007-01-17 14:56:22 +00:00
{ " rmaps " , " base " , " pernode " , ' \0 ' , " pernode " , " pernode " , 0 ,
2006-12-13 04:51:38 +00:00
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
2006-12-12 00:54:05 +00:00
" Launch one process per available node on the specified number of nodes [no -np => use all allocated nodes] " } ,
2007-01-17 14:56:22 +00:00
{ " rmaps " , " base " , " n_pernode " , ' \0 ' , " npernode " , " npernode " , 1 ,
2006-12-13 04:51:38 +00:00
NULL , OPAL_CMD_LINE_TYPE_INT ,
2006-12-12 00:54:05 +00:00
" Launch n processes per node on all allocated nodes " } ,
2008-07-28 14:18:36 +00:00
{ " rmaps " , " base " , " slot_list " , ' \0 ' , " slot-list " , " slot-list " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files) " } ,
2007-01-17 14:56:22 +00:00
{ " rmaps " , " base " , " no_oversubscribe " , ' \0 ' , " nooversubscribe " , " nooversubscribe " , 0 ,
2006-12-13 04:51:38 +00:00
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
2006-07-10 21:25:33 +00:00
" Nodes are not to be oversubscribed, even if the system supports such operation " } ,
2008-04-23 14:52:09 +00:00
{ " rmaps " , " base " , " loadbalance " , ' \0 ' , " loadbalance " , " loadbalance " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Balance total number of procs across all allocated nodes " } ,
2006-12-13 13:49:15 +00:00
{ " rmaps " , " base " , " display_map " , ' \0 ' , " display-map " , " display-map " , 0 ,
2006-12-03 13:59:23 +00:00
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Display the process map just before launch " } ,
2008-09-23 15:46:34 +00:00
{ " rmaps " , " base " , " display_devel_map " , ' \0 ' , " display-devel-map " , " display-devel-map " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Display a detailed process map (mostly intended for developers) just before launch " } ,
2008-05-29 14:11:31 +00:00
{ NULL , NULL , NULL , ' H ' , " host " , " host " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" List of hosts to invoke processes on " } ,
{ " rmaps " , " base " , " no_schedule_local " , ' \0 ' , " nolocal " , " nolocal " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Do not run any MPI applications on the local node " } ,
2009-08-30 14:30:36 +00:00
{ " rmaps " , " base " , " cpus_per_rank " , ' \0 ' , " cpus-per-proc " , " cpus-per-proc " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
" Number of cpus to use for each process [default=1] " } ,
2009-08-11 02:51:27 +00:00
{ " rmaps " , " base " , " cpus_per_rank " , ' \0 ' , " cpus-per-rank " , " cpus-per-rank " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
2009-08-30 14:30:36 +00:00
" Synonym for cpus-per-proc " } ,
2009-08-11 02:51:27 +00:00
{ " rmaps " , " base " , " n_perboard " , ' \0 ' , " nperboard " , " nperboard " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
" Launch n processes per board on all allocated nodes " } ,
{ " rmaps " , " base " , " n_persocket " , ' \0 ' , " npersocket " , " npersocket " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
" Launch n processes per socket on all allocated nodes " } ,
/* binding options */
2009-09-21 17:00:02 +00:00
{ NULL , NULL , NULL , ' \0 ' , " bind-to-none " , " bind-to-none " , 0 ,
2009-09-18 19:48:42 +00:00
& orterun_globals . bind_to_none , OPAL_CMD_LINE_TYPE_BOOL ,
" Do not bind processes to cores or sockets " } ,
2009-08-11 02:51:27 +00:00
{ NULL , NULL , NULL , ' \0 ' , " bind-to-core " , " bind-to-core " , 0 ,
& orterun_globals . bind_to_core , OPAL_CMD_LINE_TYPE_BOOL ,
" Whether to bind processes to specific cores (the default) " } ,
{ NULL , NULL , NULL , ' \0 ' , " bind-to-board " , " bind-to-board " , 0 ,
& orterun_globals . bind_to_board , OPAL_CMD_LINE_TYPE_BOOL ,
" Whether to bind processes to specific boards (meaningless on 1 board/node) " } ,
{ NULL , NULL , NULL , ' \0 ' , " bind-to-socket " , " bind-to-socket " , 0 ,
& orterun_globals . bind_to_socket , OPAL_CMD_LINE_TYPE_BOOL ,
" Whether to bind processes to sockets " } ,
{ " rmaps " , " base " , " stride " , ' \0 ' , " stride " , " stride " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
" When binding multiple cores to a rank, the step size to use between cores [default: 1] " } ,
2009-09-28 03:17:15 +00:00
{ " orte " , " report " , " bindings " , ' \0 ' , " report-bindings " , " report-bindings " , 0 ,
2009-08-18 17:10:23 +00:00
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
2009-09-28 03:17:15 +00:00
" Whether to report process bindings to stderr " } ,
2009-08-11 02:51:27 +00:00
2008-05-29 14:11:31 +00:00
/* Allocation options */
2008-04-20 02:25:45 +00:00
{ " ras " , " base " , " display_alloc " , ' \0 ' , " display-allocation " , " display-allocation " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Display the allocation being used by this job " } ,
2008-09-23 15:46:34 +00:00
{ " ras " , " base " , " display_devel_alloc " , ' \0 ' , " display-devel-allocation " , " display-devel-allocation " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Display a detailed list (mostly intended for developers) of the allocation being used by this job " } ,
2009-08-11 02:51:27 +00:00
{ " orte " , " cpu " , " set " , ' \0 ' , " cpu-set " , " cpu-set " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Comma-separated list of ranges specifying logical cpus allocated to this job [default: none] " } ,
/* cluster hardware info */
{ " orte " , " num " , " boards " , ' \0 ' , " num-boards " , " num-boards " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
" Number of processor boards/node (1-256) [default: 1] " } ,
{ " orte " , " num " , " sockets " , ' \0 ' , " num-sockets " , " num-sockets " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
" Number of sockets/board (1-256) [default: 1] " } ,
{ " orte " , " num " , " cores " , ' \0 ' , " num-cores " , " num-cores " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
" Number of cores/socket (1-256) [default: 1] " } ,
2008-04-20 02:25:45 +00:00
2005-03-14 20:57:21 +00:00
/* mpiexec-like arguments */
{ NULL , NULL , NULL , ' \0 ' , " wdir " , " wdir " , 1 ,
2005-07-04 00:13:44 +00:00
& orterun_globals . wdir , OPAL_CMD_LINE_TYPE_STRING ,
2005-03-14 20:57:21 +00:00
" Set the working directory of the started processes " } ,
2007-05-08 19:09:32 +00:00
{ NULL , NULL , NULL , ' \0 ' , " wd " , " wd " , 1 ,
& orterun_globals . wdir , OPAL_CMD_LINE_TYPE_STRING ,
" Synonym for --wdir " } ,
2005-03-14 20:57:21 +00:00
{ NULL , NULL , NULL , ' \0 ' , " path " , " path " , 1 ,
2005-07-04 00:13:44 +00:00
& orterun_globals . path , OPAL_CMD_LINE_TYPE_STRING ,
2005-03-14 20:57:21 +00:00
" PATH to be used to look for executables to start processes " } ,
2006-07-04 20:12:35 +00:00
2005-11-20 16:06:53 +00:00
/* User-level debugger arguments */
{ NULL , NULL , NULL , ' \0 ' , " tv " , " tv " , 0 ,
& orterun_globals . debugger , OPAL_CMD_LINE_TYPE_BOOL ,
" Deprecated backwards compatibility flag; synonym for \" --debug \" " } ,
{ NULL , NULL , NULL , ' \0 ' , " debug " , " debug " , 0 ,
& orterun_globals . debugger , OPAL_CMD_LINE_TYPE_BOOL ,
" Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter " } ,
{ " orte " , " base " , " user_debugger " , ' \0 ' , " debugger " , " debugger " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Sequence of debuggers to search for when \" --debug \" is used " } ,
2005-05-12 21:44:23 +00:00
/* OpenRTE arguments */
2008-08-14 18:59:01 +00:00
{ " orte " , " debug " , NULL , ' d ' , " debug-devel " , " debug-devel " , 0 ,
2005-07-04 00:13:44 +00:00
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
2005-05-12 21:44:23 +00:00
" Enable debugging of OpenRTE " } ,
2006-10-11 15:18:57 +00:00
2008-08-14 18:59:01 +00:00
{ " orte " , " debug " , " daemons " , ' \0 ' , " debug-daemons " , " debug-daemons " , 0 ,
2005-07-04 00:13:44 +00:00
NULL , OPAL_CMD_LINE_TYPE_INT ,
2005-05-12 21:44:23 +00:00
" Enable debugging of any OpenRTE daemons used by this application " } ,
2006-10-11 15:18:57 +00:00
2008-08-14 18:59:01 +00:00
{ " orte " , " debug " , " daemons_file " , ' \0 ' , " debug-daemons-file " , " debug-daemons-file " , 0 ,
2005-07-04 00:13:44 +00:00
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
2005-05-12 21:44:23 +00:00
" Enable debugging of any OpenRTE daemons used by this application, storing output in files " } ,
2006-10-11 15:18:57 +00:00
2008-08-14 18:59:01 +00:00
{ " orte " , " leave " , " session_attached " , ' \0 ' , " leave-session-attached " , " leave-session-attached " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Enable debugging of OpenRTE " } ,
{ NULL , NULL , NULL , ' \0 ' , " tmpdir " , " tmpdir " , 1 ,
2009-03-05 21:56:03 +00:00
& orte_process_info . tmpdir_base , OPAL_CMD_LINE_TYPE_STRING ,
2005-05-12 21:44:23 +00:00
" Set the root for the session directory tree for orterun ONLY " } ,
2008-08-14 18:59:01 +00:00
{ " orte " , " do_not " , " launch " , ' \0 ' , " do-not-launch " , " do-not-launch " , 0 ,
2008-04-17 13:50:59 +00:00
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Perform all necessary operations to prepare to launch the application, but do not actually launch it " } ,
2006-12-13 04:51:38 +00:00
2006-02-28 11:52:12 +00:00
{ NULL , NULL , NULL , ' \0 ' , NULL , " prefix " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Prefix where Open MPI is installed on remote nodes " } ,
2006-10-06 13:02:56 +00:00
{ NULL , NULL , NULL , ' \0 ' , NULL , " noprefix " , 0 ,
2006-09-15 02:52:08 +00:00
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Disable automatic --prefix behavior " } ,
2006-03-23 16:53:11 +00:00
2009-06-02 23:52:59 +00:00
{ " orte " , " report " , " launch_progress " , ' \0 ' , " show-progress " , " show-progress " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Output a brief periodic report on launch progress " } ,
2009-06-23 20:25:38 +00:00
{ " orte " , " use " , " regexp " , ' \0 ' , " use-regexp " , " use-regexp " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Use regular expressions for launch " } ,
2009-09-09 05:28:45 +00:00
{ " orte " , " report " , " events " , ' \0 ' , " report-events " , " report-events " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Report events to a tool listening at the specified URI " } ,
2005-03-14 20:57:21 +00:00
/* End of list */
{ NULL , NULL , NULL , ' \0 ' , NULL , NULL , 0 ,
2005-07-04 00:13:44 +00:00
NULL , OPAL_CMD_LINE_TYPE_NULL , NULL }
2005-03-14 20:57:21 +00:00
} ;
/*
* Local functions
*/
2008-02-28 01:57:57 +00:00
/* Callback handed to orte_wait_event() in orterun() for the "job_complete"
 * event on the orte_exit trigger. */
static void job_completed ( int trigpipe , short event , void * arg ) ;
/* Handler registered with opal_signal_set() in orterun() for SIGTERM and
 * SIGINT (non-Windows builds). */
static void abort_signal_callback ( int fd , short flags , void * arg ) ;
/* NOTE(review): event-style callback; its registration site is not in this
 * part of the file - presumably part of the abort path, confirm there. */
static void abort_exit_callback ( int fd , short event , void * arg ) ;
2006-06-26 15:12:52 +00:00
/* NOTE(review): presumably forwards a received signal on to the job -
 * confirm against the definition. */
static void signal_forward_callback ( int fd , short event , void * arg ) ;
2005-03-14 20:57:21 +00:00
/* Builds an app context from an argc/argv pair. A comment in orterun()
 * expects the "no applications given" case to be caught here. */
static int create_app ( int argc , char * argv [ ] , orte_app_context_t * * app ,
2005-08-08 16:42:28 +00:00
bool * made_app , char * * * app_env ) ;
2005-03-14 20:57:21 +00:00
/* Called first in orterun(), before the command line is created and parsed. */
static int init_globals ( void ) ;
2007-06-27 01:03:31 +00:00
/* Called from orterun() after orte_register_params() to check for the
 * "global" command line params. */
static int parse_globals ( int argc , char * argv [ ] , opal_cmd_line_t * cmd_line ) ;
2005-03-14 20:57:21 +00:00
/* Called from orterun() to parse each app, adding it to the job object. */
static int parse_locals ( int argc , char * argv [ ] ) ;
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and an app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
/* NOTE(review): presumably parses the application file named by filename,
 * with env as the environment to apply - confirm against the definition. */
static int parse_appfile ( char * filename , char * * * env ) ;
2008-02-28 01:57:57 +00:00
/* NOTE(review): presumably reports processes that aborted - the definition
 * and call sites are outside this part of the file. */
static void dump_aborted_procs ( void ) ;
2009-04-30 15:08:02 +00:00
/* Callback handed to orte_wait_event() in orterun() for the "orted_exit"
 * event on the orteds_exit trigger. */
static void just_quit ( int fd , short ign , void * arg ) ;
2005-03-14 20:57:21 +00:00
2005-08-31 16:15:59 +00:00
int orterun ( int argc , char * argv [ ] )
2005-03-14 20:57:21 +00:00
{
2008-02-28 01:57:57 +00:00
int rc ;
2007-06-27 01:03:31 +00:00
opal_cmd_line_t cmd_line ;
2008-04-23 00:17:12 +00:00
char * tmp_env_var = NULL ;
2005-03-14 20:57:21 +00:00
2007-06-27 01:03:31 +00:00
/* find our basename (the name of the executable) so that we can
use it in pretty - print error messages */
orterun_basename = opal_basename ( argv [ 0 ] ) ;
2007-04-21 00:15:05 +00:00
2007-06-27 01:03:31 +00:00
/* Setup and parse the command line */
init_globals ( ) ;
opal_cmd_line_create ( & cmd_line , cmd_line_init ) ;
mca_base_cmd_line_setup ( & cmd_line ) ;
2008-02-28 01:57:57 +00:00
if ( ORTE_SUCCESS ! = ( rc = opal_cmd_line_parse ( & cmd_line , true ,
2007-06-27 01:03:31 +00:00
argc , argv ) ) ) {
2008-02-28 01:57:57 +00:00
return rc ;
2007-06-27 01:03:31 +00:00
}
2007-04-21 00:15:05 +00:00
2008-12-09 23:49:02 +00:00
/*
* Since this process can now handle MCA / GMCA parameters , make sure to
* process them .
*/
mca_base_cmd_line_process_args ( & cmd_line , & environ , & environ ) ;
/* make sure that opal_profile is -not- set for us locally as
* we really only want to profile MPI apps . However , if it is
* set , remember it so we can add it to the apps environment later
*/
if ( NULL ! = getenv ( " OMPI_MCA_opal_profile " ) ) {
putenv ( " OMPI_MCA_opal_profile=0 " ) ;
profile_is_set = true ;
/* ensure that I know to turn on my profile receive! */
putenv ( " OMPI_MCA_orte_grpcomm_recv_on=1 " ) ;
}
/* Ensure that enough of OPAL is setup for us to be able to run */
/*
* NOTE : ( JJH )
* We need to allow ' mca_base_cmd_line_process_args ( ) ' to process command
* line arguments * before * calling opal_init_util ( ) since the command
* line could contain MCA parameters that affect the way opal_init_util ( )
* functions . AMCA parameters are one such option normally received on the
* command line that affect the way opal_init_util ( ) behaves .
* It is " safe " to call mca_base_cmd_line_process_args ( ) before
* opal_init_util ( ) since mca_base_cmd_line_process_args ( ) does * not *
* depend upon opal_init_util ( ) functionality .
*/
2007-06-27 01:03:31 +00:00
/* Need to initialize OPAL so that install_dirs are filled in */
2008-05-19 11:58:48 +00:00
if ( OPAL_SUCCESS ! = opal_init_util ( ) ) {
exit ( 1 ) ;
}
2007-07-13 15:47:57 +00:00
2008-08-06 11:31:06 +00:00
/* setup the exit triggers */
OBJ_CONSTRUCT ( & orte_exit , orte_trigger_event_t ) ;
OBJ_CONSTRUCT ( & orteds_exit , orte_trigger_event_t ) ;
2008-08-06 21:53:35 +00:00
2009-05-04 11:07:40 +00:00
/* flag that I am the HNP - needs to be done prior to
* registering params
*/
orte_process_info . proc_type = ORTE_PROC_HNP ;
2007-04-21 00:15:05 +00:00
/* Setup MCA params */
2008-06-19 02:58:14 +00:00
orte_register_params ( ) ;
2008-06-24 17:50:56 +00:00
2005-03-14 20:57:21 +00:00
/* Check for some "global" command line params */
2007-06-27 01:03:31 +00:00
parse_globals ( argc , argv , & cmd_line ) ;
OBJ_DESTRUCT ( & cmd_line ) ;
2005-03-14 20:57:21 +00:00
2008-02-28 01:57:57 +00:00
/* create a new job object to hold the info for this one - the
* jobid field will be filled in by the PLM when the job is
* launched
*/
jdata = OBJ_NEW ( orte_job_t ) ;
if ( NULL = = jdata ) {
ORTE_ERROR_LOG ( ORTE_ERR_OUT_OF_RESOURCE ) ;
return ORTE_ERR_OUT_OF_RESOURCE ;
2005-07-08 18:48:25 +00:00
}
2009-08-11 02:51:27 +00:00
Roll in the revamped IOF subsystem. Per the devel mailing list email, this is a complete rewrite of the iof framework designed to simplify the code for maintainability, and to support features we had planned to do, but were too difficult to implement in the old code. Specifically, the new code:
1. completely and cleanly separates responsibilities between the HNP, orted, and tool components.
2. removes all wireup messaging during launch and shutdown.
3. maintains flow control for stdin to avoid large-scale consumption of memory by orteds when large input files are forwarded. This is done using an xon/xoff protocol.
4. enables specification of stdin recipients on the mpirun cmd line. Allowed options include rank, "all", or "none". Default is rank 0.
5. creates a new MPI_Info key "ompi_stdin_target" that supports the above options for child jobs. Default is "none".
6. adds a new tool "orte-iof" that can connect to a running mpirun and display the output. Cmd line options allow selection of any combination of stdout, stderr, and stddiag. Default is stdout.
7. adds a new mpirun and orte-iof cmd line option "tag-output" that will tag each line of output with process name and stream ident. For example, "[1,0]<stdout>this is output"
This is not intended for the 1.3 release as it is a major change requiring considerable soak time.
This commit was SVN r19767.
2008-10-18 00:00:49 +00:00
/* check what user wants us to do with stdin */
if ( 0 = = strcmp ( orterun_globals . stdin_target , " all " ) ) {
jdata - > stdin_target = ORTE_VPID_WILDCARD ;
} else if ( 0 = = strcmp ( orterun_globals . stdin_target , " none " ) ) {
jdata - > stdin_target = ORTE_VPID_INVALID ;
} else {
jdata - > stdin_target = strtoul ( orterun_globals . stdin_target , NULL , 10 ) ;
}
2008-02-28 01:57:57 +00:00
/* Parse each app, adding it to the job object */
parse_locals ( argc , argv ) ;
if ( 0 = = jdata - > num_apps ) {
2005-07-08 18:48:25 +00:00
/* This should never happen -- this case should be caught in
2008-02-28 01:57:57 +00:00
create_app ( ) , but let ' s just double check . . . */
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already made the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing of messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to make configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:nothing-to-do " ,
2005-07-08 18:48:25 +00:00
true , orterun_basename ) ;
2008-03-05 01:46:30 +00:00
exit ( ORTE_ERROR_DEFAULT_EXIT_CODE ) ;
2005-04-15 21:52:58 +00:00
}
2005-03-14 20:57:21 +00:00
2008-07-08 22:36:39 +00:00
/* save the environment for launch purposes. This MUST be
* done so that we can pass it to any local procs we
* spawn - otherwise , those local procs won ' t see any
* non - MCA envars that were set in the environment prior to calling
* orterun
*/
orte_launch_environ = opal_argv_copy ( environ ) ;
2007-03-16 23:11:45 +00:00
# if OPAL_ENABLE_FT == 1
/* Disable OPAL CR notifications for this tool */
opal_cr_set_enabled ( false ) ;
2008-04-23 00:17:12 +00:00
tmp_env_var = mca_base_param_env_var ( " opal_cr_is_tool " ) ;
opal_setenv ( tmp_env_var ,
2007-10-17 13:47:36 +00:00
" 1 " ,
true , & environ ) ;
2008-04-23 00:17:12 +00:00
free ( tmp_env_var ) ;
2007-03-16 23:11:45 +00:00
# endif
2008-04-23 00:17:12 +00:00
tmp_env_var = NULL ; /* Silence compiler warning */
2008-02-28 01:57:57 +00:00
/* Initialize our Open RTE environment
* Set the flag telling orte_init that I am NOT a
2005-06-24 16:59:37 +00:00
* singleton , but am " infrastructure " - prevents setting
* up incorrect infrastructure that only a singleton would
* require
*/
2009-05-04 11:07:40 +00:00
if ( ORTE_SUCCESS ! = ( rc = orte_init ( ORTE_PROC_HNP ) ) ) {
Commit the orted-failed-to-start code. This correctly causes the system to detect the failure of an orted to start and allows the system to terminate all procs/orteds that *did* start.
The primary change that underlies all this is in the OOB. Specifically, the problem in the code until now has been that the OOB attempts to resolve an address when we call the "send" to an unknown recipient. The OOB would then wait forever if that recipient never actually started (and hence, never reported back its OOB contact info). In the case of an orted that failed to start, we would correctly detect that the orted hadn't started, but then we would attempt to order all orteds (including the one that failed to start) to die. This would cause the OOB to "hang" the system.
Unfortunately, revising how the OOB resolves addresses introduced a number of additional problems. Specifically, and most troublesome, was the fact that comm_spawn involved the immediate transmission of the rendezvous point from parent-to-child after the child was spawned. The current code used the OOB address resolution as a "barrier" - basically, the parent would attempt to send the info to the child, and then "hold" there until the child's contact info had arrived (meaning the child had started) and the send could be completed.
Note that this also caused comm_spawn to "hang" the entire system if the child never started... The app-failed-to-start helped improve that behavior - this code provides additional relief.
With this change, the OOB will return an ADDRESSEE_UNKNOWN error if you attempt to send to a recipient whose contact info isn't already in the OOB's hash tables. To resolve comm_spawn issues, we also now force the cross-sharing of connection info between parent and child jobs during spawn.
Finally, to aid in setting triggers to the right values, we introduce the "arith" API for the GPR. This function allows you to atomically change the value in a registry location (either divide, multiply, add, or subtract) by the provided operand. It is equivalent to first fetching the value using a "get", then modifying it, and then putting the result back into the registry via a "put".
This commit was SVN r14711.
2007-05-21 18:31:28 +00:00
ORTE_ERROR_LOG ( rc ) ;
2005-03-14 20:57:21 +00:00
return rc ;
2007-07-13 15:47:57 +00:00
}
2009-09-02 18:03:10 +00:00
2008-12-10 17:10:39 +00:00
/* check for request to report uri */
2008-12-24 15:27:46 +00:00
if ( NULL ! = orterun_globals . report_uri ) {
FILE * fp ;
char * rml_uri ;
rml_uri = orte_rml . get_contact_info ( ) ;
if ( 0 = = strcmp ( orterun_globals . report_uri , " - " ) ) {
/* if '-', then output to stdout */
printf ( " %s \n " , ( NULL = = rml_uri ) ? " NULL " : rml_uri ) ;
} else if ( 0 = = strcmp ( orterun_globals . report_uri , " + " ) ) {
/* if '+', output to stderr */
fprintf ( stderr , " %s \n " , ( NULL = = rml_uri ) ? " NULL " : rml_uri ) ;
} else {
fp = fopen ( orterun_globals . report_uri , " w " ) ;
if ( NULL = = fp ) {
orte_show_help ( " help-orterun.txt " , " orterun:write_file " , false ,
orterun_basename , " uri " , orterun_globals . report_uri ) ;
exit ( 0 ) ;
}
fprintf ( fp , " %s \n " , ( NULL = = rml_uri ) ? " NULL " : rml_uri ) ;
fclose ( fp ) ;
2008-12-10 17:10:39 +00:00
}
2008-12-24 15:27:46 +00:00
if ( NULL ! = rml_uri ) {
free ( rml_uri ) ;
}
2008-12-10 17:10:39 +00:00
}
2008-10-24 01:42:58 +00:00
/* Change the default behavior of libevent such that we want to
continually block rather than blocking for the default timeout
and then looping around the progress engine again . There
should be nothing in the orted that cannot block in libevent
until " something " happens ( i . e . , there ' s no need to keep
cycling through progress because the only things that should
happen will happen in libevent ) . This is a minor optimization ,
but what the heck . . . : - ) */
opal_progress_set_event_flag ( OPAL_EVLOOP_ONCE ) ;
2008-10-16 14:58:32 +00:00
/* setup an event we can wait for that will tell
* us to terminate - both normal and abnormal
* termination will call us here . Use the
* same exit fd as the daemon does so that orted_comm
* can cause either of us to exit since we share that code
*/
if ( ORTE_SUCCESS ! = ( rc = orte_wait_event ( & orterun_event , & orte_exit , " job_complete " , job_completed ) ) ) {
orte_show_help ( " help-orterun.txt " , " orterun:event-def-failed " , true ,
orterun_basename , ORTE_ERROR_NAME ( rc ) ) ;
ORTE_UPDATE_EXIT_STATUS ( ORTE_ERROR_DEFAULT_EXIT_CODE ) ;
goto DONE ;
}
/* setup an event that will
* trigger when the orteds are gone and tell the orteds that it is
* okay to finalize and exit , we are done with them .
* We set this up here in order to provide a way for us to
* wakeup and terminate should the daemons themselves fail to launch ,
* and before we define signal handlers since they will call the
* exit event trigger !
*/
2009-05-11 14:11:44 +00:00
if ( ORTE_SUCCESS ! = ( rc = orte_wait_event ( & orteds_exit_event , & orteds_exit , " orted_exit " , just_quit ) ) ) {
2008-10-16 14:58:32 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:event-def-failed " , true ,
orterun_basename , ORTE_ERROR_NAME ( rc ) ) ;
goto DONE ;
}
2009-08-27 08:11:56 +00:00
# ifndef __WINDOWS__
Per the July technical meeting:
During the discussion of MPI-2 functionality, it was pointed out by Aurelien that there was an inherent race condition between startup of ompi-server and mpirun. Specifically, if someone started ompi-server to run in the background as part of a script, and then immediately executed mpirun, it was possible that an MPI proc could attempt to contact the server (or that mpirun could try to read the server's contact file) before the server is running and ready.
At that time, we discussed creating a new tool "ompi-wait-server" that would wait for the server to be running, and/or probe to see if it is running and return true/false. However, rather than create yet another tool, it seemed just as effective to add the functionality to mpirun.
Thus, this commit creates two new mpirun cmd line flags (hey, you can never have too many!):
--wait-for-server : instructs mpirun to ping the server to see if it responds. This causes mpirun to execute an rml.ping to the server's URI with an appropriate timeout interval - if the ping isn't successful, mpirun attempts it again.
--server-wait-time xx : sets the ping timeout interval to xx seconds. Note that mpirun will attempt to ping the server twice with this timeout, so we actually wait for twice this time. Default is 10 seconds, which should be plenty of time.
This has only lightly been tested. It works if the server is present, and outputs a nice error message if it cannot be contacted. I have not tested the race condition case.
This commit was SVN r19152.
2008-08-04 20:29:50 +00:00
/** setup callbacks for abort signals - from this point
* forward , we need to abort in a manner that allows us
* to cleanup
*/
opal_signal_set ( & term_handler , SIGTERM ,
abort_signal_callback , & term_handler ) ;
opal_signal_add ( & term_handler , NULL ) ;
opal_signal_set ( & int_handler , SIGINT ,
abort_signal_callback , & int_handler ) ;
opal_signal_add ( & int_handler , NULL ) ;
2009-01-30 18:50:10 +00:00
Per the July technical meeting:
During the discussion of MPI-2 functionality, it was pointed out by Aurelien that there was an inherent race condition between startup of ompi-server and mpirun. Specifically, if someone started ompi-server to run in the background as part of a script, and then immediately executed mpirun, it was possible that an MPI proc could attempt to contact the server (or that mpirun could try to read the server's contact file) before the server is running and ready.
At that time, we discussed creating a new tool "ompi-wait-server" that would wait for the server to be running, and/or probe to see if it is running and return true/false. However, rather than create yet another tool, it seemed just as effective to add the functionality to mpirun.
Thus, this commit creates two new mpirun cmd line flags (hey, you can never have too many!):
--wait-for-server : instructs mpirun to ping the server to see if it responds. This causes mpirun to execute an rml.ping to the server's URI with an appropriate timeout interval - if the ping isn't successful, mpirun attempts it again.
--server-wait-time xx : sets the ping timeout interval to xx seconds. Note that mpirun will attempt to ping the server twice with this timeout, so we actually wait for twice this time. Default is 10 seconds, which should be plenty of time.
This has only lightly been tested. It works if the server is present, and outputs a nice error message if it cannot be contacted. I have not tested the race condition case.
This commit was SVN r19152.
2008-08-04 20:29:50 +00:00
/** setup callbacks for signals we should forward */
opal_signal_set ( & sigusr1_handler , SIGUSR1 ,
signal_forward_callback , & sigusr1_handler ) ;
opal_signal_add ( & sigusr1_handler , NULL ) ;
opal_signal_set ( & sigusr2_handler , SIGUSR2 ,
signal_forward_callback , & sigusr2_handler ) ;
opal_signal_add ( & sigusr2_handler , NULL ) ;
2009-01-30 18:50:10 +00:00
if ( orte_forward_job_control ) {
opal_signal_set ( & sigtstp_handler , SIGTSTP ,
signal_forward_callback , & sigtstp_handler ) ;
opal_signal_add ( & sigtstp_handler , NULL ) ;
opal_signal_set ( & sigcont_handler , SIGCONT ,
signal_forward_callback , & sigcont_handler ) ;
opal_signal_add ( & sigcont_handler , NULL ) ;
}
Per the July technical meeting:
During the discussion of MPI-2 functionality, it was pointed out by Aurelien that there was an inherent race condition between startup of ompi-server and mpirun. Specifically, if someone started ompi-server to run in the background as part of a script, and then immediately executed mpirun, it was possible that an MPI proc could attempt to contact the server (or that mpirun could try to read the server's contact file) before the server is running and ready.
At that time, we discussed creating a new tool "ompi-wait-server" that would wait for the server to be running, and/or probe to see if it is running and return true/false. However, rather than create yet another tool, it seemed just as effective to add the functionality to mpirun.
Thus, this commit creates two new mpirun cmd line flags (hey, you can never have too many!):
--wait-for-server : instructs mpirun to ping the server to see if it responds. This causes mpirun to execute an rml.ping to the server's URI with an appropriate timeout interval - if the ping isn't successful, mpirun attempts it again.
--server-wait-time xx : sets the ping timeout interval to xx seconds. Note that mpirun will attempt to ping the server twice with this timeout, so we actually wait for twice this time. Default is 10 seconds, which should be plenty of time.
This has only lightly been tested. It works if the server is present, and outputs a nice error message if it cannot be contacted. I have not tested the race condition case.
This commit was SVN r19152.
2008-08-04 20:29:50 +00:00
# endif /* __WINDOWS__ */
2009-02-25 03:10:21 +00:00
signals_set = true ;
2007-07-19 19:00:06 +00:00
/* If we have a prefix, then modify the PATH and
LD_LIBRARY_PATH environment variables in our copy . This
will ensure that any locally - spawned children will
have our executables and libraries in their path
For now , default to the prefix_dir provided in the first app_context .
Since there always MUST be at least one app_context , we are safe in
doing this .
2008-02-28 01:57:57 +00:00
*/
if ( NULL ! = ( ( orte_app_context_t * ) jdata - > apps - > addr [ 0 ] ) - > prefix_dir ) {
2007-07-19 19:00:06 +00:00
char * oldenv , * newenv , * lib_base , * bin_base ;
lib_base = opal_basename ( opal_install_dirs . libdir ) ;
bin_base = opal_basename ( opal_install_dirs . bindir ) ;
/* Reset PATH */
2008-02-28 01:57:57 +00:00
newenv = opal_os_path ( false , ( ( orte_app_context_t * ) jdata - > apps - > addr [ 0 ] ) - > prefix_dir , bin_base , NULL ) ;
2007-07-19 19:00:06 +00:00
oldenv = getenv ( " PATH " ) ;
if ( NULL ! = oldenv ) {
char * temp ;
asprintf ( & temp , " %s:%s " , newenv , oldenv ) ;
free ( newenv ) ;
newenv = temp ;
}
opal_setenv ( " PATH " , newenv , true , & orte_launch_environ ) ;
if ( orte_debug_flag ) {
2008-06-09 14:53:58 +00:00
opal_output ( 0 , " %s: reset PATH: %s " , orterun_basename , newenv ) ;
2007-07-19 19:00:06 +00:00
}
free ( newenv ) ;
free ( bin_base ) ;
/* Reset LD_LIBRARY_PATH */
2008-02-28 01:57:57 +00:00
newenv = opal_os_path ( false , ( ( orte_app_context_t * ) jdata - > apps - > addr [ 0 ] ) - > prefix_dir , lib_base , NULL ) ;
2007-07-19 19:00:06 +00:00
oldenv = getenv ( " LD_LIBRARY_PATH " ) ;
if ( NULL ! = oldenv ) {
char * temp ;
asprintf ( & temp , " %s:%s " , newenv , oldenv ) ;
free ( newenv ) ;
newenv = temp ;
}
opal_setenv ( " LD_LIBRARY_PATH " , newenv , true , & orte_launch_environ ) ;
if ( orte_debug_flag ) {
2008-06-09 14:53:58 +00:00
opal_output ( 0 , " %s: reset LD_LIBRARY_PATH: %s " ,
2007-07-19 19:00:06 +00:00
orterun_basename , newenv ) ;
}
free ( newenv ) ;
free ( lib_base ) ;
}
2006-09-14 15:27:17 +00:00
/* pre-condition any network transports that require it */
2008-02-28 01:57:57 +00:00
if ( ORTE_SUCCESS ! = ( rc = orte_pre_condition_transports ( jdata ) ) ) {
2006-09-14 15:27:17 +00:00
ORTE_ERROR_LOG ( rc ) ;
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing of messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:precondition " , false ,
2006-09-14 15:27:17 +00:00
orterun_basename , NULL , NULL , rc ) ;
2009-02-25 03:10:21 +00:00
ORTE_UPDATE_EXIT_STATUS ( ORTE_ERROR_DEFAULT_EXIT_CODE ) ;
goto DONE ;
2006-09-14 15:27:17 +00:00
}
2007-07-12 19:53:18 +00:00
/* setup to listen for commands sent specifically to me, even though I would probably
* be the one sending them ! Unfortunately , since I am a participating daemon ,
* there are times I need to send a command to " all daemons " , and that means * I * have
* to receive it too
*/
2008-02-28 01:57:57 +00:00
rc = orte_rml . recv_buffer_nb ( ORTE_NAME_WILDCARD , ORTE_RML_TAG_DAEMON ,
ORTE_RML_NON_PERSISTENT , orte_daemon_recv , NULL ) ;
2007-07-12 19:53:18 +00:00
if ( rc ! = ORTE_SUCCESS & & rc ! = ORTE_ERR_NOT_IMPLEMENTED ) {
ORTE_ERROR_LOG ( rc ) ;
2009-02-25 03:10:21 +00:00
ORTE_UPDATE_EXIT_STATUS ( ORTE_ERROR_DEFAULT_EXIT_CODE ) ;
goto DONE ;
2007-07-12 19:53:18 +00:00
}
2008-02-28 01:57:57 +00:00
/* setup the data server */
if ( ORTE_SUCCESS ! = ( rc = orte_data_server_init ( ) ) ) {
ORTE_ERROR_LOG ( rc ) ;
2009-02-25 03:10:21 +00:00
ORTE_UPDATE_EXIT_STATUS ( ORTE_ERROR_DEFAULT_EXIT_CODE ) ;
goto DONE ;
2006-12-13 04:51:38 +00:00
}
2008-02-28 01:57:57 +00:00
2008-04-16 14:27:42 +00:00
/* if an uri for the ompi-server was provided, set the route */
if ( NULL ! = ompi_server ) {
opal_buffer_t buf ;
/* setup our route to the server */
OBJ_CONSTRUCT ( & buf , opal_buffer_t ) ;
opal_dss . pack ( & buf , & ompi_server , 1 , OPAL_STRING ) ;
2009-05-14 00:25:02 +00:00
if ( ORTE_SUCCESS ! = ( rc = orte_rml_base_update_contact_info ( & buf ) ) ) {
ORTE_ERROR_LOG ( rc ) ;
ORTE_UPDATE_EXIT_STATUS ( ORTE_ERROR_DEFAULT_EXIT_CODE ) ;
goto DONE ;
}
2008-04-16 14:27:42 +00:00
OBJ_DESTRUCT ( & buf ) ;
Per the July technical meeting:
During the discussion of MPI-2 functionality, it was pointed out by Aurelien that there was an inherent race condition between startup of ompi-server and mpirun. Specifically, if someone started ompi-server to run in the background as part of a script, and then immediately executed mpirun, it was possible that an MPI proc could attempt to contact the server (or that mpirun could try to read the server's contact file) before the server is running and ready.
At that time, we discussed creating a new tool "ompi-wait-server" that would wait for the server to be running, and/or probe to see if it is running and return true/false. However, rather than create yet another tool, it seemed just as effective to add the functionality to mpirun.
Thus, this commit creates two new mpirun cmd line flags (hey, you can never have too many!):
--wait-for-server : instructs mpirun to ping the server to see if it responds. This causes mpirun to execute an rml.ping to the server's URI with an appropriate timeout interval - if the ping isn't successful, mpirun attempts it again.
--server-wait-time xx : sets the ping timeout interval to xx seconds. Note that mpirun will attempt to ping the server twice with this timeout, so we actually wait for twice this time. Default is 10 seconds, which should be plenty of time.
This has only lightly been tested. It works if the server is present, and outputs a nice error message if it cannot be contacted. I have not tested the race condition case.
This commit was SVN r19152.
2008-08-04 20:29:50 +00:00
/* check if we are to wait for the server to start - resolves
* a race condition that can occur when the server is run
* as a background job - e . g . , in scripts
*/
if ( orterun_globals . wait_for_server ) {
/* ping the server */
struct timeval timeout ;
timeout . tv_sec = orterun_globals . server_wait_timeout ;
timeout . tv_usec = 0 ;
if ( ORTE_SUCCESS ! = ( rc = orte_rml . ping ( ompi_server , & timeout ) ) ) {
/* try it one more time */
if ( ORTE_SUCCESS ! = ( rc = orte_rml . ping ( ompi_server , & timeout ) ) ) {
/* okay give up */
orte_show_help ( " help-orterun.txt " , " orterun:server-not-found " , true ,
orterun_basename , ompi_server ,
( long ) orterun_globals . server_wait_timeout ,
ORTE_ERROR_NAME ( rc ) ) ;
2009-02-25 03:10:21 +00:00
ORTE_UPDATE_EXIT_STATUS ( ORTE_ERROR_DEFAULT_EXIT_CODE ) ;
Per the July technical meeting:
During the discussion of MPI-2 functionality, it was pointed out by Aurelien that there was an inherent race condition between startup of ompi-server and mpirun. Specifically, if someone started ompi-server to run in the background as part of a script, and then immediately executed mpirun, it was possible that an MPI proc could attempt to contact the server (or that mpirun could try to read the server's contact file) before the server is running and ready.
At that time, we discussed creating a new tool "ompi-wait-server" that would wait for the server to be running, and/or probe to see if it is running and return true/false. However, rather than create yet another tool, it seemed just as effective to add the functionality to mpirun.
Thus, this commit creates two new mpirun cmd line flags (hey, you can never have too many!):
--wait-for-server : instructs mpirun to ping the server to see if it responds. This causes mpirun to execute an rml.ping to the server's URI with an appropriate timeout interval - if the ping isn't successful, mpirun attempts it again.
--server-wait-time xx : sets the ping timeout interval to xx seconds. Note that mpirun will attempt to ping the server twice with this timeout, so we actually wait for twice this time. Default is 10 seconds, which should be plenty of time.
This has only lightly been tested. It works if the server is present, and outputs a nice error message if it cannot be contacted. I have not tested the race condition case.
This commit was SVN r19152.
2008-08-04 20:29:50 +00:00
goto DONE ;
}
}
}
2008-04-16 14:27:42 +00:00
}
2008-07-29 17:39:16 +00:00
/* setup for debugging */
2008-06-18 15:28:46 +00:00
orte_debugger_init_before_spawn ( jdata ) ;
2007-07-12 19:53:18 +00:00
2008-02-28 01:57:57 +00:00
/* Spawn the job */
rc = orte_plm . spawn ( jdata ) ;
2007-05-18 13:29:11 +00:00
2008-07-29 17:39:16 +00:00
/* complete debugger interface */
2008-06-18 15:28:46 +00:00
orte_debugger_init_after_spawn ( jdata ) ;
2008-02-28 01:57:57 +00:00
/* now wait until the termination event fires */
opal_event_dispatch ( ) ;
/* we only reach this point by jumping there due
* to an error - so just cleanup and leave
*/
2009-02-25 03:10:21 +00:00
DONE :
2009-04-30 15:08:02 +00:00
ORTE_UPDATE_EXIT_STATUS ( orte_exit_status ) ;
just_quit ( 0 , 0 , NULL ) ;
2008-02-28 01:57:57 +00:00
return orte_exit_status ;
}
/*
 * Report on an abnormal exit (if any), shut the daemons down, and leave.
 *
 * Sequence, as written below:
 *   1. cancel and free the pending abort_exit_event timer, if one exists;
 *   2. if nothing was ever launched, skip straight to the exit path;
 *   3. if orte_exit_status is non-zero, call dump_aborted_procs() and print
 *      summary counts of failed-to-start / aborted / killed processes;
 *   4. finalize the debugger interface;
 *   5. order the daemons to terminate; if that call fails, arm a timeout
 *      whose callback is just_quit;
 *   6. run the one-sided barrier, fold rc into the exit status and call
 *      just_quit(), which ends the process via exit().
 *
 * The three parameters (trigpipe, event, arg) are not referenced in the body.
 */
static void job_completed ( int trigpipe , short event , void * arg )
{
int rc ;
2008-05-29 13:38:27 +00:00
orte_job_t * daemons ;
2008-02-28 01:57:57 +00:00
2008-06-02 21:46:34 +00:00
/* if the abort exit event is set, delete it */
if ( NULL ! = abort_exit_event ) {
2008-06-10 17:53:28 +00:00
opal_evtimer_del ( abort_exit_event ) ;
free ( abort_exit_event ) ;
2008-06-02 21:46:34 +00:00
}
2008-06-10 17:53:28 +00:00
2008-08-19 15:19:30 +00:00
/* if we never launched, just skip this part to avoid
* meaningless error messages
*/
if ( orte_never_launched ) {
/* rc is seeded here because DONE feeds it to ORTE_UPDATE_EXIT_STATUS */
rc = orte_exit_status ;
goto DONE ;
}
2009-03-03 16:39:13 +00:00
if ( 0 ! = orte_exit_status ) {
2007-05-18 13:29:11 +00:00
/* abnormal termination of some kind */
2008-02-28 01:57:57 +00:00
dump_aborted_procs ( ) ;
2007-05-18 13:29:11 +00:00
/* If we showed more abort messages than were allowed,
show a followup message here */
/* Each summary below is written to orte_xml_fp and is wrapped in
 * <stderr> ... </stderr> tags only when orte_xml_output is set.
 * NOTE(review): every guard is "> 1", so a count of exactly 1 prints no
 * summary and the plural-suffix ternary inside each fprintf always takes
 * its first branch. */
2008-02-28 01:57:57 +00:00
if ( num_failed_start > 1 ) {
2009-08-28 02:46:15 +00:00
if ( orte_xml_output ) {
2009-09-02 18:03:10 +00:00
fprintf ( orte_xml_fp , " <stderr> " ) ;
2009-08-28 02:46:15 +00:00
}
2009-09-02 18:03:10 +00:00
fprintf ( orte_xml_fp , " %d total process%s failed to start " ,
2008-02-28 01:57:57 +00:00
num_failed_start , ( ( num_failed_start > 1 ) ? " es " : " " ) ) ;
2009-08-28 02:46:15 +00:00
if ( orte_xml_output ) {
2009-09-02 18:03:10 +00:00
fprintf ( orte_xml_fp , " 
</stderr> " ) ;
2009-08-28 02:46:15 +00:00
}
2009-09-02 18:03:10 +00:00
fprintf ( orte_xml_fp , " \n " ) ;
2005-03-14 20:57:21 +00:00
}
2008-02-28 01:57:57 +00:00
if ( num_aborted > 1 ) {
2009-08-28 02:46:15 +00:00
if ( orte_xml_output ) {
2009-09-02 18:03:10 +00:00
fprintf ( orte_xml_fp , " <stderr> " ) ;
2009-08-28 02:46:15 +00:00
}
2009-09-02 18:03:10 +00:00
fprintf ( orte_xml_fp , " %d total process%s aborted " ,
2008-02-28 01:57:57 +00:00
num_aborted , ( ( num_aborted > 1 ) ? " es " : " " ) ) ;
2009-08-28 02:46:15 +00:00
if ( orte_xml_output ) {
2009-09-02 18:03:10 +00:00
fprintf ( orte_xml_fp , " 
</stderr> " ) ;
2009-08-28 02:46:15 +00:00
}
2009-09-02 18:03:10 +00:00
fprintf ( orte_xml_fp , " \n " ) ;
2008-02-28 01:57:57 +00:00
}
if ( num_killed > 1 ) {
2009-08-28 02:46:15 +00:00
if ( orte_xml_output ) {
2009-09-02 18:03:10 +00:00
fprintf ( orte_xml_fp , " <stderr> " ) ;
2009-08-28 02:46:15 +00:00
}
2009-09-02 18:03:10 +00:00
fprintf ( orte_xml_fp , " %d total process%s killed (some possibly by %s during cleanup) " ,
2008-02-28 01:57:57 +00:00
num_killed , ( ( num_killed > 1 ) ? " es " : " " ) , orterun_basename ) ;
2009-08-28 02:46:15 +00:00
if ( orte_xml_output ) {
2009-09-02 18:03:10 +00:00
fprintf ( orte_xml_fp , " 
</stderr> " ) ;
2009-08-28 02:46:15 +00:00
}
2009-09-02 18:03:10 +00:00
fprintf ( orte_xml_fp , " \n " ) ;
2007-05-18 13:29:11 +00:00
}
}
2008-06-09 20:34:14 +00:00
/* if the debuggers were run, clean up */
2008-06-18 15:28:46 +00:00
orte_debugger_finalize ( ) ;
2008-10-16 14:58:32 +00:00
/* ask the daemons to terminate; rc is kept so DONE can fold it into
 * the exit status */
2008-06-10 17:53:28 +00:00
if ( ORTE_SUCCESS ! = ( rc = orte_plm . terminate_orteds ( ) ) ) {
2008-02-28 01:57:57 +00:00
/* since we know that the sends didn't completely go out,
2009-05-11 14:11:44 +00:00
* we know that the barrier will never complete . Add a timeout so
2008-02-28 01:57:57 +00:00
* that those daemons that can respond have a chance to do
* so
When we can detect that a daemon has failed, then we would like to terminate the system without having it lock up. The "hang" is currently caused by the system attempting to send messages to the daemons (specifically, ordering them to kill their local procs and then terminate). Unfortunately, without some idea of which daemon has died, the system hangs while attempting to send a message to someone who is no longer alive.
This commit introduces the necessary logic to avoid that conflict. If a PLS component can identify that a daemon has failed, then we will set a flag indicating that fact. The xcast system will subsequently check that flag and, if it is set, will send all messages direct to the recipient. In the case of "kill local procs" and "terminate", the messages will go directly to each orted, thus bypassing any orted that has failed.
In addition, the xcast system will -not- wait for the messages to complete, but will return immediately (i.e., operate in non-blocking mode). Orterun will wait (via an event timer) for a period of time based on the number of daemons in the system to allow the messages to attempt to be delivered - at the end of that time, orterun will simply exit, alerting the user to the problem and -strongly- recommending they run orte-clean.
I could only test this on slurm for the case where all daemons unexpectedly died - srun apparently only executes its waitpid callback when all launched functions terminate. I have asked that Jeff integrate this capability into the OOB as he is working on it so that we execute it whenever a socket to an orted is unexpectedly closed. Meantime, the functionality will rarely get called, but at least the logic is available for anyone whose environment can support it.
This commit was SVN r16451.
2007-10-15 18:00:30 +00:00
*/
2008-05-29 13:38:27 +00:00
/* get the orted job data object */
if ( NULL = = ( daemons = orte_get_job_data_object ( ORTE_PROC_MY_NAME - > jobid ) ) ) {
/* no daemon job record was found - skip the timeout and barrier and
 * exit with the terminate_orteds error still in rc */
goto DONE ;
}
/* arm a timeout that invokes just_quit; its arguments are the daemon
 * count and the per-proc / maximum timeout settings */
2008-06-10 17:53:28 +00:00
ORTE_DETECT_TIMEOUT ( & timeout_ev , daemons - > num_procs ,
2008-02-28 01:57:57 +00:00
orte_timeout_usec_per_proc ,
2009-05-11 14:11:44 +00:00
orte_max_timeout , just_quit ) ;
2007-05-18 13:29:11 +00:00
}
2008-02-28 01:57:57 +00:00
2009-05-11 14:11:44 +00:00
/* ensure all the orteds depart together */
/* NOTE(review): presumably this call does not return until the daemons
 * have checked in or the timeout above fires - confirm against the
 * grpcomm implementation */
orte_grpcomm . onesided_barrier ( ) ;
2008-02-28 01:57:57 +00:00
DONE :
/* common exit path: record rc, then tear down; just_quit() calls exit() */
2009-04-30 15:08:02 +00:00
ORTE_UPDATE_EXIT_STATUS ( rc ) ;
just_quit ( 0 , 0 , NULL ) ;
2008-02-28 01:57:57 +00:00
}
2007-05-18 13:29:11 +00:00
2009-05-11 14:11:44 +00:00
/*
 * Final teardown: release events and signal handlers, clean up session
 * directories and the data server, finalize ORTE, and terminate the process
 * with exit(orte_exit_status).  This function does not return.
 *
 * It is called directly as just_quit(0, 0, NULL) and is also handed to
 * orte_wait_event and ORTE_DETECT_TIMEOUT as a callback elsewhere in this
 * file.  The three parameters (fd, ign, arg) are not referenced in the body.
 */
static void just_quit ( int fd , short ign , void * arg )
2008-02-28 01:57:57 +00:00
{
2009-05-11 14:11:44 +00:00
/* if the orted exit event is set, delete it */
if ( NULL ! = orteds_exit_event ) {
opal_evtimer_del ( orteds_exit_event ) ;
free ( orteds_exit_event ) ;
2009-02-25 03:10:21 +00:00
}
2008-02-28 01:57:57 +00:00
/* only unregister handlers if signals_set says they were installed */
2009-04-30 15:08:02 +00:00
if ( signals_set ) {
/* Remove the TERM and INT signal handlers */
/* NOTE(review): these two deletions sit outside the __WINDOWS__ guard
 * below, while the visible part of main() appears to register the
 * TERM/INT handlers inside an #ifndef __WINDOWS__ region - confirm this
 * is safe on Windows builds */
opal_signal_del ( & term_handler ) ;
opal_signal_del ( & int_handler ) ;
# ifndef __WINDOWS__
/** Remove the USR signal handlers */
opal_signal_del ( & sigusr1_handler ) ;
opal_signal_del ( & sigusr2_handler ) ;
/* the job-control handlers are removed under the same
 * orte_forward_job_control condition tested here */
if ( orte_forward_job_control ) {
opal_signal_del ( & sigtstp_handler ) ;
opal_signal_del ( & sigcont_handler ) ;
}
# endif /* __WINDOWS__ */
/* clear the flag so the handlers are not deleted a second time */
signals_set = false ;
}
2008-02-28 01:57:57 +00:00
/* whack any lingering session directory files from our jobs */
orte_session_dir_cleanup ( ORTE_JOBID_WILDCARD ) ;
2009-04-30 15:08:02 +00:00
2008-02-28 01:57:57 +00:00
/* cleanup our data server */
orte_data_server_finalize ( ) ;
2009-04-30 15:08:02 +00:00
/* cleanup and leave */
2005-03-14 20:57:21 +00:00
orte_finalize ( ) ;
2009-08-18 03:15:29 +00:00
/* orterun_basename is released here and not used again below */
2005-04-13 15:26:33 +00:00
free ( orterun_basename ) ;
2008-06-09 19:21:20 +00:00
if ( orte_debug_flag ) {
fprintf ( stderr , " orterun: exiting with status %d \n " , orte_exit_status ) ;
}
/* terminate the process with the accumulated exit status */
2008-02-28 01:57:57 +00:00
exit ( orte_exit_status ) ;
2005-03-14 20:57:21 +00:00
}
2009-04-30 15:08:02 +00:00
2005-03-31 19:39:02 +00:00
/*
2005-09-04 20:54:19 +00:00
* On abnormal termination - dump the
2005-03-31 19:39:02 +00:00
* exit status of the aborted procs .
*/
2008-02-28 01:57:57 +00:00
static void dump_aborted_procs ( void )
2005-03-31 19:39:02 +00:00
{
2008-02-28 01:57:57 +00:00
orte_std_cntr_t i , n ;
orte_proc_t * proc , * * procs ;
orte_app_context_t * * apps ;
orte_job_t * * jobs , * job ;
bool found = false ;
2007-04-24 19:19:14 +00:00
2008-02-28 01:57:57 +00:00
/* find the job that caused the problem - be sure to start the loop
* at 1 as the daemons are in 0 and will clearly be " running " , so no
* point in checking them
*/
jobs = ( orte_job_t * * ) orte_job_data - > addr ;
for ( n = 1 ; n < orte_job_data - > size ; n + + ) {
if ( NULL = = jobs [ n ] ) {
2009-03-03 16:39:13 +00:00
/* the array is no longer left-justified, so we have to continue */
continue ;
2008-02-28 01:57:57 +00:00
}
if ( ORTE_JOB_STATE_UNDEF ! = jobs [ n ] - > state & &
ORTE_JOB_STATE_INIT ! = jobs [ n ] - > state & &
ORTE_JOB_STATE_LAUNCHED ! = jobs [ n ] - > state & &
ORTE_JOB_STATE_RUNNING ! = jobs [ n ] - > state & &
ORTE_JOB_STATE_TERMINATED ! = jobs [ n ] - > state & &
ORTE_JOB_STATE_ABORT_ORDERED ! = jobs [ n ] - > state ) {
/* this is a guilty party */
job = jobs [ n ] ;
proc = job - > aborted_proc ;
procs = ( orte_proc_t * * ) job - > procs - > addr ;
apps = ( orte_app_context_t * * ) job - > apps - > addr ;
/* flag that we found at least one */
found = true ;
/* cycle through and count the number that were killed or aborted */
for ( i = 0 ; i < job - > procs - > size ; i + + ) {
if ( NULL = = procs [ i ] ) {
/* array is left-justified - we are done */
break ;
2006-02-07 03:32:36 +00:00
}
2008-02-28 01:57:57 +00:00
if ( ORTE_PROC_STATE_FAILED_TO_START = = procs [ i ] - > state ) {
+ + num_failed_start ;
} else if ( ORTE_PROC_STATE_ABORTED = = procs [ i ] - > state ) {
+ + num_aborted ;
} else if ( ORTE_PROC_STATE_ABORTED_BY_SIG = = procs [ i ] - > state ) {
+ + num_killed ;
2006-02-07 03:32:36 +00:00
}
2005-03-31 19:39:02 +00:00
}
2008-02-28 01:57:57 +00:00
if ( ORTE_JOB_STATE_FAILED_TO_START = = job - > state ) {
if ( NULL = = proc ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not their opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:proc-failed-to-start-no-status-no-node " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename ) ;
return ;
2007-04-24 19:19:14 +00:00
}
2008-02-28 01:57:57 +00:00
if ( ORTE_ERR_SYS_LIMITS_PIPES = = proc - > exit_code ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not their opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:sys-limit-pipe " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , proc - > node - > name ,
( unsigned long ) proc - > name . vpid ) ;
} else if ( ORTE_ERR_PIPE_SETUP_FAILURE = = proc - > exit_code ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not their opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:pipe-setup-failure " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , proc - > node - > name ,
( unsigned long ) proc - > name . vpid ) ;
} else if ( ORTE_ERR_SYS_LIMITS_CHILDREN = = proc - > exit_code ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not their opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:sys-limit-children " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , proc - > node - > name ,
( unsigned long ) proc - > name . vpid ) ;
} else if ( ORTE_ERR_FAILED_GET_TERM_ATTRS = = proc - > exit_code ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not their opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:failed-term-attrs " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , proc - > node - > name ,
( unsigned long ) proc - > name . vpid ) ;
} else if ( ORTE_ERR_WDIR_NOT_FOUND = = proc - > exit_code ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not their opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to have configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:wdir-not-found " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , apps [ proc - > app_idx ] - > cwd ,
proc - > node - > name , ( unsigned long ) proc - > name . vpid ) ;
} else if ( ORTE_ERR_EXE_NOT_FOUND = = proc - > exit_code ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already made the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing of messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to have configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:exe-not-found " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , apps [ proc - > app_idx ] - > app ,
proc - > node - > name , ( unsigned long ) proc - > name . vpid ) ;
} else if ( ORTE_ERR_EXE_NOT_ACCESSIBLE = = proc - > exit_code ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already made the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing of messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to have configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:exe-not-accessible " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , apps [ proc - > app_idx ] - > app , proc - > node - > name ,
( unsigned long ) proc - > name . vpid ) ;
} else if ( ORTE_ERR_PIPE_READ_FAILURE = = proc - > exit_code ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already made the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing of messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to have configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:pipe-read-failure " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , proc - > node - > name , ( unsigned long ) proc - > name . vpid ) ;
} else if ( 0 ! = proc - > exit_code ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already made the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing of messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to have configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:proc-failed-to-start " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , ORTE_ERROR_NAME ( proc - > exit_code ) , proc - > node - > name ,
2009-08-22 02:58:20 +00:00
( unsigned long ) proc - > name . vpid ) ;
} else if ( ORTE_ERR_SOCKET_NOT_AVAILABLE = = proc - > exit_code ) {
orte_show_help ( " help-orterun.txt " , " orterun:proc-socket-not-avail " , true ,
orterun_basename , ORTE_ERROR_NAME ( proc - > exit_code ) , proc - > node - > name ,
2008-02-28 01:57:57 +00:00
( unsigned long ) proc - > name . vpid ) ;
} else {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already made the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing of messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to have configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:proc-failed-to-start-no-status " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , proc - > node - > name ) ;
2007-04-24 19:19:14 +00:00
}
2008-02-28 01:57:57 +00:00
} else if ( ORTE_JOB_STATE_ABORTED = = job - > state ) {
if ( NULL = = proc ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already made the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing of messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to have configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:proc-aborted-unknown " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename ) ;
} else {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing of messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to have configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:proc-ordered-abort " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , ( unsigned long ) proc - > name . vpid , ( unsigned long ) proc - > pid ,
proc - > node - > name , orterun_basename ) ;
2007-04-24 19:19:14 +00:00
}
2008-02-28 01:57:57 +00:00
} else if ( ORTE_JOB_STATE_ABORTED_BY_SIG = = job - > state ) { /* aborted by signal */
if ( NULL = = proc ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing of messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to have configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:proc-aborted-signal-unknown " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename ) ;
} else {
2007-04-24 19:19:14 +00:00
# ifdef HAVE_STRSIGNAL
2008-02-28 01:57:57 +00:00
if ( NULL ! = strsignal ( WTERMSIG ( proc - > exit_code ) ) ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing of messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to have configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:proc-aborted-strsignal " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , ( unsigned long ) proc - > name . vpid , ( unsigned long ) proc - > pid ,
proc - > node - > name , WTERMSIG ( proc - > exit_code ) ,
strsignal ( WTERMSIG ( proc - > exit_code ) ) ) ;
} else {
2007-02-09 16:39:30 +00:00
# endif
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing of messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to have configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:proc-aborted " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , ( unsigned long ) proc - > name . vpid , ( unsigned long ) proc - > pid ,
proc - > node - > name , WTERMSIG ( proc - > exit_code ) ) ;
2007-02-09 16:39:30 +00:00
# ifdef HAVE_STRSIGNAL
2006-02-16 20:40:23 +00:00
}
2008-02-28 01:57:57 +00:00
# endif
2005-04-15 21:52:58 +00:00
}
2008-03-19 19:00:51 +00:00
} else if ( ORTE_JOB_STATE_ABORTED_WO_SYNC = = job - > state ) { /* proc exited w/o finalize */
if ( NULL = = proc ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing of messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to have configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:proc-exit-no-sync-unknown " , true ,
2008-03-19 19:00:51 +00:00
orterun_basename , orterun_basename ) ;
} else {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:proc-exit-no-sync " , true ,
2008-03-19 19:00:51 +00:00
orterun_basename , ( unsigned long ) proc - > name . vpid , ( unsigned long ) proc - > pid ,
proc - > node - > name , orterun_basename ) ;
}
2005-04-15 21:52:58 +00:00
}
2008-02-28 01:57:57 +00:00
return ;
2005-04-28 13:18:52 +00:00
}
2005-03-31 19:39:02 +00:00
}
2008-02-28 01:57:57 +00:00
/* if we got here, then we couldn't find the job that aborted -
* report that fact and give up
*/
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already made the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:proc-aborted-unknown " , true , orterun_basename ) ;
2005-03-14 20:57:21 +00:00
}
2008-02-28 01:57:57 +00:00
/*
 * abort_exit_callback - event callback that carries out an ordered abort.
 *
 * Within this file it is scheduled by abort_signal_callback through
 * ORTE_TIMER_EVENT, so the teardown happens after the signal handler
 * itself has returned.
 *
 * Steps, in order:
 *   1. unless orterun_globals.quiet is set, print a killing-job notice
 *      to stderr;
 *   2. record ORTE_ERROR_DEFAULT_EXIT_CODE via ORTE_UPDATE_EXIT_STATUS;
 *   3. if jdata is non-NULL, its jobid is not ORTE_JOBID_INVALID and
 *      orte_never_launched is false: call orte_debugger_finalize, then
 *      orte_plm.terminate_orteds, and arm ORTE_DETECT_TIMEOUT on
 *      abort_exit_event with just_quit as the timeout callback;
 *   4. otherwise call just_quit directly.
 *
 * fd, ign, arg: not inspected here; forwarded unchanged to just_quit.
 */
static void abort_exit_callback ( int fd , short ign , void * arg )
2005-03-14 20:57:21 +00:00
{
int ret ;
2007-01-30 23:03:13 +00:00
2006-06-26 18:21:45 +00:00
if ( ! orterun_globals . quiet ) {
2006-09-14 21:29:51 +00:00
fprintf ( stderr , " %s: killing job... \n \n " , orterun_basename ) ;
2006-06-26 18:21:45 +00:00
}
2007-01-25 14:17:44 +00:00
2009-04-14 15:58:54 +00:00
/* since we are being terminated by a user's signal, be
* sure to exit with a non - zero exit code - but don ' t
* overwrite any error code from a proc that might have
* failed , in case that is why the user ordered us
* to terminate
*/
ORTE_UPDATE_EXIT_STATUS ( ORTE_ERROR_DEFAULT_EXIT_CODE ) ;
2006-09-14 21:29:51 +00:00
/* terminate the job - this will also wakeup orterun so
2008-02-28 01:57:57 +00:00
* it can report to the user and kill all the orteds .
* Check the jobid , though , just in case the user
* hit ctrl - c before we had a chance to setup the
* job in the system - in which case there is nothing
* to terminate !
2006-09-14 21:29:51 +00:00
*/
2009-04-30 15:08:02 +00:00
if ( NULL ! = jdata & &
jdata - > jobid ! = ORTE_JOBID_INVALID & &
! orte_never_launched ) {
/* if the debuggers were run, clean up */
orte_debugger_finalize ( ) ;
/* terminate the orteds - they will automatically kill
* their local procs
Afraid this has a couple of things mixed into the commit. Couldn't be helped - had missed one commit prior to running out the door on vacation.
Fix race conditions in abnormal terminations. We had done a first-cut at this in a prior commit. However, the window remained partially open due to the fact that the HNP has multiple paths leading to orte_finalize. Most of our frameworks don't care if they are finalized more than once, but one of them does, which meant we segfaulted if orte_finalize got called more than once. Besides, we really shouldn't be doing that anyway.
So we now introduce a set of atomic locks that prevent us from multiply calling abort, attempting to call orte_finalize, etc. My initial tests indicate this is working cleanly, but since it is a race condition issue, more testing will have to be done before we know for sure that this problem has been licked.
Also, some updates relevant to the tool comm library snuck in here. Since those also touched the orted code (as did the prior changes), I didn't want to attempt to separate them out - besides, they are coming in soon anyway. More on them later as that functionality approaches completion.
This commit was SVN r17843.
2008-03-17 17:58:59 +00:00
*/
2009-04-30 15:08:02 +00:00
ret = orte_plm . terminate_orteds ( ) ;
2005-03-14 20:57:21 +00:00
if ( ORTE_SUCCESS ! = ret ) {
2009-04-30 15:08:02 +00:00
/* If we failed the terminate_orteds() above, then we
* need to just die
2008-02-28 01:57:57 +00:00
*/
2009-04-30 15:08:02 +00:00
just_quit ( fd , ign , arg ) ;
2005-03-14 20:57:21 +00:00
}
/* NOTE(review): there is no return after the failure-path just_quit call,
 * so if just_quit ever returns, control falls through and still arms the
 * timeout below - presumably just_quit exits the process; confirm. */
2008-06-02 21:46:34 +00:00
/* give ourselves a time limit on how long to wait
* for the job to die , just in case we can ' t make it go
* away for some reason . Don ' t send us directly back
2008-08-05 15:09:29 +00:00
* to job_completed , though , as that function may be
* what has failed
2008-06-02 21:46:34 +00:00
*/
ORTE_DETECT_TIMEOUT ( & abort_exit_event , jdata - > num_procs ,
orte_timeout_usec_per_proc ,
orte_max_timeout ,
2009-04-30 15:08:02 +00:00
just_quit ) ;
2008-06-02 21:46:34 +00:00
2008-02-28 01:57:57 +00:00
} else {
2009-04-30 15:08:02 +00:00
/* if the jobid is invalid or we never launched,
* there is nothing to do but just clean ourselves
* up and exit
2008-02-28 01:57:57 +00:00
*/
2009-04-30 15:08:02 +00:00
just_quit ( fd , ign , arg ) ;
2005-03-14 20:57:21 +00:00
}
2008-02-28 01:57:57 +00:00
}
2007-01-30 23:03:13 +00:00
2008-02-28 01:57:57 +00:00
/*
 * abort_signal_callback - handler for the signals that order an abort.
 *
 * First invocation: takes orte_abort_inprogress_lock, sets
 * orte_abnormal_term_ordered and orte_job_term_ordered, and schedules
 * abort_exit_callback through ORTE_TIMER_EVENT so the actual teardown
 * runs after this handler has returned.
 *
 * Later invocations (lock already held): the first repeat prints a
 * notice to stderr and sets forcibly_die; the next one kills local
 * procs, cleans up session directories and the data server, and calls
 * exit with ORTE_ERROR_DEFAULT_EXIT_CODE.
 *
 * fd, flags, arg: not used by this handler.
 */
static void abort_signal_callback ( int fd , short flags , void * arg )
{
Afraid this has a couple of things mixed into the commit. Couldn't be helped - had missed one commit prior to running out the door on vacation.
Fix race conditions in abnormal terminations. We had done a first-cut at this in a prior commit. However, the window remained partially open due to the fact that the HNP has multiple paths leading to orte_finalize. Most of our frameworks don't care if they are finalized more than once, but one of them does, which meant we segfaulted if orte_finalize got called more than once. Besides, we really shouldn't be doing that anyway.
So we now introduce a set of atomic locks that prevent us from multiply calling abort, attempting to call orte_finalize, etc. My initial tests indicate this is working cleanly, but since it is a race condition issue, more testing will have to be done before we know for sure that this problem has been licked.
Also, some updates relevant to the tool comm library snuck in here. Since those also touched the orted code (as did the prior changes), I didn't want to attempt to separate them out - besides, they are coming in soon anyway. More on them later as that functionality approaches completion.
This commit was SVN r17843.
2008-03-17 17:58:59 +00:00
/* if we have already ordered this once, don't keep
* doing it to avoid race conditions
2008-02-28 01:57:57 +00:00
* The trylock result is non-zero when the lock is already held,
* i.e. an abort has already been ordered, so that is the case
* the branch below must handle.
*/
Afraid this has a couple of things mixed into the commit. Couldn't be helped - had missed one commit prior to running out the door on vacation.
Fix race conditions in abnormal terminations. We had done a first-cut at this in a prior commit. However, the window remained partially open due to the fact that the HNP has multiple paths leading to orte_finalize. Most of our frameworks don't care if they are finalized more than once, but one of them does, which meant we segfaulted if orte_finalize got called more than once. Besides, we really shouldn't be doing that anyway.
So we now introduce a set of atomic locks that prevent us from multiply calling abort, attempting to call orte_finalize, etc. My initial tests indicate this is working cleanly, but since it is a race condition issue, more testing will have to be done before we know for sure that this problem has been licked.
Also, some updates relevant to the tool comm library snuck in here. Since those also touched the orted code (as did the prior changes), I didn't want to attempt to separate them out - besides, they are coming in soon anyway. More on them later as that functionality approaches completion.
This commit was SVN r17843.
2008-03-17 17:58:59 +00:00
if ( opal_atomic_trylock ( & orte_abort_inprogress_lock ) ) { /* returns 1 if already locked */
2008-06-09 13:08:54 +00:00
if ( forcibly_die ) {
/* kill any local procs */
2009-07-13 02:29:17 +00:00
orte_odls . kill_local_procs ( NULL , false ) ;
2008-06-09 13:08:54 +00:00
/* whack any lingering session directory files from our jobs */
orte_session_dir_cleanup ( ORTE_JOBID_WILDCARD ) ;
/* cleanup our data server */
orte_data_server_finalize ( ) ;
/* exit with a non-zero status */
exit ( ORTE_ERROR_DEFAULT_EXIT_CODE ) ;
}
fprintf ( stderr , " %s: abort is already in progress...hit ctrl-c again to forcibly terminate \n \n " , orterun_basename ) ;
/* arm the forced exit for the next repeat of the signal */
forcibly_die = true ;
2008-02-28 01:57:57 +00:00
return ;
}
/* set the global abnormal exit flag so we know not to
* use the standard xcast for terminating orteds
*/
orte_abnormal_term_ordered = true ;
2009-02-27 15:01:28 +00:00
/* ensure that the forwarding of stdin stops */
orte_job_term_ordered = true ;
2008-02-28 01:57:57 +00:00
/* We are in an event handler; the job completed procedure
will delete the signal handler that is currently running
( which is a Bad Thing ) , so we can ' t call it directly .
Instead , we have to exit this handler and setup to call
job_completed ( ) after this . */
2008-12-10 20:40:47 +00:00
ORTE_TIMER_EVENT ( 0 , 0 , abort_exit_callback ) ;
2005-03-14 20:57:21 +00:00
}
2006-06-08 18:27:17 +00:00
/**
 * Pass user signals to the remote application processes
 *
 * arg is cast to a struct opal_event pointer and the signal number is
 * read from it with OPAL_EVENT_SIGNAL. Unless orterun_globals.quiet is
 * set, a forwarding notice is printed to stderr. The signal is then
 * handed to orte_plm.signal_job for jdata->jobid; a return value other
 * than ORTE_SUCCESS is reported on stderr and otherwise ignored.
 *
 * fd, event: not used.
 *
 * NOTE(review): jdata is dereferenced without the NULL and
 * ORTE_JOBID_INVALID checks that abort_exit_callback performs - confirm
 * this handler cannot fire before the job object is set up.
 * NOTE(review): the failure message format has no newline escape,
 * unlike the other stderr messages in this file - likely an oversight.
 * NOTE(review): the local named signal shadows the signal function
 * declared by signal.h, which this file includes.
 */
2006-06-26 15:12:52 +00:00
static void signal_forward_callback ( int fd , short event , void * arg )
2006-06-08 18:27:17 +00:00
{
2006-08-23 02:35:00 +00:00
struct opal_event * signal = ( struct opal_event * ) arg ;
2006-06-26 15:12:52 +00:00
int signum , ret ;
2006-06-08 18:27:17 +00:00
2006-06-26 15:12:52 +00:00
signum = OPAL_EVENT_SIGNAL ( signal ) ;
2006-06-26 18:21:45 +00:00
if ( ! orterun_globals . quiet ) {
2007-04-05 17:45:03 +00:00
fprintf ( stderr , " %s: Forwarding signal %d to job \n " ,
2006-06-26 15:12:52 +00:00
orterun_basename , signum ) ;
2006-06-26 18:21:45 +00:00
}
2006-06-08 18:27:17 +00:00
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
/** send the signal out to the processes, including any descendants */
2008-02-28 01:57:57 +00:00
if ( ORTE_SUCCESS ! = ( ret = orte_plm . signal_job ( jdata - > jobid , signum ) ) ) {
2006-06-26 15:12:52 +00:00
fprintf ( stderr , " Signal %d could not be sent to the job (returned %d) " ,
signum , ret ) ;
2006-06-08 18:27:17 +00:00
}
}
2005-09-04 20:54:19 +00:00
/*
 * init_globals - put orterun_globals into its default state.
 *
 * One-time part (guarded by globals_init): constructs
 * orterun_globals.lock and gives initial values to the pointer fields,
 * the server-wait settings, stdin_target, report_pid and report_uri.
 *
 * Every-call part: clears the boolean option flags and num_procs,
 * frees and NULLs env_val, appfile, wdir and path, and resets the
 * preload_* fields. It then sets globals_init to true.
 *
 * Returns ORTE_SUCCESS unconditionally.
 *
 * NOTE(review): preload_files and preload_files_dest_dir are set to
 * NULL without a preceding free, unlike the four fields above them -
 * if they can hold heap strings when this runs a second time they
 * would leak; confirm ownership.
 */
static int init_globals ( void )
2005-03-14 20:57:21 +00:00
{
2005-03-18 23:58:36 +00:00
/* Only CONSTRUCT things once */
if ( ! globals_init ) {
2005-07-03 22:45:48 +00:00
OBJ_CONSTRUCT ( & orterun_globals . lock , opal_mutex_t ) ;
2006-10-23 03:34:08 +00:00
orterun_globals . env_val = NULL ;
orterun_globals . appfile = NULL ;
orterun_globals . wdir = NULL ;
orterun_globals . path = NULL ;
2008-02-28 01:57:57 +00:00
orterun_globals . ompi_server = NULL ;
Per the July technical meeting:
During the discussion of MPI-2 functionality, it was pointed out by Aurelien that there was an inherent race condition between startup of ompi-server and mpirun. Specifically, if someone started ompi-server to run in the background as part of a script, and then immediately executed mpirun, it was possible that an MPI proc could attempt to contact the server (or that mpirun could try to read the server's contact file before the server is running and ready.
At that time, we discussed createing a new tool "ompi-wait-server" that would wait for the server to be running, and/or probe to see if it is running and return true/false. However, rather than create yet another tool, it seemed just as effective to add the functionality to mpirun.
Thus, this commit creates two new mpirun cmd line flags (hey, you can never have too many!):
--wait-for-server : instructs mpirun to ping the server to see if it responds. This causes mpirun to execute an rml.ping to the server's URI with an appropriate timeout interval - if the ping isn't successful, mpirun attempts it again.
--server-wait-time xx : sets the ping timeout interval to xx seconds. Note that mpirun will attempt to ping the server twice with this timeout, so we actually wait for twice this time. Default is 10 seconds, which should be plenty of time.
This has only lightly been tested. It works if the server is present, and outputs a nice error message if it cannot be contacted. I have not tested the race condition case.
This commit was SVN r19152.
2008-08-04 20:29:50 +00:00
orterun_globals . wait_for_server = false ;
orterun_globals . server_wait_timeout = 10 ;
Roll in the revamped IOF subsystem. Per the devel mailing list email, this is a complete rewrite of the iof framework designed to simplify the code for maintainability, and to support features we had planned to do, but were too difficult to implement in the old code. Specifically, the new code:
1. completely and cleanly separates responsibilities between the HNP, orted, and tool components.
2. removes all wireup messaging during launch and shutdown.
3. maintains flow control for stdin to avoid large-scale consumption of memory by orteds when large input files are forwarded. This is done using an xon/xoff protocol.
4. enables specification of stdin recipients on the mpirun cmd line. Allowed options include rank, "all", or "none". Default is rank 0.
5. creates a new MPI_Info key "ompi_stdin_target" that supports the above options for child jobs. Default is "none".
6. adds a new tool "orte-iof" that can connect to a running mpirun and display the output. Cmd line options allow selection of any combination of stdout, stderr, and stddiag. Default is stdout.
7. adds a new mpirun and orte-iof cmd line option "tag-output" that will tag each line of output with process name and stream ident. For example, "[1,0]<stdout>this is output"
This is not intended for the 1.3 release as it is a major change requiring considerable soak time.
This commit was SVN r19767.
2008-10-18 00:00:49 +00:00
orterun_globals . stdin_target = " 0 " ;
2008-12-24 15:27:46 +00:00
orterun_globals . report_pid = NULL ;
orterun_globals . report_uri = NULL ;
2005-03-18 23:58:36 +00:00
}
2006-07-10 21:25:33 +00:00
/* Reset the other fields every time */
2005-03-18 23:58:36 +00:00
2006-10-23 03:34:08 +00:00
orterun_globals . help = false ;
orterun_globals . version = false ;
orterun_globals . verbose = false ;
orterun_globals . quiet = false ;
orterun_globals . by_node = false ;
orterun_globals . by_slot = false ;
2009-08-11 02:51:27 +00:00
orterun_globals . by_board = false ;
orterun_globals . by_socket = false ;
orterun_globals . bind_to_core = false ;
orterun_globals . bind_to_board = false ;
orterun_globals . bind_to_socket = false ;
2006-10-23 03:34:08 +00:00
orterun_globals . debugger = false ;
2006-12-12 00:54:05 +00:00
orterun_globals . num_procs = 0 ;
2006-11-15 22:59:01 +00:00
if ( NULL ! = orterun_globals . env_val )
2006-10-23 03:34:08 +00:00
free ( orterun_globals . env_val ) ;
orterun_globals . env_val = NULL ;
2006-11-15 22:59:01 +00:00
if ( NULL ! = orterun_globals . appfile )
2006-10-23 03:34:08 +00:00
free ( orterun_globals . appfile ) ;
orterun_globals . appfile = NULL ;
2006-11-15 22:59:01 +00:00
if ( NULL ! = orterun_globals . wdir )
2006-10-23 03:34:08 +00:00
free ( orterun_globals . wdir ) ;
orterun_globals . wdir = NULL ;
if ( NULL ! = orterun_globals . path )
free ( orterun_globals . path ) ;
orterun_globals . path = NULL ;
2005-03-18 23:58:36 +00:00
2007-03-16 23:11:45 +00:00
orterun_globals . preload_binary = false ;
orterun_globals . preload_files = NULL ;
orterun_globals . preload_files_dest_dir = NULL ;
2005-03-18 23:58:36 +00:00
/* All done */
globals_init = true ;
2005-03-14 20:57:21 +00:00
return ORTE_SUCCESS ;
}
2007-06-27 01:03:31 +00:00
static int parse_globals ( int argc , char * argv [ ] , opal_cmd_line_t * cmd_line )
2005-03-14 20:57:21 +00:00
{
2006-06-09 17:21:23 +00:00
/* print version if requested. Do this before check for help so
that - - version - - help works as one might expect . */
2006-06-22 19:48:27 +00:00
if ( orterun_globals . version & &
! ( 1 = = argc | | orterun_globals . help ) ) {
2006-06-09 17:21:23 +00:00
char * project_name = NULL ;
if ( 0 = = strcmp ( orterun_basename , " mpirun " ) ) {
project_name = " Open MPI " ;
} else {
project_name = " OpenRTE " ;
}
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already made the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:version " , false ,
2006-06-22 19:48:27 +00:00
orterun_basename , project_name , OPAL_VERSION ,
PACKAGE_BUGREPORT ) ;
2006-06-09 17:21:23 +00:00
/* if we were the only argument, exit */
if ( 2 = = argc ) exit ( 0 ) ;
}
2005-07-28 21:17:48 +00:00
/* Check for help request */
2005-04-12 16:01:30 +00:00
if ( 1 = = argc | | orterun_globals . help ) {
2005-03-14 20:57:21 +00:00
char * args = NULL ;
2006-06-22 19:48:27 +00:00
char * project_name = NULL ;
if ( 0 = = strcmp ( orterun_basename , " mpirun " ) ) {
project_name = " Open MPI " ;
} else {
project_name = " OpenRTE " ;
}
2007-06-27 01:03:31 +00:00
args = opal_cmd_line_get_usage_msg ( cmd_line ) ;
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to make configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:usage " , false ,
2006-06-22 19:48:27 +00:00
orterun_basename , project_name , OPAL_VERSION ,
orterun_basename , args ,
PACKAGE_BUGREPORT ) ;
2005-03-14 20:57:21 +00:00
free ( args ) ;
2005-09-04 20:54:19 +00:00
2005-03-14 20:57:21 +00:00
/* If someone asks for help, that should be all we do */
exit ( 0 ) ;
}
Roll in the revamped IOF subsystem. Per the devel mailing list email, this is a complete rewrite of the iof framework designed to simplify the code for maintainability, and to support features we had planned to do, but were too difficult to implement in the old code. Specifically, the new code:
1. completely and cleanly separates responsibilities between the HNP, orted, and tool components.
2. removes all wireup messaging during launch and shutdown.
3. maintains flow control for stdin to avoid large-scale consumption of memory by orteds when large input files are forwarded. This is done using an xon/xoff protocol.
4. enables specification of stdin recipients on the mpirun cmd line. Allowed options include rank, "all", or "none". Default is rank 0.
5. creates a new MPI_Info key "ompi_stdin_target" that supports the above options for child jobs. Default is "none".
6. adds a new tool "orte-iof" that can connect to a running mpirun and display the output. Cmd line options allow selection of any combination of stdout, stderr, and stddiag. Default is stdout.
7. adds a new mpirun and orte-iof cmd line option "tag-output" that will tag each line of output with process name and stream ident. For example, "[1,0]<stdout>this is output"
This is not intended for the 1.3 release as it is a major change requiring considerable soak time.
This commit was SVN r19767.
2008-10-18 00:00:49 +00:00
/* check for request to report pid */
2008-12-24 15:27:46 +00:00
if ( NULL ! = orterun_globals . report_pid ) {
FILE * fp ;
if ( 0 = = strcmp ( orterun_globals . report_pid , " - " ) ) {
/* if '-', then output to stdout */
printf ( " %d \n " , ( int ) getpid ( ) ) ;
} else if ( 0 = = strcmp ( orterun_globals . report_pid , " + " ) ) {
/* if '+', output to stderr */
fprintf ( stderr , " %d \n " , ( int ) getpid ( ) ) ;
} else {
fp = fopen ( orterun_globals . report_pid , " w " ) ;
if ( NULL = = fp ) {
orte_show_help ( " help-orterun.txt " , " orterun:write_file " , false ,
orterun_basename , " pid " , orterun_globals . report_pid ) ;
exit ( 0 ) ;
}
fprintf ( fp , " %d \n " , ( int ) getpid ( ) ) ;
fclose ( fp ) ;
}
Roll in the revamped IOF subsystem. Per the devel mailing list email, this is a complete rewrite of the iof framework designed to simplify the code for maintainability, and to support features we had planned to do, but were too difficult to implement in the old code. Specifically, the new code:
1. completely and cleanly separates responsibilities between the HNP, orted, and tool components.
2. removes all wireup messaging during launch and shutdown.
3. maintains flow control for stdin to avoid large-scale consumption of memory by orteds when large input files are forwarded. This is done using an xon/xoff protocol.
4. enables specification of stdin recipients on the mpirun cmd line. Allowed options include rank, "all", or "none". Default is rank 0.
5. creates a new MPI_Info key "ompi_stdin_target" that supports the above options for child jobs. Default is "none".
6. adds a new tool "orte-iof" that can connect to a running mpirun and display the output. Cmd line options allow selection of any combination of stdout, stderr, and stddiag. Default is stdout.
7. adds a new mpirun and orte-iof cmd line option "tag-output" that will tag each line of output with process name and stream ident. For example, "[1,0]<stdout>this is output"
This is not intended for the 1.3 release as it is a major change requiring considerable soak time.
This commit was SVN r19767.
2008-10-18 00:00:49 +00:00
}
2005-11-20 16:06:53 +00:00
/* Do we want a user-level debugger? */
2005-10-05 10:24:34 +00:00
2005-11-20 16:06:53 +00:00
if ( orterun_globals . debugger ) {
2008-06-09 20:34:14 +00:00
orte_run_debugger ( orterun_basename , cmd_line , argc , argv , orterun_globals . num_procs ) ;
2005-11-20 16:06:53 +00:00
}
2005-10-05 10:24:34 +00:00
2009-08-11 02:51:27 +00:00
/* extract any rank assignment policy directives */
if ( orterun_globals . by_node ) {
ORTE_SET_MAPPING_POLICY ( ORTE_MAPPING_BYNODE ) ;
} else if ( orterun_globals . by_board ) {
ORTE_SET_MAPPING_POLICY ( ORTE_MAPPING_BYBOARD ) ;
} else if ( orterun_globals . by_socket ) {
ORTE_SET_MAPPING_POLICY ( ORTE_MAPPING_BYSOCKET ) ;
} else {
/* byslot is the default */
ORTE_SET_MAPPING_POLICY ( ORTE_MAPPING_BYSLOT ) ;
2005-09-27 02:54:15 +00:00
}
2009-08-11 02:51:27 +00:00
2009-09-09 05:28:45 +00:00
/* extract any binding policy directives */
2009-08-11 02:51:27 +00:00
if ( orterun_globals . bind_to_socket ) {
ORTE_SET_BINDING_POLICY ( ORTE_BIND_TO_SOCKET ) ;
} else if ( orterun_globals . bind_to_board ) {
ORTE_SET_BINDING_POLICY ( ORTE_BIND_TO_BOARD ) ;
2009-08-26 02:01:49 +00:00
} else if ( orterun_globals . bind_to_core ) {
2009-08-11 02:51:27 +00:00
ORTE_SET_BINDING_POLICY ( ORTE_BIND_TO_CORE ) ;
2009-09-18 19:48:42 +00:00
} else if ( orterun_globals . bind_to_none ) {
ORTE_SET_BINDING_POLICY ( ORTE_BIND_TO_NONE ) ;
(copied from a mail that has a lengthy description of this commit)
I spoke with Tim about this the other day -- he gave me the green
light to go ahead with this, but it turned into a bigger job than I
thought it would be. I revamped how the default RAS scheduling and
round_robin RMAPS mapping occurs. The previous algorithms were pretty
brain dead, and ignored the "slots" and "max_slots" tokens in
hostfiles. I considered this a big enough problem to fix it for the
beta (because there is currently no way to control where processes are
launched on SMPs).
There's still some more bells and whistles that I'd like to implement,
but there's no hurry, and they can go on the trunk at any time. My
patches below are for what I considered "essential", and do the
following:
- honor the "slots" and "max-slots" tokens in the hostfile (and all
their synonyms), meaning that we allocate/map until we fill slots,
and if there are still more processes to allocate/map, we keep going
until we fill max-slots (i.e., only oversubscribe a node if we have
to).
- offer two different algorithms, currently supported by two new
options to orterun. Remember that there are two parts here -- slot
allocation and process mapping. Slot allocation controls how many
processes we'll be running on a node. After that decision has been
made, process mapping effectively controls where the ranks of
MPI_COMM_WORLD (MCW) are placed. Some of the examples given below
don't make sense unless you remember that there is a difference
between the two (which makes total sense, but you have to think
about it in terms of both things):
1. "-bynode": allocates/maps one process per node in a round-robin
fashion until all slots on the node are taken. If we still have more
processes after all slots are taken, then keep going until all
max-slots are taken. Examples:
- The hostfile:
eddie slots=2 max-slots=4
vogon slots=4 max-slots=8
- orterun -bynode -np 6 -hostfile hostfile a.out
eddie: MCW ranks 0, 2
vogon: MCW ranks 1, 3, 4, 5
- orterun -bynode -np 8 -hostfile hostfile a.out
eddie: MCW ranks 0, 2, 4
vogon: MCW ranks 1, 3, 5, 6, 7
-> the algorithm oversubscribes all nodes "equally" (until each
node's max_slots is hit, of course)
- orterun -bynode -np 12 -hostfile hostfile a.out
eddie: MCW ranks 0, 2, 4, 6
vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11
2. "-byslot" (this is the default if you don't specify -bynode):
greedily takes all available slots on a node for a job before moving
on to the next node. If we still have processes to allocate/schedule,
then oversubscribe all nodes equally (i.e., go round robin on all
nodes until each node's max_slots is hit). Examples:
- The hostfile
eddie slots=2 max-slots=4
vogon slots=4 max-slots=8
- orterun -np 6 -hostfile hostfile a.out
eddie: MCW ranks 0, 1
vogon: MCW ranks 2, 3, 4, 5
- orterun -np 8 -hostfile hostfile a.out
eddie: MCW ranks 0, 1, 2
vogon: MCW ranks 3, 4, 5, 6, 7
-> the algorithm oversubscribes all nodes "equally" (until max_slots
is hit)
- orterun -np 12 -hostfile hostfile a.out
eddie: MCW ranks 0, 1, 2, 3
vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11
The above examples are fairly contrived, and it's not clear from them
that you can get different allocation answers in all cases (the
mapping differences are obvious). Consider the following allocation
example:
- The hostfile
eddie count=4
vogon count=4
earth count=4
deep-thought count=4
- orterun -np 8 -hostfile hostfile a.out
eddie: 4 slots will be allocated
vogon: 4 slots will be allocated
earth: no slots allocated
deep-thought: no slots allocated
- orterun -bynode -np 8 -hostfile hostfile a.out
eddie: 2 slots will be allocated
vogon: 2 slots will be allocated
earth: 2 slots will be allocated
deep-thought: 2 slots will be allocated
This commit was SVN r5894.
2005-05-31 16:36:53 +00:00
}
2009-09-05 15:24:26 +00:00
/* if nothing was specified, leave it as set
* by mca param
*/
2009-08-11 02:51:27 +00:00
2005-03-14 20:57:21 +00:00
return ORTE_SUCCESS ;
}
static int parse_locals ( int argc , char * argv [ ] )
{
int i , rc , app_num ;
int temp_argc ;
2005-08-08 16:42:28 +00:00
char * * temp_argv , * * env ;
2005-03-14 20:57:21 +00:00
orte_app_context_t * app ;
bool made_app ;
2006-08-15 19:54:10 +00:00
orte_std_cntr_t j , size1 ;
2005-03-14 20:57:21 +00:00
2008-02-28 01:57:57 +00:00
/* if the ompi-server was given, then set it up here */
if ( NULL ! = orterun_globals . ompi_server ) {
/* someone could have passed us a file instead of a uri, so
* we need to first check to see what we have - if it starts
* with " file " , then we know it is a file . Otherwise , we assume
* it is a uri as provided by the ompi - server ' s output
* of an ORTE - standard string . Note that this is NOT a standard
* uri as it starts with the process name !
*/
Repair the MPI-2 dynamic operations. This includes:
1. repair of the linear and direct routed modules
2. repair of the ompi/pubsub/orte module to correctly init routes to the ompi-server, and correctly handle failure to correctly parse the provided ompi-server URI
3. modification of orterun to accept both "file" and "FILE" for designating where the ompi-server URI is to be found - purely a convenience feature
4. resolution of a message ordering problem during the connect/accept handshake that allowed the "send-first" proc to attempt to send to the "recv-first" proc before the HNP had actually updated its routes.
Let this be a further reminder to all - message ordering is NOT guaranteed in the OOB
5. Repair the ompi/dpm/orte module to correctly init routes during connect/accept.
Reminder to all: messages sent to procs in another job family (i.e., started by a different mpirun) are ALWAYS routed through the respective HNPs. As per the comments in orte/routed, this is REQUIRED to maintain connect/accept (where only the root proc on each side is capable of init'ing the routes), allow communication between mpirun's using different routing modules, and to minimize connections on tools such as ompi-server. It is all taken care of "under the covers" by the OOB to ensure that a route back to the sender is maintained, even when the different mpirun's are using different routed modules.
6. corrections in the orte/odls to ensure proper identification of daemons participating in a dynamic launch
7. corrections in build/nidmap to support update of an existing nidmap during dynamic launch
8. corrected implementation of the update_arch function in the ESS, along with consolidation of a number of ESS operations into base functions for easier maintenance. The ability to support info from multiple jobs was added, although we don't currently do so - this will come later to support further fault recovery strategies
9. minor updates to several functions to remove unnecessary and/or no longer used variables and envar's, add some debugging output, etc.
10. addition of a new macro ORTE_PROC_IS_DAEMON that resolves to true if the provided proc is a daemon
There is still more cleanup to be done for efficiency, but this at least works.
Tested on single-node Mac, multi-node SLURM via odin. Tests included connect/accept, publish/lookup/unpublish, comm_spawn, comm_spawn_multiple, and singleton comm_spawn.
Fixes ticket #1256
This commit was SVN r18804.
2008-07-03 17:53:37 +00:00
if ( 0 = = strncmp ( orterun_globals . ompi_server , " file " , strlen ( " file " ) ) | |
0 = = strncmp ( orterun_globals . ompi_server , " FILE " , strlen ( " FILE " ) ) ) {
2008-02-28 01:57:57 +00:00
char input [ 1024 ] , * filename ;
FILE * fp ;
/* it is a file - get the filename */
filename = strchr ( orterun_globals . ompi_server , ' : ' ) ;
if ( NULL = = filename ) {
/* filename is not correctly formatted */
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to make configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:ompi-server-filename-bad " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , orterun_globals . ompi_server ) ;
exit ( 1 ) ;
}
+ + filename ; /* space past the : */
if ( 0 > = strlen ( filename ) ) {
/* they forgot to give us the name! */
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to make configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:ompi-server-filename-missing " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , orterun_globals . ompi_server ) ;
exit ( 1 ) ;
}
/* open the file and extract the uri */
fp = fopen ( filename , " r " ) ;
if ( NULL = = fp ) { /* can't find or read file! */
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:ompi-server-filename-access " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , orterun_globals . ompi_server ) ;
exit ( 1 ) ;
}
if ( NULL = = fgets ( input , 1024 , fp ) ) {
/* something malformed about file */
fclose ( fp ) ;
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already made the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:ompi-server-file-bad " , true ,
2008-02-28 01:57:57 +00:00
orterun_basename , orterun_globals . ompi_server ,
orterun_basename ) ;
exit ( 1 ) ;
}
fclose ( fp ) ;
input [ strlen ( input ) - 1 ] = ' \0 ' ; /* remove newline */
ompi_server = strdup ( input ) ;
2008-12-10 17:10:39 +00:00
} else if ( 0 = = strncmp ( orterun_globals . ompi_server , " pid " , strlen ( " pid " ) ) | |
0 = = strncmp ( orterun_globals . ompi_server , " PID " , strlen ( " PID " ) ) ) {
opal_list_t hnp_list ;
opal_list_item_t * item ;
orte_hnp_contact_t * hnp ;
char * ptr ;
pid_t pid ;
ptr = strchr ( orterun_globals . ompi_server , ' : ' ) ;
if ( NULL = = ptr ) {
/* pid is not correctly formatted */
orte_show_help ( " help-orterun.txt " , " orterun:ompi-server-pid-bad " , true ,
orterun_basename , orterun_basename ,
orterun_globals . ompi_server , orterun_basename ) ;
exit ( 1 ) ;
}
+ + ptr ; /* space past the : */
if ( 0 > = strlen ( ptr ) ) {
/* they forgot to give us the pid! */
orte_show_help ( " help-orterun.txt " , " orterun:ompi-server-pid-bad " , true ,
orterun_basename , orterun_basename ,
orterun_globals . ompi_server , orterun_basename ) ;
exit ( 1 ) ;
}
pid = strtoul ( ptr , NULL , 10 ) ;
/* to search the local mpirun's, we have to partially initialize the
2009-03-05 21:56:03 +00:00
* orte_process_info structure . This won ' t fully be setup until orte_init ,
2008-12-10 17:10:39 +00:00
* but we finagle a little bit of it here
*/
2009-03-05 21:56:03 +00:00
if ( ORTE_SUCCESS ! = ( rc = orte_session_dir_get_name ( NULL , & orte_process_info . tmpdir_base ,
& orte_process_info . top_session_dir ,
2008-12-10 17:10:39 +00:00
NULL , NULL , NULL ) ) ) {
orte_show_help ( " help-orterun.txt " , " orterun:ompi-server-could-not-get-hnp-list " , true ,
orterun_basename , orterun_basename ) ;
exit ( 1 ) ;
}
OBJ_CONSTRUCT ( & hnp_list , opal_list_t ) ;
/* get the list of HNPs, but do -not- setup contact info to them in the RML */
if ( ORTE_SUCCESS ! = ( rc = orte_list_local_hnps ( & hnp_list , false ) ) ) {
orte_show_help ( " help-orterun.txt " , " orterun:ompi-server-could-not-get-hnp-list " , true ,
orterun_basename , orterun_basename ) ;
exit ( 1 ) ;
}
/* search the list for the desired pid */
while ( NULL ! = ( item = opal_list_remove_first ( & hnp_list ) ) ) {
hnp = ( orte_hnp_contact_t * ) item ;
if ( pid = = hnp - > pid ) {
ompi_server = strdup ( hnp - > rml_uri ) ;
goto hnp_found ;
}
OBJ_RELEASE ( item ) ;
}
/* if we got here, it wasn't found */
orte_show_help ( " help-orterun.txt " , " orterun:ompi-server-pid-not-found " , true ,
orterun_basename , orterun_basename , pid , orterun_globals . ompi_server ,
orterun_basename ) ;
OBJ_DESTRUCT ( & hnp_list ) ;
exit ( 1 ) ;
hnp_found :
/* cleanup rest of list */
while ( NULL ! = ( item = opal_list_remove_first ( & hnp_list ) ) ) {
OBJ_RELEASE ( item ) ;
}
OBJ_DESTRUCT ( & hnp_list ) ;
2008-02-28 01:57:57 +00:00
} else {
ompi_server = strdup ( orterun_globals . ompi_server ) ;
}
}
2005-03-14 20:57:21 +00:00
/* Make the apps */
temp_argc = 0 ;
temp_argv = NULL ;
2005-07-04 00:13:44 +00:00
opal_argv_append ( & temp_argc , & temp_argv , argv [ 0 ] ) ;
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and an app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
2005-08-08 16:42:28 +00:00
/* NOTE: This bogus env variable is necessary in the calls to
create_app ( ) , below . See comment immediately before the
create_app ( ) function for an explanation . */
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and an app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
env = NULL ;
2005-03-14 20:57:21 +00:00
for ( app_num = 0 , i = 1 ; i < argc ; + + i ) {
if ( 0 = = strcmp ( argv [ i ] , " : " ) ) {
/* Make an app with this argv */
2005-07-04 00:13:44 +00:00
if ( opal_argv_count ( temp_argv ) > 1 ) {
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and an app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
if ( NULL ! = env ) {
2005-07-04 00:13:44 +00:00
opal_argv_free ( env ) ;
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and an app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
env = NULL ;
}
2006-03-24 15:28:42 +00:00
app = NULL ;
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and an app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
rc = create_app ( temp_argc , temp_argv , & app , & made_app , & env ) ;
2006-03-23 16:53:11 +00:00
/** keep track of the number of apps - point this app_context to that index */
2005-03-14 20:57:21 +00:00
if ( ORTE_SUCCESS ! = rc ) {
/* Assume that the error message has already been
printed ; no need to cleanup - - we can just
exit */
exit ( 1 ) ;
}
if ( made_app ) {
2006-03-24 15:28:42 +00:00
app - > idx = app_num ;
+ + app_num ;
2008-02-28 05:32:23 +00:00
opal_pointer_array_add ( jdata - > apps , app ) ;
2008-02-28 01:57:57 +00:00
+ + jdata - > num_apps ;
2005-03-14 20:57:21 +00:00
}
2005-09-04 20:54:19 +00:00
2005-03-14 20:57:21 +00:00
/* Reset the temps */
2005-09-04 20:54:19 +00:00
2005-03-14 20:57:21 +00:00
temp_argc = 0 ;
temp_argv = NULL ;
2005-07-04 00:13:44 +00:00
opal_argv_append ( & temp_argc , & temp_argv , argv [ 0 ] ) ;
2005-03-14 20:57:21 +00:00
}
} else {
2005-07-04 00:13:44 +00:00
opal_argv_append ( & temp_argc , & temp_argv , argv [ i ] ) ;
2005-03-14 20:57:21 +00:00
}
}
2005-07-04 00:13:44 +00:00
if ( opal_argv_count ( temp_argv ) > 1 ) {
2006-03-24 15:28:42 +00:00
app = NULL ;
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and an app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
rc = create_app ( temp_argc , temp_argv , & app , & made_app , & env ) ;
2005-03-14 20:57:21 +00:00
if ( ORTE_SUCCESS ! = rc ) {
/* Assume that the error message has already been printed;
no need to cleanup - - we can just exit */
exit ( 1 ) ;
}
if ( made_app ) {
2006-03-24 15:28:42 +00:00
app - > idx = app_num ;
+ + app_num ;
2008-02-28 05:32:23 +00:00
opal_pointer_array_add ( jdata - > apps , app ) ;
2008-02-28 01:57:57 +00:00
+ + jdata - > num_apps ;
2005-03-14 20:57:21 +00:00
}
}
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and an app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
if ( NULL ! = env ) {
2005-07-04 00:13:44 +00:00
opal_argv_free ( env ) ;
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and an app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
}
2005-07-04 00:13:44 +00:00
opal_argv_free ( temp_argv ) ;
2005-03-14 20:57:21 +00:00
2005-08-08 16:42:28 +00:00
/* Once we've created all the apps, add the global MCA params to
each app ' s environment ( checking for duplicates , of
course - - yay opal_environ_merge ( ) ) . */
if ( NULL ! = global_mca_env ) {
2008-02-28 05:32:23 +00:00
size1 = ( size_t ) opal_pointer_array_get_size ( jdata - > apps ) ;
2005-08-08 16:42:28 +00:00
/* Iterate through all the apps */
for ( j = 0 ; j < size1 ; + + j ) {
2005-09-04 20:54:19 +00:00
app = ( orte_app_context_t * )
2008-02-28 05:32:23 +00:00
opal_pointer_array_get_item ( jdata - > apps , j ) ;
2005-08-08 16:42:28 +00:00
if ( NULL ! = app ) {
/* Use handy utility function */
env = opal_environ_merge ( global_mca_env , app - > env ) ;
opal_argv_free ( app - > env ) ;
app - > env = env ;
}
}
}
/* Now take a subset of the MCA params and set them as MCA
overrides here in orterun ( so that when we orte_init ( ) later ,
all the components see these MCA params ) . Here ' s how we decide
which subset of the MCA params we set here in orterun :
1. If any global MCA params were set , use those
2. If no global MCA params were set and there was only one app ,
then use its app MCA params
3. Otherwise , don ' t set any
*/
env = NULL ;
if ( NULL ! = global_mca_env ) {
env = global_mca_env ;
} else {
2008-02-28 05:32:23 +00:00
if ( opal_pointer_array_get_size ( jdata - > apps ) > = 1 ) {
2005-08-08 16:42:28 +00:00
/* Remember that pointer_array's can be padded with NULL
entries ; so only use the app ' s env if there is exactly
1 non - NULL entry */
2005-09-04 20:54:19 +00:00
app = ( orte_app_context_t * )
2008-02-28 05:32:23 +00:00
opal_pointer_array_get_item ( jdata - > apps , 0 ) ;
2005-08-08 16:42:28 +00:00
if ( NULL ! = app ) {
env = app - > env ;
2008-02-28 05:32:23 +00:00
for ( j = 1 ; j < opal_pointer_array_get_size ( jdata - > apps ) ; + + j ) {
if ( NULL ! = opal_pointer_array_get_item ( jdata - > apps , j ) ) {
2005-08-08 16:42:28 +00:00
env = NULL ;
break ;
}
}
}
}
}
2005-09-04 20:54:19 +00:00
2005-08-08 16:42:28 +00:00
if ( NULL ! = env ) {
size1 = opal_argv_count ( env ) ;
for ( j = 0 ; j < size1 ; + + j ) {
putenv ( env [ j ] ) ;
}
}
2005-03-14 20:57:21 +00:00
/* All done */
return ORTE_SUCCESS ;
}
2008-07-08 22:36:39 +00:00
/*
 * Scan the first (argc - start) entries of argv for "-mca" / "--mca"
 * directives and record each "<flag> <param> <value>" triple in the
 * orted_cmd_line array (declared outside this function).
 *
 * A directive whose parameter name already appears in orted_cmd_line is
 * not appended again.  If the recorded value differs and the parameter is
 * one of the names in no_dups, a help message is shown and the function
 * fails, since there is no way to tell which value is the intended one.
 *
 * NOTE(review): "start" is only ever used as (argc - start), i.e. it
 * shortens the scanned range from the end rather than offsetting the
 * beginning -- confirm against the callers.
 *
 * @param argc   number of entries in argv
 * @param start  number of trailing argv entries excluded from the scan
 * @param argv   argument vector to scan
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM when a conflicting value is
 *         given for a parameter listed in no_dups.
 */
static int capture_cmd_line_params(int argc, int start, char **argv)
{
    int i, j, k;
    bool ignore;
    char *no_dups[] = {
        "grpcomm",
        "odls",
        "rml",
        "routed",
        NULL
    };

    for (i = 0; i < (argc - start); ++i) {
        if (0 == strcmp("-mca", argv[i]) ||
            0 == strcmp("--mca", argv[i])) {
            /* A directive needs both a parameter name and a value after
             * the flag.  If either is missing, stop here instead of
             * handing NULL / out-of-range pointers to strcmp() and
             * opal_argv_append_nosize(); nothing is recorded for the
             * truncated directive.
             */
            if (i + 2 >= argc ||
                NULL == argv[i + 1] || NULL == argv[i + 2]) {
                break;
            }

            /* It would be nice to avoid increasing the length
             * of the orted cmd line by removing any non-ORTE
             * params. However, this raises a problem since
             * there could be OPAL directives that we really
             * -do- want the orted to see - it's only the OMPI
             * related directives we could ignore. This becomes
             * a very complicated procedure, however, since
             * the OMPI mca params are not cleanly separated - so
             * filtering them out is nearly impossible.
             *
             * see if this is already present so we at least can
             * avoid growing the cmd line with duplicates
             */
            ignore = false;
            if (NULL != orted_cmd_line) {
                for (j = 0; NULL != orted_cmd_line[j]; j++) {
                    if (0 == strcmp(argv[i + 1], orted_cmd_line[j])) {
                        /* The match is on the final entry, so there is
                         * no recorded value after it to compare against;
                         * treat the directive as new.
                         */
                        if (NULL == orted_cmd_line[j + 1]) {
                            break;
                        }
                        /* already here - if the value is the same,
                         * we can quietly ignore the fact that they
                         * provide it more than once. However, some
                         * frameworks are known to have problems if the
                         * value is different. We don't have a good way
                         * to know this, but we at least make a crude
                         * attempt here to protect ourselves.
                         */
                        if (0 == strcmp(argv[i + 2], orted_cmd_line[j + 1])) {
                            /* values are the same */
                            ignore = true;
                            break;
                        } else {
                            /* values are different - see if this is a problem */
                            for (k = 0; NULL != no_dups[k]; k++) {
                                if (0 == strcmp(no_dups[k], argv[i + 1])) {
                                    /* print help message
                                     * and abort as we cannot know which one is correct
                                     */
                                    orte_show_help("help-orterun.txt", "orterun:conflicting-params",
                                                   true, orterun_basename, argv[i + 1],
                                                   argv[i + 2], orted_cmd_line[j + 1]);
                                    return ORTE_ERR_BAD_PARAM;
                                }
                            }
                            /* this passed muster - just ignore it */
                            ignore = true;
                            break;
                        }
                    }
                }
            }
            if (!ignore) {
                opal_argv_append_nosize(&orted_cmd_line, argv[i]);
                opal_argv_append_nosize(&orted_cmd_line, argv[i + 1]);
                opal_argv_append_nosize(&orted_cmd_line, argv[i + 2]);
            }
            /* skip over the parameter name and value just consumed */
            i += 2;
        }
    }

    return ORTE_SUCCESS;
}
2005-08-08 16:42:28 +00:00
/*
* This function takes a " char ***app_env " parameter to handle the
* specific case :
*
* orterun - - mca foo bar - app appfile
*
* That is , we ' ll need to keep foo = bar , but the presence of the app
* file will cause an invocation of parse_appfile ( ) , which will cause
* one or more recursive calls back to create_app ( ) . Since the
* foo = bar value applies globally to all apps in the appfile , we need
* to pass in the " base " environment ( that contains the foo = bar value )
* when we parse each line in the appfile .
*
* This is really just a special case - - when we have a simple case like :
*
* orterun - - mca foo bar - np 4 hostname
*
* Then the upper - level function ( parse_locals ( ) ) calls create_app ( )
* with a NULL value for app_env , meaning that there is no " base "
* environment that the app needs to be created from .
*/
2005-03-14 20:57:21 +00:00
static int create_app ( int argc , char * argv [ ] , orte_app_context_t * * app_ptr ,
2005-08-08 16:42:28 +00:00
bool * made_app , char * * * app_env )
2005-03-14 20:57:21 +00:00
{
2005-07-04 00:13:44 +00:00
opal_cmd_line_t cmd_line ;
2009-05-06 20:11:28 +00:00
char cwd [ OPAL_PATH_MAX ] ;
2006-02-07 03:32:36 +00:00
int i , j , count , rc ;
2005-03-14 20:57:21 +00:00
char * param , * value , * value2 ;
orte_app_context_t * app = NULL ;
2008-03-05 22:12:27 +00:00
bool cmd_line_made = false ;
2005-03-14 20:57:21 +00:00
* made_app = false ;
2008-03-05 22:12:27 +00:00
/* Pre-process the command line if we are going to parse an appfile later.
* save any mca command line args so they can be passed
* separately to the daemons .
* Use Case :
* $ cat launch . appfile
* - np 1 - mca aaa bbb . / my - app - mca ccc ddd
* - np 1 - mca aaa bbb . / my - app - mca eee fff
* $ mpirun - np 2 - mca foo bar - - app launch . appfile
* Only pick up ' - mca foo bar ' on this pass .
2005-11-03 18:15:47 +00:00
*/
2008-03-05 22:12:27 +00:00
if ( NULL ! = orterun_globals . appfile ) {
2008-07-08 22:36:39 +00:00
if ( ORTE_SUCCESS ! = ( rc = capture_cmd_line_params ( argc , 0 , argv ) ) ) {
goto cleanup ;
These changes were mostly captured in a prior RFC (except for #2 below) and are aimed specifically at improving startup performance and setting up the remaining modifications described in that RFC.
The commit has been tested for C/R and Cray operations, and on Odin (SLURM, rsh) and RoadRunner (TM). I tried to update all environments, but obviously could not test them. I know that Windows needs some work, and have highlighted what is known to be needed in the odls process component.
This represents a lot of work by Brian, Tim P, Josh, and myself, with much advice from Jeff and others. For posterity, I have appended a copy of the email describing the work that was done:
As we have repeatedly noted, the modex operation in MPI_Init is the single greatest consumer of time during startup. To-date, we have executed that operation as an ORTE stage gate that held the process until a startup message containing all required modex (and OOB contact info - see #3 below) info could be sent to it. Each process would send its data to the HNP's registry, which assembled and sent the message when all processes had reported in.
In addition, ORTE had taken responsibility for monitoring process status as it progressed through a series of "stage gates". The process reported its status at each gate, and ORTE would then send a "release" message once all procs had reported in.
The incoming changes revamp these procedures in three ways:
1. eliminating the ORTE stage gate system and cleanly delineating responsibility between the OMPI and ORTE layers for MPI init/finalize. The modex stage gate (STG1) has been replaced by a collective operation in the modex itself that performs an allgather on the required modex info. The allgather is implemented using the orte_grpcomm framework since the BTL's are not active at that point. At the moment, the grpcomm framework only has a "basic" component analogous to OMPI's "basic" coll framework - I would recommend that the MPI team create additional, more advanced components to improve performance of this step.
The other stage gates have been replaced by orte_grpcomm barrier functions. We tried to use MPI barriers instead (since the BTL's are active at that point), but - as we discussed on the telecon - these are not currently true barriers so the job would hang when we fell through while messages were still in process. Note that the grpcomm barrier doesn't actually resolve that problem, but Brian has pointed out that we are unlikely to ever see it violated. Again, you might want to spend a little time on an advanced barrier algorithm as the one in "basic" is very simplistic.
Summarizing this change: ORTE no longer tracks process state nor has direct responsibility for synchronizing jobs. This is now done via collective operations within the MPI layer, albeit using ORTE collective communication services. I -strongly- urge the MPI team to implement advanced collective algorithms to improve the performance of this critical procedure.
2. reducing the volume of data exchanged during modex. Data in the modex consisted of the process name, the name of the node where that process is located (expressed as a string), plus a string representation of all contact info. The nodename was required in order for the modex to determine if the process was local or not - in addition, some people like to have it to print pretty error messages when a connection failed.
The size of this data has been reduced in three ways:
(a) reducing the size of the process name itself. The process name consisted of two 32-bit fields for the jobid and vpid. This is far larger than any current system, or system likely to exist in the near future, can support. Accordingly, the default size of these fields has been reduced to 16-bits, which means you can have 32k procs in each of 32k jobs. Since the daemons must have a vpid, and we require one daemon/node, this also restricts the default configuration to 32k nodes.
To support any future "mega-clusters", a configuration option --enable-jumbo-apps has been added. This option increases the jobid and vpid field sizes to 32-bits. Someday, if necessary, someone can add yet another option to increase them to 64-bits, I suppose.
(b) replacing the string nodename with an integer nodeid. Since we have one daemon/node, the nodeid corresponds to the local daemon's vpid. This replaces an often lengthy string with only 2 (or at most 4) bytes, a substantial reduction.
(c) when the mca param requesting that nodenames be sent to support pretty error messages is set, a second mca param is now used to request FQDN - otherwise, the domain name is stripped (by default) from the message to save space. If someone wants to combine those into a single param somehow (perhaps with an argument?), they are welcome to do so - I didn't want to alter what people are already using.
While these may seem like small savings, they actually amount to a significant impact when aggregated across the entire modex operation. Since every proc must receive the modex data regardless of the collective used to send it, just reducing the size of the process name removes nearly 400MBytes of communication from a 32k proc job (admittedly, much of this comm may occur in parallel). So it does add up pretty quickly.
3. routing RML messages to reduce connections. The default messaging system remains point-to-point - i.e., each proc opens a socket to every proc it communicates with and sends its messages directly. A new option uses the orteds as routers - i.e., each proc only opens a single socket to its local orted. All messages are sent from the proc to the orted, which forwards the message to the orted on the node where the intended recipient proc is located - that orted then forwards the message to its local proc (the recipient). This greatly reduces the connection storm we have encountered during startup.
It also has the benefit of removing the sharing of every proc's OOB contact with every other proc. The orted routing tables are populated during launch since every orted gets a map of where every proc is being placed. Each proc, therefore, only needs to know the contact info for its local daemon, which is passed in via the environment when the proc is fork/exec'd by the daemon. This alone removes ~50 bytes/process of communication that was in the current STG1 startup message - so for our 32k proc job, this saves us roughly 32k*50 = 1.6MBytes sent to 32k procs = 51GBytes of messaging.
Note that you can use the new routing method by specifying -mca routed tree - if you so desire. This mode will become the default at some point in the future.
There are a few minor additional changes in the commit that I'll just note in passing:
* propagation of command line mca params to the orteds - fixes ticket #1073. See note there for details.
* requiring of "finalize" prior to "exit" for MPI procs - fixes ticket #1144. See note there for details.
* cleanup of some stale header files
This commit was SVN r16364.
2007-10-05 19:48:23 +00:00
}
2005-03-14 20:57:21 +00:00
}
2008-07-08 22:36:39 +00:00
2008-03-05 22:12:27 +00:00
/* Parse application command line options. */
2005-03-14 20:57:21 +00:00
init_globals ( ) ;
2005-07-04 00:13:44 +00:00
opal_cmd_line_create ( & cmd_line , cmd_line_init ) ;
2005-03-18 03:43:59 +00:00
mca_base_cmd_line_setup ( & cmd_line ) ;
2005-03-14 20:57:21 +00:00
cmd_line_made = true ;
2008-03-05 22:12:27 +00:00
rc = opal_cmd_line_parse ( & cmd_line , true , argc , argv ) ;
2006-02-12 01:33:29 +00:00
if ( ORTE_SUCCESS ! = rc ) {
2005-03-14 20:57:21 +00:00
goto cleanup ;
}
2005-08-08 16:42:28 +00:00
mca_base_cmd_line_process_args ( & cmd_line , app_env , & global_mca_env ) ;
2005-03-14 20:57:21 +00:00
/* Is there an appfile in here? */
if ( NULL ! = orterun_globals . appfile ) {
OBJ_DESTRUCT ( & cmd_line ) ;
2005-08-08 16:42:28 +00:00
return parse_appfile ( strdup ( orterun_globals . appfile ) , app_env ) ;
2005-03-14 20:57:21 +00:00
}
/* Setup application context */
app = OBJ_NEW ( orte_app_context_t ) ;
2006-02-07 03:32:36 +00:00
opal_cmd_line_get_tail ( & cmd_line , & count , & app - > argv ) ;
2005-03-14 20:57:21 +00:00
/* See if we have anything left */
2006-02-07 03:32:36 +00:00
if ( 0 = = count ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:executable-not-specified " ,
2005-04-12 16:01:30 +00:00
true , orterun_basename , orterun_basename ) ;
2005-03-14 20:57:21 +00:00
rc = ORTE_ERR_NOT_FOUND ;
goto cleanup ;
}
2007-10-11 18:33:40 +00:00
/*
2008-03-05 22:12:27 +00:00
* Get mca parameters so we can pass them to the daemons .
* Use the count determined above to make sure we do not go past
* the executable name . Example :
2007-10-11 18:33:40 +00:00
* mpirun - np 2 - mca foo bar . / my - app - mca bip bop
* We want to pick up ' - mca foo bar ' but not ' - mca bip bop '
*/
2008-07-08 22:36:39 +00:00
if ( ORTE_SUCCESS ! = ( rc = capture_cmd_line_params ( argc , count , argv ) ) ) {
goto cleanup ;
2007-10-11 18:33:40 +00:00
}
2008-07-08 22:36:39 +00:00
2005-04-09 01:26:17 +00:00
/* Grab all OMPI_* environment variables */
2005-03-14 20:57:21 +00:00
2005-08-08 16:42:28 +00:00
app - > env = opal_argv_copy ( * app_env ) ;
2005-03-14 20:57:21 +00:00
for ( i = 0 ; NULL ! = environ [ i ] ; + + i ) {
2005-04-06 01:58:30 +00:00
if ( 0 = = strncmp ( " OMPI_ " , environ [ i ] , 5 ) ) {
2008-07-08 13:48:47 +00:00
/* check for duplicate in app->env - this
* would have been placed there by the
* cmd line processor . By convention , we
* always let the cmd line override the
* environment
*/
param = strdup ( environ [ i ] ) ;
value = strchr ( param , ' = ' ) ;
* value = ' \0 ' ;
value + + ;
opal_setenv ( param , value , false , & app - > env ) ;
free ( param ) ;
2005-03-14 20:57:21 +00:00
}
}
2008-12-09 23:49:02 +00:00
/* if profile was set, add it back in */
if ( profile_is_set ) {
opal_setenv ( " OMPI_MCA_opal_profile " , " 1 " , true , & app - > env ) ;
}
2008-02-28 01:57:57 +00:00
/* add the ompi-server, if provided */
if ( NULL ! = ompi_server ) {
2008-12-09 23:49:02 +00:00
opal_setenv ( " OMPI_MCA_pubsub_orte_server " , ompi_server , true , & app - > env ) ;
2008-02-28 01:57:57 +00:00
}
2005-03-14 20:57:21 +00:00
/* Did the user request to export any environment variables? */
2005-07-04 00:13:44 +00:00
if ( opal_cmd_line_is_taken ( & cmd_line , " x " ) ) {
j = opal_cmd_line_get_ninsts ( & cmd_line , " x " ) ;
2005-03-14 20:57:21 +00:00
for ( i = 0 ; i < j ; + + i ) {
2005-07-04 00:13:44 +00:00
param = opal_cmd_line_get_param ( & cmd_line , " x " , i , 0 ) ;
2005-03-14 20:57:21 +00:00
if ( NULL ! = strchr ( param , ' = ' ) ) {
2005-07-04 00:13:44 +00:00
opal_argv_append_nosize ( & app - > env , param ) ;
2005-03-14 20:57:21 +00:00
} else {
value = getenv ( param ) ;
if ( NULL ! = value ) {
if ( NULL ! = strchr ( value , ' = ' ) ) {
2005-07-04 00:13:44 +00:00
opal_argv_append_nosize ( & app - > env , value ) ;
2005-03-14 20:57:21 +00:00
} else {
asprintf ( & value2 , " %s=%s " , param , value ) ;
2005-07-04 00:13:44 +00:00
opal_argv_append_nosize ( & app - > env , value2 ) ;
2005-05-12 21:44:23 +00:00
free ( value2 ) ;
2005-03-14 20:57:21 +00:00
}
} else {
2008-06-09 14:53:58 +00:00
opal_output ( 0 , " Warning: could not find environment variable \" %s \" \n " , param ) ;
2005-03-14 20:57:21 +00:00
}
}
}
}
2008-03-05 21:07:43 +00:00
/* If the user specified --path, store it in the user's app
environment via the OMPI_exec_path variable . */
2005-03-14 20:57:21 +00:00
if ( NULL ! = orterun_globals . path ) {
2008-03-05 21:07:43 +00:00
asprintf ( & value , " OMPI_exec_path=%s " , orterun_globals . path ) ;
2005-07-04 00:13:44 +00:00
opal_argv_append_nosize ( & app - > env , value ) ;
2005-03-14 20:57:21 +00:00
free ( value ) ;
}
/* Did the user request a specific wdir? */
if ( NULL ! = orterun_globals . wdir ) {
2009-01-25 12:39:24 +00:00
/* if this is a relative path, convert it to an absolute path */
if ( opal_path_is_absolute ( orterun_globals . wdir ) ) {
app - > cwd = strdup ( orterun_globals . wdir ) ;
} else {
/* get the cwd */
if ( OPAL_SUCCESS ! = ( rc = opal_getcwd ( cwd , sizeof ( cwd ) ) ) ) {
orte_show_help ( " help-orterun.txt " , " orterun:init-failure " ,
true , " get the cwd " , rc ) ;
goto cleanup ;
}
/* construct the absolute path */
app - > cwd = opal_os_path ( false , cwd , orterun_globals . wdir , NULL ) ;
}
2006-02-16 20:40:23 +00:00
app - > user_specified_cwd = true ;
2005-03-14 20:57:21 +00:00
} else {
2008-02-28 01:57:57 +00:00
if ( OPAL_SUCCESS ! = ( rc = opal_getcwd ( cwd , sizeof ( cwd ) ) ) ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:init-failure " ,
2008-02-28 01:57:57 +00:00
true , " get the cwd " , rc ) ;
goto cleanup ;
}
2005-03-14 20:57:21 +00:00
app - > cwd = strdup ( cwd ) ;
2006-02-16 20:40:23 +00:00
app - > user_specified_cwd = false ;
2005-03-14 20:57:21 +00:00
}
2006-09-15 02:52:08 +00:00
/* Check to see if the user explicitly wanted to disable automatic
- - prefix behavior */
if ( opal_cmd_line_is_taken ( & cmd_line , " noprefix " ) ) {
want_prefix_by_default = false ;
}
2006-02-28 11:52:12 +00:00
/* Did the user specify a specific prefix for this app_context_t
or provide an absolute path name to argv [ 0 ] ? */
if ( opal_cmd_line_is_taken ( & cmd_line , " prefix " ) | |
2006-09-15 02:52:08 +00:00
' / ' = = argv [ 0 ] [ 0 ] | | want_prefix_by_default ) {
2005-09-06 16:10:05 +00:00
size_t param_len ;
2006-02-28 14:44:40 +00:00
/* The --prefix option takes precedence over /path/to/orterun */
if ( opal_cmd_line_is_taken ( & cmd_line , " prefix " ) ) {
param = opal_cmd_line_get_param ( & cmd_line , " prefix " , 0 , 0 ) ;
2006-09-15 02:52:08 +00:00
}
/* /path/to/orterun */
else if ( ' / ' = = argv [ 0 ] [ 0 ] ) {
2006-08-23 02:35:00 +00:00
char * tmp_basename = NULL ;
2006-02-28 14:44:40 +00:00
/* If they specified an absolute path, strip off the
/ bin / < exec_name > " and leave just the prefix */
2006-08-23 02:35:00 +00:00
param = opal_dirname ( argv [ 0 ] ) ;
2006-02-28 11:52:12 +00:00
/* Quick sanity check to ensure we got
something / bin / < exec_name > and that the installation
tree is at least more or less what we expect it to
be */
2006-08-23 02:35:00 +00:00
tmp_basename = opal_basename ( param ) ;
if ( 0 = = strcmp ( " bin " , tmp_basename ) ) {
char * tmp = param ;
param = opal_dirname ( tmp ) ;
free ( tmp ) ;
2006-02-28 11:52:12 +00:00
} else {
free ( param ) ;
param = NULL ;
2005-09-06 16:10:05 +00:00
}
2006-08-23 02:35:00 +00:00
free ( tmp_basename ) ;
2005-09-06 16:10:05 +00:00
}
2006-09-15 02:52:08 +00:00
/* --enable-orterun-prefix-default was given to orterun */
else {
2008-06-11 14:42:47 +00:00
param = opal_install_dirs . prefix ;
2006-09-15 02:52:08 +00:00
}
2005-09-06 16:10:05 +00:00
2006-02-28 11:52:12 +00:00
if ( NULL ! = param ) {
2006-08-24 16:18:42 +00:00
/* "Parse" the param, aka remove superfluous path_sep. */
2006-02-28 11:52:12 +00:00
param_len = strlen ( param ) ;
2006-08-21 21:55:41 +00:00
while ( 0 = = strcmp ( OPAL_PATH_SEP , & ( param [ param_len - 1 ] ) ) ) {
2006-02-28 11:52:12 +00:00
param [ param_len - 1 ] = ' \0 ' ;
param_len - - ;
if ( 0 = = param_len ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:empty-prefix " ,
2006-02-28 11:52:12 +00:00
true , orterun_basename , orterun_basename ) ;
return ORTE_ERR_FATAL ;
}
}
app - > prefix_dir = strdup ( param ) ;
}
2005-09-06 16:10:05 +00:00
}
2008-03-05 22:12:27 +00:00
/* Did the user specify a hostfile. Need to check for both
* hostfile and machine file .
* We can only deal with one hostfile per app context , otherwise give an error .
2008-02-28 01:57:57 +00:00
*/
2008-03-05 22:12:27 +00:00
if ( 0 < ( j = opal_cmd_line_get_ninsts ( & cmd_line , " hostfile " ) ) ) {
if ( 1 < j ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:multiple-hostfiles " ,
2008-03-05 22:12:27 +00:00
true , orterun_basename , NULL ) ;
return ORTE_ERR_FATAL ;
} else {
value = opal_cmd_line_get_param ( & cmd_line , " hostfile " , 0 , 0 ) ;
app - > hostfile = strdup ( value ) ;
2005-03-14 20:57:21 +00:00
}
2008-03-05 22:12:27 +00:00
}
if ( 0 < ( j = opal_cmd_line_get_ninsts ( & cmd_line , " machinefile " ) ) ) {
if ( 1 < j | | NULL ! = app - > hostfile ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:multiple-hostfiles " ,
2008-03-05 22:12:27 +00:00
true , orterun_basename , NULL ) ;
return ORTE_ERR_FATAL ;
} else {
value = opal_cmd_line_get_param ( & cmd_line , " machinefile " , 0 , 0 ) ;
app - > hostfile = strdup ( value ) ;
2005-03-14 20:57:21 +00:00
}
2008-03-05 22:12:27 +00:00
}
/* Did the user specify any hosts? */
if ( 0 < ( j = opal_cmd_line_get_ninsts ( & cmd_line , " host " ) ) ) {
2005-03-14 20:57:21 +00:00
for ( i = 0 ; i < j ; + + i ) {
2008-03-05 22:12:27 +00:00
value = opal_cmd_line_get_param ( & cmd_line , " host " , i , 0 ) ;
opal_argv_append_nosize ( & app - > dash_host , value ) ;
2005-03-14 20:57:21 +00:00
}
}
/* Get the numprocs */
2006-09-25 19:41:54 +00:00
app - > num_procs = ( orte_std_cntr_t ) orterun_globals . num_procs ;
2005-04-09 01:26:17 +00:00
2006-07-10 21:25:33 +00:00
/* If the user didn't specify the number of processes to run, then we
default to launching an app process using every slot . We can ' t do
anything about that here - we leave it to the RMAPS framework ' s
components to note this and deal with it later .
HOWEVER , we ONLY support this mode of operation if the number of
app_contexts is equal to ONE . If the user provides multiple applications ,
we simply must have more information - in this case , generate an
error .
*/
if ( app - > num_procs = = 0 ) {
have_zero_np = true ; /** flag that we have a zero_np situation */
2005-03-14 20:57:21 +00:00
}
2007-03-16 23:11:45 +00:00
2006-07-10 21:25:33 +00:00
if ( 0 < total_num_apps & & have_zero_np ) {
/** we have more than one app and a zero_np - that's no good.
* note that we have to do this as a two step logic check since
* the user may fail to specify num_procs for the first app , but
* then give us another application .
*/
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:multi-apps-and-zero-np " ,
2006-07-10 21:25:33 +00:00
true , orterun_basename , NULL ) ;
return ORTE_ERR_FATAL ;
}
total_num_apps + + ;
2007-03-16 23:11:45 +00:00
/* Preserve if we are to preload the binary */
app - > preload_binary = orterun_globals . preload_binary ;
if ( NULL ! = orterun_globals . preload_files )
app - > preload_files = strdup ( orterun_globals . preload_files ) ;
else
app - > preload_files = NULL ;
if ( NULL ! = orterun_globals . preload_files_dest_dir )
app - > preload_files_dest_dir = strdup ( orterun_globals . preload_files_dest_dir ) ;
else
app - > preload_files_dest_dir = NULL ;
2006-02-16 20:40:23 +00:00
/* Do not try to find argv[0] here -- the starter is responsible
for that because it may not be relevant to try to find it on
the node where orterun is executing . So just strdup ( ) argv [ 0 ]
into app . */
2005-03-14 20:57:21 +00:00
2006-02-16 20:40:23 +00:00
app - > app = strdup ( app - > argv [ 0 ] ) ;
2005-03-14 20:57:21 +00:00
if ( NULL = = app - > app ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not the opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so if you explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent via the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:call-failed " ,
2006-02-16 20:40:23 +00:00
true , orterun_basename , " library " , " strdup returned NULL " , errno ) ;
2005-03-14 20:57:21 +00:00
rc = ORTE_ERR_NOT_FOUND ;
goto cleanup ;
}
* app_ptr = app ;
app = NULL ;
* made_app = true ;
/* All done */
cleanup :
if ( NULL ! = app ) {
OBJ_RELEASE ( app ) ;
}
if ( cmd_line_made ) {
OBJ_DESTRUCT ( & cmd_line ) ;
}
return rc ;
}
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and an app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
static int parse_appfile ( char * filename , char * * * env )
2005-03-14 20:57:21 +00:00
{
size_t i , len ;
FILE * fp ;
char line [ BUFSIZ ] ;
2006-03-23 17:55:25 +00:00
int rc , argc , app_num ;
2005-03-14 20:57:21 +00:00
char * * argv ;
orte_app_context_t * app ;
bool blank , made_app ;
char bogus [ ] = " bogus " ;
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and an app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
char * * tmp_env ;
2005-03-14 20:57:21 +00:00
2007-10-11 18:33:40 +00:00
/*
* Make sure to clear out this variable so we don ' t do anything odd in
* app_create ( )
*/
if ( NULL ! = orterun_globals . appfile ) {
free ( orterun_globals . appfile ) ;
orterun_globals . appfile = NULL ;
}
2005-03-14 20:57:21 +00:00
/* Try to open the file */
fp = fopen ( filename , " r " ) ;
if ( NULL = = fp ) {
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not thei opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so you if explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent view the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-13 20:00:55 +00:00
orte_show_help ( " help-orterun.txt " , " orterun:appfile-not-found " , true ,
2005-03-14 20:57:21 +00:00
filename ) ;
return ORTE_ERR_NOT_FOUND ;
}
/* Read in line by line */
line [ sizeof ( line ) - 1 ] = ' \0 ' ;
2006-03-23 17:55:25 +00:00
app_num = 0 ;
2005-03-14 20:57:21 +00:00
do {
/* We need a bogus argv[0] (because when argv comes in from
the command line , argv [ 0 ] is " orterun " , so the parsing
logic ignores it ) . So create one here rather than making
an argv and then pre - pending a new argv [ 0 ] ( which would be
rather inefficient ) . */
line [ 0 ] = ' \0 ' ;
strcat ( line , bogus ) ;
2005-09-04 20:54:19 +00:00
if ( NULL = = fgets ( line + sizeof ( bogus ) - 1 ,
2005-03-14 20:57:21 +00:00
sizeof ( line ) - sizeof ( bogus ) - 1 , fp ) ) {
break ;
}
2005-04-12 18:42:34 +00:00
/* Remove a trailing newline */
2005-03-14 20:57:21 +00:00
len = strlen ( line ) ;
2005-04-12 18:42:34 +00:00
if ( len > 0 & & ' \n ' = = line [ len - 1 ] ) {
line [ len - 1 ] = ' \0 ' ;
if ( len > 0 ) {
- - len ;
}
}
/* Remove comments */
2005-03-14 20:57:21 +00:00
for ( i = 0 ; i < len ; + + i ) {
if ( ' # ' = = line [ i ] ) {
line [ i ] = ' \0 ' ;
break ;
} else if ( i + 1 < len & & ' / ' = = line [ i ] & & ' / ' = = line [ i + 1 ] ) {
line [ i ] = ' \0 ' ;
break ;
}
}
/* Is this a blank line? */
len = strlen ( line ) ;
for ( blank = true , i = sizeof ( bogus ) ; i < len ; + + i ) {
if ( ! isspace ( line [ i ] ) ) {
blank = false ;
break ;
}
}
if ( blank ) {
continue ;
}
/* We got a line with *something* on it. So process it */
2005-07-04 00:13:44 +00:00
argv = opal_argv_split ( line , ' ' ) ;
argc = opal_argv_count ( argv ) ;
2005-03-14 20:57:21 +00:00
if ( argc > 0 ) {
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and and app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
2005-08-08 16:42:28 +00:00
/* Create a temporary env to use in the recursive call --
that is : don ' t disturb the original env so that we can
have a consistent global env . This allows for the
case :
2005-09-04 20:54:19 +00:00
orterun - - mca foo bar - - appfile file
2005-08-08 16:42:28 +00:00
where the " file " contains multiple apps . In this case ,
each app in " file " will get * only * foo = bar as the base
environment from which its specific environment is
constructed . */
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and and app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
if ( NULL ! = * env ) {
2005-07-04 00:13:44 +00:00
tmp_env = opal_argv_copy ( * env ) ;
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and and app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
if ( NULL = = tmp_env ) {
return ORTE_ERR_OUT_OF_RESOURCE ;
}
} else {
tmp_env = NULL ;
}
rc = create_app ( argc , argv , & app , & made_app , & tmp_env ) ;
2005-03-14 20:57:21 +00:00
if ( ORTE_SUCCESS ! = rc ) {
/* Assume that the error message has already been
printed ; no need to cleanup - - we can just exit */
exit ( 1 ) ;
}
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and and app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
if ( NULL ! = tmp_env ) {
2005-07-04 00:13:44 +00:00
opal_argv_free ( tmp_env ) ;
While waiting for fortran compiles...
Fixes for orterun in handling different MCA params for different
processes (reviewed by Brian):
- By design, if you run the following:
mpirun --mca foo aaa --mca foo bbb a.out
a.out will get a single MCA param for foo with value "aaa,bbb".
- However, if you specify multiple apps with different values for the
same MCA param, you should expect to get the different values for
each app. For example:
mpirun --mca foo aaa a.out : --mca foo bbb b.out
Should yield a.out with a "foo" param with value "aaa" and b.out
with a "foo" param with a value "bbb".
- This did not work -- both a.out and b.out would get a "foo" with
"aaa,bbb".
- This commit fixes this behavior -- now a.out will get aaa and b.out
will get bbb.
- Additionally, if you mix --mca and and app file, you can have
"global" params and per-line-in-the-appfile params. For example:
mpirun --mca foo zzzz --app appfile
where "appfile" contains:
-np 1 --mca bar aaa a.out
-np 1 --mca bar bbb b.out
In this case, a.out will get foo=zzzz and bar=aaa, and b.out will
get foo=zzzz and bar=bbb.
Spiffy.
Ok, fortran build is done... back to Fortran... sigh...
This commit was SVN r5710.
2005-05-13 14:36:36 +00:00
}
2005-03-14 20:57:21 +00:00
if ( made_app ) {
2006-03-24 15:28:42 +00:00
app - > idx = app_num ;
+ + app_num ;
2008-02-28 05:32:23 +00:00
opal_pointer_array_add ( jdata - > apps , app ) ;
2008-02-28 01:57:57 +00:00
+ + jdata - > num_apps ;
2005-03-14 20:57:21 +00:00
}
}
} while ( ! feof ( fp ) ) ;
fclose ( fp ) ;
/* All done */
free ( filename ) ;
return ORTE_SUCCESS ;
}